Initial commit: FastAPI JBOD monitor backend

This commit is contained in:
2026-03-07 02:14:17 +00:00
commit 9f918a3308
26 changed files with 651 additions and 0 deletions

4
.dockerignore Normal file
View File

@@ -0,0 +1,4 @@
__pycache__
*.pyc
.git
.venv

16
Dockerfile Normal file
View File

@@ -0,0 +1,16 @@
# Base image: slim Python 3.13.
FROM python:3.13-slim
# Install the external CLI tools the service looks for at startup:
# smartctl (smartmontools) and sg_ses (sg3-utils). The apt lists are removed
# in the same layer so they do not end up in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends smartmontools sg3-utils && \
    rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy and install the Python dependencies before the sources so this layer
# can be reused when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
EXPOSE 8000
# NOTE(review): the service reads /sys/class/enclosure and /dev/<device>;
# the container presumably needs those exposed at run time — confirm.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

45
README.md Normal file
View File

@@ -0,0 +1,45 @@
# JBOD Monitor
REST API for monitoring drive health in JBOD enclosures on Linux.
Auto-discovers SES enclosures via sysfs, maps drives to physical slots, and exposes SMART health data.
## Prerequisites
- Linux with SAS/SATA JBODs connected via HBA
- `smartmontools` — for `smartctl` (SMART data)
- `sg3-utils` — for `sg_ses` (SES enclosure data)
- Python 3.11+
```bash
# Debian/Ubuntu
apt install smartmontools sg3-utils
# RHEL/Fedora
dnf install smartmontools sg3_utils
```
## Install
```bash
pip install -r requirements.txt
```
## Run
The API needs root access for `smartctl` to query drives:
```bash
sudo uvicorn main:app --host 0.0.0.0 --port 8000
```
## API Endpoints
| Endpoint | Description |
|---|---|
| `GET /api/health` | Service health + tool availability |
| `GET /api/enclosures` | List all discovered SES enclosures |
| `GET /api/enclosures/{id}/drives` | List drive slots for an enclosure |
| `GET /api/drives/{device}` | SMART detail for a block device (`{device}` is the kernel name without the `/dev/` prefix, e.g. `sda`) |
| `GET /api/overview` | Aggregate enclosure + drive health |
| `GET /docs` | Interactive API docs (Swagger UI) |

Binary file not shown.

52
main.py Normal file
View File

@@ -0,0 +1,52 @@
import logging
import os
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from models.schemas import HealthCheck
from routers import drives, enclosures, overview
from services.smart import sg_ses_available, smartctl_available
# Configure root logging once at import time so every module logger shares
# the same level and line format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
logger = logging.getLogger(__name__)

# The ASGI application object referenced by `uvicorn main:app`.
app = FastAPI(
    title="JBOD Monitor",
    description="Drive health monitoring for JBOD enclosures",
    version="0.1.0",
)

# NOTE(review): CORS is fully open (any origin, method and header).
# Confirm this is intended for the deployment environment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Each router carries its own "/api/..." prefix.
app.include_router(enclosures.router)
app.include_router(drives.router)
app.include_router(overview.router)

# Tool-availability flags: filled in by the startup hook and returned
# as-is by the /api/health endpoint.
_tool_status: dict[str, bool] = {}
@app.on_event("startup")
async def check_dependencies():
    """Record which external tools are installed and warn about gaps.

    Runs once at application startup. The results are stored in
    ``_tool_status`` for the health endpoint; nothing here is fatal, each
    problem is only logged as a warning.
    """
    have_smartctl = smartctl_available()
    have_sg_ses = sg_ses_available()
    _tool_status.update(smartctl=have_smartctl, sg_ses=have_sg_ses)

    if not have_smartctl:
        logger.warning("smartctl not found — install smartmontools for SMART data")
    if not have_sg_ses:
        logger.warning("sg_ses not found — install sg3-utils for enclosure SES data")

    running_as_root = os.geteuid() == 0
    if not running_as_root:
        logger.warning("Not running as root — smartctl may fail on some devices")
@app.get("/api/health", response_model=HealthCheck, tags=["health"])
async def health():
    """Report liveness plus the tool-availability flags gathered at startup."""
    report = HealthCheck(status="ok", tools=_tool_status)
    return report

0
models/__init__.py Normal file
View File

Binary file not shown.

Binary file not shown.

76
models/schemas.py Normal file
View File

@@ -0,0 +1,76 @@
from pydantic import BaseModel
class Enclosure(BaseModel):
    """Summary of one SES enclosure, as listed by ``GET /api/enclosures``."""

    id: str  # sysfs directory name under /sys/class/enclosure
    sg_device: str | None = None  # "/dev/<sg name>", or None when no scsi_generic entry exists
    vendor: str  # sysfs device/vendor ("" when unreadable)
    model: str  # sysfs device/model ("" when unreadable)
    revision: str  # sysfs device/rev ("" when unreadable)
    total_slots: int  # number of recognised slot directories
    populated_slots: int  # how many of those slots are populated
class SlotInfo(BaseModel):
    """One physical drive slot of an enclosure."""

    slot: int  # slot number parsed from the sysfs directory name
    populated: bool  # True when a block device is linked or the status is not "not installed"
    device: str | None = None  # block device name (no /dev/ prefix), None if none is linked
class DriveDetail(BaseModel):
    """Full SMART detail for one block device (``GET /api/drives/{device}``).

    The fields mirror the dict produced by the smartctl JSON parser in
    ``services/smart.py``; anything smartctl did not report stays ``None``.
    """

    device: str  # block device name without the /dev/ prefix
    model: str | None = None
    serial: str | None = None
    wwn: str | None = None  # hex string rebuilt from smartctl's naa/oui/id components
    firmware: str | None = None
    capacity_bytes: int | None = None
    smart_healthy: bool | None = None  # smartctl's overall "passed" flag; None when not reported
    smart_supported: bool = True
    temperature_c: int | None = None
    power_on_hours: int | None = None
    reallocated_sectors: int | None = None  # raw value of ATA attribute 5
    pending_sectors: int | None = None  # raw value of ATA attribute 197
    uncorrectable_errors: int | None = None  # raw value of ATA attribute 198
    # Normalized value of ATA attribute 177, or 100 - percentage_used for NVMe.
    wear_leveling_percent: int | None = None
    # ATA attribute table, or a one-element wrapper around the NVMe health log
    # or the SCSI error counter log.
    smart_attributes: list[dict] = []
class DriveHealthSummary(BaseModel):
    """Condensed per-drive health record embedded in the overview response."""

    device: str  # block device name without the /dev/ prefix
    model: str | None = None
    serial: str | None = None
    smart_healthy: bool | None = None  # None when smartctl gave no verdict
    smart_supported: bool = True  # False when the SMART query failed
    temperature_c: int | None = None
    power_on_hours: int | None = None
class SlotWithDrive(BaseModel):
    """A drive slot together with the health summary of the drive in it."""

    slot: int
    populated: bool
    device: str | None = None  # block device name, None if none is linked to the slot
    drive: DriveHealthSummary | None = None  # None when no SMART data was gathered for the slot
class EnclosureWithDrives(BaseModel):
    """An enclosure (same fields as ``Enclosure``) plus its per-slot drive data."""

    id: str
    sg_device: str | None = None
    vendor: str
    model: str
    revision: str
    total_slots: int
    populated_slots: int
    slots: list[SlotWithDrive]  # one entry per recognised slot
class Overview(BaseModel):
    """Aggregate health across all enclosures (``GET /api/overview``)."""

    healthy: bool  # True when no drive reported a failed SMART status
    drive_count: int  # populated slots counted across all enclosures
    warning_count: int
    error_count: int  # drives whose SMART health check reported failure
    enclosures: list[EnclosureWithDrives]
class HealthCheck(BaseModel):
    """Service health response (``GET /api/health``)."""

    status: str
    tools: dict[str, bool]  # tool name -> whether it was found at startup

3
requirements.txt Normal file
View File

@@ -0,0 +1,3 @@
fastapi>=0.115.0
uvicorn>=0.34.0
pydantic>=2.10.0

0
routers/__init__.py Normal file
View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

20
routers/drives.py Normal file
View File

@@ -0,0 +1,20 @@
from fastapi import APIRouter, HTTPException
from models.schemas import DriveDetail
from services.smart import get_smart_data
router = APIRouter(prefix="/api/drives", tags=["drives"])
@router.get("/{device}", response_model=DriveDetail)
async def get_drive_detail(device: str):
    """Get SMART detail for a specific block device.

    Args:
        device: Kernel block device name without the ``/dev/`` prefix.

    Raises:
        HTTPException: 400 when the device name is rejected by validation,
            502 when smartctl could not produce usable data for the device.
    """
    try:
        data = await get_smart_data(device)
    except ValueError as exc:
        # Chain the original error so the validation failure stays visible
        # in tracebacks and logs.
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    # The SMART service reports tool failures in-band via an "error" key.
    if "error" in data:
        raise HTTPException(status_code=502, detail=data["error"])
    return DriveDetail(**data)

24
routers/enclosures.py Normal file
View File

@@ -0,0 +1,24 @@
from fastapi import APIRouter, HTTPException
from models.schemas import Enclosure, SlotInfo
from services.enclosure import discover_enclosures, list_slots
router = APIRouter(prefix="/api/enclosures", tags=["enclosures"])
@router.get("", response_model=list[Enclosure])
async def get_enclosures():
    """Return every SES enclosure found by sysfs discovery."""
    found = discover_enclosures()
    return found
@router.get("/{enclosure_id}/drives", response_model=list[SlotInfo])
async def get_enclosure_drives(enclosure_id: str):
    """Return the drive slots of one enclosure.

    An empty slot list is ambiguous (unknown enclosure vs. an enclosure with
    no recognisable slots), so in that case discovery is consulted and a 404
    is raised only when the enclosure id is genuinely unknown.
    """
    slots = list_slots(enclosure_id)
    if slots:
        return slots

    known_ids = {enc["id"] for enc in discover_enclosures()}
    if enclosure_id not in known_ids:
        raise HTTPException(status_code=404, detail=f"Enclosure '{enclosure_id}' not found")
    return slots

105
routers/overview.py Normal file
View File

@@ -0,0 +1,105 @@
import asyncio
import logging
from fastapi import APIRouter
from models.schemas import (
DriveHealthSummary,
EnclosureWithDrives,
Overview,
SlotWithDrive,
)
from services.enclosure import discover_enclosures, list_slots
from services.smart import get_smart_data
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/overview", tags=["overview"])
@router.get("", response_model=Overview)
async def get_overview():
    """Aggregate view of all enclosures, slots, and drive health.

    For every discovered enclosure the SMART data of all populated slots is
    fetched concurrently. Counting rules:

    * a drive whose SMART status is explicitly failed counts as an error;
    * a drive with no SMART verdict (but SMART supposedly supported) counts
      as a warning;
    * each non-zero reallocated / pending / uncorrectable counter adds one
      further warning;
    * populated slots without a linked block device still count as drives.

    A failed SMART query for one drive never fails the whole overview: the
    drive is reported with ``smart_supported=False``.
    """
    enc_results: list[EnclosureWithDrives] = []
    total_drives = 0
    warnings = 0
    errors = 0

    for enc in discover_enclosures():
        slots_raw = list_slots(enc["id"])

        # Gather SMART data for all populated slots concurrently.
        devices = [s["device"] for s in slots_raw if s["populated"] and s["device"]]
        smart_results = await asyncio.gather(
            *(get_smart_data(dev) for dev in devices),
            return_exceptions=True,
        )

        smart_map: dict[str, dict] = {}
        for dev, result in zip(devices, smart_results):
            # BaseException, not Exception: a cancelled sub-task is returned
            # as CancelledError, which does not derive from Exception.
            if isinstance(result, BaseException):
                logger.warning("SMART query failed for %s: %s", dev, result)
                smart_map[dev] = {"device": dev, "smart_supported": False}
            else:
                smart_map[dev] = result

        slots_out: list[SlotWithDrive] = []
        for s in slots_raw:
            dev = s["device"]
            drive_summary = None
            if dev and dev in smart_map:
                sd = smart_map[dev]
                total_drives += 1

                healthy = sd.get("smart_healthy")
                if healthy is False:
                    errors += 1
                elif healthy is None and sd.get("smart_supported", True):
                    warnings += 1

                # Check for concerning SMART values (one warning per counter).
                for counter in ("reallocated_sectors", "pending_sectors", "uncorrectable_errors"):
                    if (sd.get(counter) or 0) > 0:
                        warnings += 1

                drive_summary = DriveHealthSummary(
                    # Error results from the SMART service carry no "device"
                    # key, so fall back to the slot's own device name instead
                    # of raising KeyError.
                    device=sd.get("device", dev),
                    model=sd.get("model"),
                    serial=sd.get("serial"),
                    smart_healthy=healthy,
                    smart_supported=sd.get("smart_supported", True),
                    temperature_c=sd.get("temperature_c"),
                    power_on_hours=sd.get("power_on_hours"),
                )
            elif s["populated"]:
                # Populated according to SES status, but no block device linked.
                total_drives += 1

            slots_out.append(SlotWithDrive(
                slot=s["slot"],
                populated=s["populated"],
                device=dev,
                drive=drive_summary,
            ))

        enc_results.append(EnclosureWithDrives(
            id=enc["id"],
            sg_device=enc.get("sg_device"),
            vendor=enc["vendor"],
            model=enc["model"],
            revision=enc["revision"],
            total_slots=enc["total_slots"],
            populated_slots=enc["populated_slots"],
            slots=slots_out,
        ))

    return Overview(
        healthy=errors == 0,
        drive_count=total_drives,
        warning_count=warnings,
        error_count=errors,
        enclosures=enc_results,
    )

0
services/__init__.py Normal file
View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

29
services/cache.py Normal file
View File

@@ -0,0 +1,29 @@
import time
from typing import Any
class TTLCache:
"""Simple in-memory TTL cache."""
def __init__(self, ttl_seconds: int = 60):
self._ttl = ttl_seconds
self._store: dict[str, tuple[float, Any]] = {}
def get(self, key: str) -> Any | None:
entry = self._store.get(key)
if entry is None:
return None
ts, value = entry
if time.monotonic() - ts > self._ttl:
del self._store[key]
return None
return value
def set(self, key: str, value: Any) -> None:
self._store[key] = (time.monotonic(), value)
def clear(self) -> None:
self._store.clear()
smart_cache = TTLCache(ttl_seconds=60)

118
services/enclosure.py Normal file
View File

@@ -0,0 +1,118 @@
import os
import logging
from pathlib import Path
logger = logging.getLogger(__name__)
ENCLOSURE_BASE = Path("/sys/class/enclosure")
def _read_sysfs(path: Path) -> str:
"""Read a sysfs attribute file, return stripped content or empty string."""
try:
return path.read_text().strip()
except (OSError, IOError):
return ""
def _find_sg_device(enclosure_path: Path) -> str | None:
"""Resolve the sg device for an enclosure from its sysfs path."""
# The enclosure sysfs directory has a 'device' symlink. Under that,
# there's a scsi_generic directory containing the sg device name.
sg_dir = enclosure_path / "device" / "scsi_generic"
if sg_dir.is_dir():
entries = list(sg_dir.iterdir())
if entries:
return f"/dev/{entries[0].name}"
return None
def discover_enclosures() -> list[dict]:
    """Enumerate SES enclosures from /sys/class/enclosure/.

    Returns one dict per enclosure directory (sorted by name) with identity
    strings read from sysfs, the resolved sg device and slot counts. Returns
    an empty list, after logging a warning, when the sysfs class directory
    does not exist.
    """
    if not ENCLOSURE_BASE.is_dir():
        logger.warning("No enclosure sysfs directory found at %s", ENCLOSURE_BASE)
        return []

    found: list[dict] = []
    for enc_path in sorted(ENCLOSURE_BASE.iterdir()):
        if enc_path.is_dir():
            sysfs_dev = enc_path / "device"
            record = {
                "id": enc_path.name,
                "sg_device": None,
                "vendor": _read_sysfs(sysfs_dev / "vendor"),
                "model": _read_sysfs(sysfs_dev / "model"),
                "revision": _read_sysfs(sysfs_dev / "rev"),
            }
            record["sg_device"] = _find_sg_device(enc_path)

            slot_list = list_slots(enc_path.name)
            record["total_slots"] = len(slot_list)
            record["populated_slots"] = len([s for s in slot_list if s["populated"]])
            found.append(record)
    return found
def list_slots(enclosure_id: str) -> list[dict]:
    """Enumerate drive slots for an enclosure via sysfs.

    Args:
        enclosure_id: Name of the enclosure directory under the sysfs
            enclosure class. It may come straight from a request path, so
            anything that is not a single plain path component is rejected.

    Returns:
        A list of ``{"slot", "populated", "device"}`` dicts ordered by slot
        number, or an empty list when the id is invalid or unknown.
    """
    # Only a single path component is acceptable: never let "..", "." or an
    # embedded separator steer the lookup outside the enclosure class dir.
    if (
        not enclosure_id
        or enclosure_id in (".", "..")
        or "/" in enclosure_id
        or "\x00" in enclosure_id
    ):
        return []

    enc_dir = ENCLOSURE_BASE / enclosure_id
    if not enc_dir.is_dir():
        return []

    slots = []
    for entry in sorted(enc_dir.iterdir()):
        # Slot entries are directories like "Slot 00", "Slot 01", etc.
        # Some enclosures use "Disk" or "ArrayDevice" prefixes.
        if not entry.is_dir():
            continue

        slot_num = _parse_slot_number(entry.name)
        if slot_num is None:
            continue

        # Check if a block device is linked in this slot.
        block_dir = entry / "device" / "block"
        device = None
        populated = False

        if block_dir.is_dir():
            # Sort so the chosen device is deterministic if several appear.
            devs = sorted(block_dir.iterdir())
            if devs:
                device = devs[0].name
                populated = True
        else:
            # Also check the 'status' file — "not installed" means empty.
            status = _read_sysfs(entry / "status")
            if status and status != "not installed":
                populated = True

        slots.append({
            "slot": slot_num,
            "populated": populated,
            "device": device,
        })

    # Directory names sort lexicographically ("Disk 10" before "Disk 2");
    # order by the parsed number. The sort is stable, so equal slot numbers
    # keep their name order.
    slots.sort(key=lambda s: s["slot"])
    return slots
def _parse_slot_number(name: str) -> int | None:
"""Extract the slot number from a sysfs slot directory name."""
# Handles "Slot 00", "Slot00", "Disk 1", "ArrayDevice00", etc.
for prefix in ("Slot ", "Slot", "Disk ", "Disk", "ArrayDevice", "SLOT "):
if name.startswith(prefix):
num_str = name[len(prefix):].strip()
try:
return int(num_str)
except ValueError:
return None
return None

159
services/smart.py Normal file
View File

@@ -0,0 +1,159 @@
import asyncio
import json
import logging
import re
import shutil
from services.cache import smart_cache
logger = logging.getLogger(__name__)
# SMART attribute IDs of interest (looked up by "id" in the ATA attribute
# table of smartctl's JSON output).
ATTR_REALLOCATED = 5  # reallocated sectors (raw value is used)
ATTR_POWER_ON_HOURS = 9  # fallback when no power_on_time is reported
ATTR_TEMPERATURE = 194  # fallback when no temperature is reported
ATTR_TEMPERATURE_ALT = 190  # second temperature fallback, tried after 194
ATTR_PENDING = 197  # pending sectors (raw value is used)
ATTR_UNCORRECTABLE = 198  # uncorrectable errors (raw value is used)
ATTR_WEAR_LEVELING = 177  # SSD wear leveling
def smartctl_available() -> bool:
    """Tell whether a ``smartctl`` executable can be found on PATH."""
    located = shutil.which("smartctl")
    return bool(located)
def sg_ses_available() -> bool:
    """Tell whether an ``sg_ses`` executable can be found on PATH."""
    located = shutil.which("sg_ses")
    return bool(located)
async def get_smart_data(device: str) -> dict:
    """Run smartctl -a -j against a device, with caching.

    Args:
        device: Kernel block device name without the ``/dev/`` prefix.

    Returns:
        The parsed SMART dict, or a dict containing an "error" key when
        smartctl could not be run or understood. Results (including error
        results) are served from the shared TTL cache when available.

    Raises:
        ValueError: If *device* contains anything other than ASCII letters,
            digits and hyphens.
    """
    # Sanitize device name: only allow alphanumeric and hyphens. fullmatch is
    # required here: with re.match and "$" a trailing newline ("sda\n")
    # would slip through, because "$" also matches just before a final "\n".
    if not re.fullmatch(r"[a-zA-Z0-9\-]+", device):
        raise ValueError(f"Invalid device name: {device}")

    cached = smart_cache.get(device)
    if cached is not None:
        return cached

    result = await _run_smartctl(device)
    smart_cache.set(device, result)
    return result
async def _run_smartctl(device: str) -> dict:
"""Execute smartctl and parse JSON output."""
dev_path = f"/dev/{device}"
try:
proc = await asyncio.create_subprocess_exec(
"smartctl", "-a", "-j", dev_path,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
stdout, stderr = await proc.communicate()
except FileNotFoundError:
return {"error": "smartctl not found", "smart_supported": False}
# smartctl returns non-zero for many non-fatal reasons (bit flags).
# The JSON output is still valid even with non-zero exit codes.
if not stdout:
return {
"error": f"smartctl produced no output (exit code {proc.returncode})",
"smart_supported": False,
}
try:
data = json.loads(stdout)
except json.JSONDecodeError:
return {"error": "Failed to parse smartctl JSON output", "smart_supported": False}
return _parse_smart_json(device, data)
def _parse_smart_json(device: str, data: dict) -> dict:
    """Extract relevant fields from smartctl JSON output.

    Args:
        device: Block device name the data belongs to.
        data: Decoded JSON object produced by ``smartctl -a -j``.

    Returns:
        A dict with identity, health, temperature, power-on hours, selected
        error counters, wear level and the attribute payload. Values that
        smartctl did not report are None. The "wwn" key is only present
        when smartctl reported a WWN.
    """
    result: dict = {"device": device, "smart_supported": True}

    # Identity
    result["model"] = data.get("model_name")
    result["serial"] = data.get("serial_number")
    result["firmware"] = data.get("firmware_version")
    result["capacity_bytes"] = data.get("user_capacity", {}).get("bytes")

    # WWN: reconstruct the hex string from its components.
    wwn_data = data.get("wwn")
    if wwn_data:
        naa = wwn_data.get("naa", 0)
        oui = wwn_data.get("oui", 0)
        wwn_id = wwn_data.get("id", 0)
        result["wwn"] = f"{naa:x}{oui:06x}{wwn_id:09x}"

    # SMART health, temperature and power-on hours from the top-level keys.
    result["smart_healthy"] = data.get("smart_status", {}).get("passed")
    result["temperature_c"] = data.get("temperature", {}).get("current")
    result["power_on_hours"] = data.get("power_on_time", {}).get("hours")

    # SMART attributes (ATA)
    attrs = data.get("ata_smart_attributes", {}).get("table", [])
    result["smart_attributes"] = attrs
    result["reallocated_sectors"] = _get_attr_raw(attrs, ATTR_REALLOCATED)
    result["pending_sectors"] = _get_attr_raw(attrs, ATTR_PENDING)
    result["uncorrectable_errors"] = _get_attr_raw(attrs, ATTR_UNCORRECTABLE)
    result["wear_leveling_percent"] = _get_attr_value(attrs, ATTR_WEAR_LEVELING)

    # Power-on hours fallback from attributes
    if result["power_on_hours"] is None:
        result["power_on_hours"] = _get_attr_raw(attrs, ATTR_POWER_ON_HOURS)

    # Temperature fallback from attributes (194 first, then 190).
    if result["temperature_c"] is None:
        raw_temp = _get_attr_raw(attrs, ATTR_TEMPERATURE)
        if raw_temp is None:
            raw_temp = _get_attr_raw(attrs, ATTR_TEMPERATURE_ALT)
        if isinstance(raw_temp, int):
            # The raw value of temperature attributes commonly packs min/max
            # readings into the upper bytes; the current temperature is in
            # the lowest byte.
            raw_temp &= 0xFF
        result["temperature_c"] = raw_temp

    # NVMe attributes (different structure)
    nvme_health = data.get("nvme_smart_health_information_log")
    if nvme_health:
        result["smart_attributes"] = [{"nvme_health": nvme_health}]
        if result["temperature_c"] is None:
            result["temperature_c"] = nvme_health.get("temperature")
        if result["power_on_hours"] is None:
            result["power_on_hours"] = nvme_health.get("power_on_hours")
        if result["wear_leveling_percent"] is None:
            pct_used = nvme_health.get("percentage_used")
            if pct_used is not None:
                # "Percentage Used" may exceed 100 on a drive past its rated
                # endurance; never report negative remaining life.
                result["wear_leveling_percent"] = max(0, 100 - pct_used)

    # SAS/SCSI drives use different error counters
    scsi_errors = data.get("scsi_error_counter_log")
    if scsi_errors and not attrs:
        result["smart_attributes"] = [{"scsi_error_log": scsi_errors}]

    return result
def _get_attr_raw(attrs: list[dict], attr_id: int) -> int | None:
"""Get the raw_value for a SMART attribute by ID."""
for attr in attrs:
if attr.get("id") == attr_id:
raw = attr.get("raw", {})
return raw.get("value")
return None
def _get_attr_value(attrs: list[dict], attr_id: int) -> int | None:
"""Get the normalized value for a SMART attribute by ID."""
for attr in attrs:
if attr.get("id") == attr_id:
return attr.get("value")
return None