commit 9f918a33085534e2c8debd1aade21496e95fce8c Author: adam Date: Sat Mar 7 02:14:17 2026 +0000 Initial commit: FastAPI JBOD monitor backend diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..b0917de --- /dev/null +++ b/.dockerignore @@ -0,0 +1,4 @@ +__pycache__ +*.pyc +.git +.venv diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..24bae9b --- /dev/null +++ b/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.13-slim + +RUN apt-get update && \ + apt-get install -y --no-install-recommends smartmontools sg3-utils && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +EXPOSE 8000 + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..fd6e13b --- /dev/null +++ b/README.md @@ -0,0 +1,45 @@ +# JBOD Monitor + +REST API for monitoring drive health in JBOD enclosures on Linux. + +Auto-discovers SES enclosures via sysfs, maps drives to physical slots, and exposes SMART health data. 
+ +## Prerequisites + +- Linux with SAS/SATA JBODs connected via HBA +- `smartmontools` — for `smartctl` (SMART data) +- `sg3-utils` — for `sg_ses` (SES enclosure data) +- Python 3.11+ + +```bash +# Debian/Ubuntu +apt install smartmontools sg3-utils + +# RHEL/Fedora +dnf install smartmontools sg3_utils +``` + +## Install + +```bash +pip install -r requirements.txt +``` + +## Run + +The API needs root access for `smartctl` to query drives: + +```bash +sudo uvicorn main:app --host 0.0.0.0 --port 8000 +``` + +## API Endpoints + +| Endpoint | Description | +|---|---| +| `GET /api/health` | Service health + tool availability | +| `GET /api/enclosures` | List all discovered SES enclosures | +| `GET /api/enclosures/{id}/drives` | List drive slots for an enclosure | +| `GET /api/drives/{device}` | SMART detail for a block device | +| `GET /api/overview` | Aggregate enclosure + drive health | +| `GET /docs` | Interactive API docs (Swagger UI) | diff --git a/__pycache__/main.cpython-314.pyc b/__pycache__/main.cpython-314.pyc new file mode 100644 index 0000000..26aa45a Binary files /dev/null and b/__pycache__/main.cpython-314.pyc differ diff --git a/main.py b/main.py new file mode 100644 index 0000000..b674ab0 --- /dev/null +++ b/main.py @@ -0,0 +1,52 @@ +import logging +import os + +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware + +from models.schemas import HealthCheck +from routers import drives, enclosures, overview +from services.smart import sg_ses_available, smartctl_available + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", +) +logger = logging.getLogger(__name__) + +app = FastAPI( + title="JBOD Monitor", + description="Drive health monitoring for JBOD enclosures", + version="0.1.0", +) + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["*"], + allow_headers=["*"], +) + +app.include_router(enclosures.router) +app.include_router(drives.router) 
@app.on_event("startup")
async def check_dependencies():
    """Probe for the external CLI tools at startup and log anything missing.

    The probe results are written into the module-level ``_tool_status``
    dict, which the ``/api/health`` endpoint returns as-is.
    """
    has_smartctl = _tool_status["smartctl"] = smartctl_available()
    has_sg_ses = _tool_status["sg_ses"] = sg_ses_available()

    # Missing tools are only logged; the service still starts without them.
    if not has_smartctl:
        logger.warning("smartctl not found — install smartmontools for SMART data")
    if not has_sg_ses:
        logger.warning("sg_ses not found — install sg3-utils for enclosure SES data")

    running_as_root = os.geteuid() == 0
    if not running_as_root:
        logger.warning("Not running as root — smartctl may fail on some devices")
class Overview(BaseModel):
    # Response body of GET /api/overview: totals across every discovered
    # enclosure plus the per-enclosure slot/drive breakdown.

    # True only when no drive reported a failed SMART health check
    # (i.e. error_count is zero).
    healthy: bool
    # Number of drives counted across all enclosures (populated slots).
    drive_count: int
    # Number of warning conditions raised. A single drive can contribute
    # several (unknown health, reallocated, pending, uncorrectable).
    warning_count: int
    # Number of drives whose SMART health status is explicitly False.
    error_count: int
    # One entry per discovered enclosure, each carrying its slot list.
    enclosures: list[EnclosureWithDrives]
@router.get("/{device}", response_model=DriveDetail)
async def get_drive_detail(device: str):
    """Get SMART detail for a specific block device."""
    # `device` is the bare kernel name (e.g. "sda"); the SMART service
    # validates it and rejects anything else with ValueError.
    try:
        data = await get_smart_data(device)
    except ValueError as exc:
        # Chain the cause so the original validation error stays visible in
        # tracebacks and logs instead of appearing as an unrelated failure.
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    # The SMART service reports tool/parse failures in-band via an "error"
    # entry rather than raising; surface those as an upstream failure.
    if "error" in data:
        raise HTTPException(status_code=502, detail=data["error"])

    return DriveDetail(**data)
@router.get("", response_model=Overview)
async def get_overview():
    """Aggregate view of all enclosures, slots, and drive health."""
    # SMART counters for which any non-zero raw value counts as a warning.
    warning_counters = ("reallocated_sectors", "pending_sectors", "uncorrectable_errors")

    enc_results: list[EnclosureWithDrives] = []
    total_drives = 0
    warnings = 0
    errors = 0

    for enc in discover_enclosures():
        slots_raw = list_slots(enc["id"])

        # Gather SMART data for all populated slots of this enclosure concurrently.
        devices = [s["device"] for s in slots_raw if s["populated"] and s["device"]]
        smart_results = await asyncio.gather(
            *(get_smart_data(dev) for dev in devices),
            return_exceptions=True,
        )

        smart_map: dict[str, dict] = {}
        for dev, result in zip(devices, smart_results):
            if isinstance(result, Exception):
                logger.warning("SMART query failed for %s: %s", dev, result)
                result = {"smart_supported": False}
            # In-band error results from the SMART service may not carry a
            # "device" key; seed it from the slot so building the summary
            # below cannot raise KeyError and fail the whole overview.
            smart_map[dev] = {"device": dev, **result}

        slots_out: list[SlotWithDrive] = []
        for s in slots_raw:
            drive_summary = None
            sd = smart_map.get(s["device"]) if s["device"] else None

            if sd is not None:
                total_drives += 1

                healthy = sd.get("smart_healthy")
                if healthy is False:
                    errors += 1
                elif healthy is None and sd.get("smart_supported", True):
                    # SMART is supposedly supported but gave no verdict.
                    warnings += 1

                # Each concerning counter adds its own warning.
                warnings += sum(1 for key in warning_counters if (sd.get(key) or 0) > 0)

                drive_summary = DriveHealthSummary(
                    device=sd["device"],
                    model=sd.get("model"),
                    serial=sd.get("serial"),
                    smart_healthy=healthy,
                    smart_supported=sd.get("smart_supported", True),
                    temperature_c=sd.get("temperature_c"),
                    power_on_hours=sd.get("power_on_hours"),
                )
            elif s["populated"]:
                # Slot reports a drive but no block device is mapped to it.
                total_drives += 1

            slots_out.append(SlotWithDrive(
                slot=s["slot"],
                populated=s["populated"],
                device=s["device"],
                drive=drive_summary,
            ))

        enc_results.append(EnclosureWithDrives(
            id=enc["id"],
            sg_device=enc.get("sg_device"),
            vendor=enc["vendor"],
            model=enc["model"],
            revision=enc["revision"],
            total_slots=enc["total_slots"],
            populated_slots=enc["populated_slots"],
            slots=slots_out,
        ))

    return Overview(
        # Only explicit SMART failures make the overview unhealthy; warnings do not.
        healthy=errors == 0,
        drive_count=total_drives,
        warning_count=warnings,
        error_count=errors,
        enclosures=enc_results,
    )
files /dev/null and b/services/__pycache__/smart.cpython-314.pyc differ diff --git a/services/cache.py b/services/cache.py new file mode 100644 index 0000000..657217a --- /dev/null +++ b/services/cache.py @@ -0,0 +1,29 @@ +import time +from typing import Any + + +class TTLCache: + """Simple in-memory TTL cache.""" + + def __init__(self, ttl_seconds: int = 60): + self._ttl = ttl_seconds + self._store: dict[str, tuple[float, Any]] = {} + + def get(self, key: str) -> Any | None: + entry = self._store.get(key) + if entry is None: + return None + ts, value = entry + if time.monotonic() - ts > self._ttl: + del self._store[key] + return None + return value + + def set(self, key: str, value: Any) -> None: + self._store[key] = (time.monotonic(), value) + + def clear(self) -> None: + self._store.clear() + + +smart_cache = TTLCache(ttl_seconds=60) diff --git a/services/enclosure.py b/services/enclosure.py new file mode 100644 index 0000000..fae13c9 --- /dev/null +++ b/services/enclosure.py @@ -0,0 +1,118 @@ +import os +import logging +from pathlib import Path + +logger = logging.getLogger(__name__) + +ENCLOSURE_BASE = Path("/sys/class/enclosure") + + +def _read_sysfs(path: Path) -> str: + """Read a sysfs attribute file, return stripped content or empty string.""" + try: + return path.read_text().strip() + except (OSError, IOError): + return "" + + +def _find_sg_device(enclosure_path: Path) -> str | None: + """Resolve the sg device for an enclosure from its sysfs path.""" + # The enclosure sysfs directory has a 'device' symlink. Under that, + # there's a scsi_generic directory containing the sg device name. 
def discover_enclosures() -> list[dict]:
    """Enumerate SES enclosures by scanning the enclosure sysfs class directory.

    Returns one dict per enclosure directory (sorted by directory entry)
    holding its id, sg device, vendor/model/revision strings, and slot
    counts. Returns an empty list when the sysfs directory is absent.
    """
    if not ENCLOSURE_BASE.is_dir():
        logger.warning("No enclosure sysfs directory found at %s", ENCLOSURE_BASE)
        return []

    found: list[dict] = []
    for enc_path in (p for p in sorted(ENCLOSURE_BASE.iterdir()) if p.is_dir()):
        sysfs_dev = enc_path / "device"

        # Identity strings live under the enclosure's 'device' link.
        vendor, model, revision = (
            _read_sysfs(sysfs_dev / attr) for attr in ("vendor", "model", "rev")
        )
        sg_device = _find_sg_device(enc_path)

        slot_list = list_slots(enc_path.name)
        occupied = [entry for entry in slot_list if entry["populated"]]

        found.append({
            "id": enc_path.name,
            "sg_device": sg_device,
            "vendor": vendor,
            "model": model,
            "revision": revision,
            "total_slots": len(slot_list),
            "populated_slots": len(occupied),
        })

    return found
+ if not entry.is_dir(): + continue + name = entry.name + slot_num = _parse_slot_number(name) + if slot_num is None: + continue + + # Check if a block device is linked in this slot + block_dir = entry / "device" / "block" + device = None + populated = False + + if block_dir.is_dir(): + devs = list(block_dir.iterdir()) + if devs: + device = devs[0].name + populated = True + else: + # Also check the 'status' file — "not installed" means empty + status = _read_sysfs(entry / "status") + if status and status != "not installed": + populated = True + + slots.append({ + "slot": slot_num, + "populated": populated, + "device": device, + }) + + return slots + + +def _parse_slot_number(name: str) -> int | None: + """Extract the slot number from a sysfs slot directory name.""" + # Handles "Slot 00", "Slot00", "Disk 1", "ArrayDevice00", etc. + for prefix in ("Slot ", "Slot", "Disk ", "Disk", "ArrayDevice", "SLOT "): + if name.startswith(prefix): + num_str = name[len(prefix):].strip() + try: + return int(num_str) + except ValueError: + return None + return None diff --git a/services/smart.py b/services/smart.py new file mode 100644 index 0000000..a6fe68e --- /dev/null +++ b/services/smart.py @@ -0,0 +1,159 @@ +import asyncio +import json +import logging +import re +import shutil + +from services.cache import smart_cache + +logger = logging.getLogger(__name__) + +# SMART attribute IDs of interest +ATTR_REALLOCATED = 5 +ATTR_POWER_ON_HOURS = 9 +ATTR_TEMPERATURE = 194 +ATTR_TEMPERATURE_ALT = 190 +ATTR_PENDING = 197 +ATTR_UNCORRECTABLE = 198 +ATTR_WEAR_LEVELING = 177 # SSD wear leveling + + +def smartctl_available() -> bool: + return shutil.which("smartctl") is not None + + +def sg_ses_available() -> bool: + return shutil.which("sg_ses") is not None + + +async def get_smart_data(device: str) -> dict: + """Run smartctl -a -j against a device, with caching.""" + # Sanitize device name: only allow alphanumeric and hyphens + if not re.match(r"^[a-zA-Z0-9\-]+$", device): + raise 
async def _run_smartctl(device: str, timeout: float = 30.0) -> dict:
    """Execute ``smartctl -a -j`` for a device and parse its JSON output.

    Args:
        device: Block-device name without the ``/dev/`` prefix.
        timeout: Seconds to wait for smartctl before killing it.

    Returns:
        The parsed SMART dict on success. On any failure, a dict with
        "device", "error" and ``"smart_supported": False`` (never raises for
        tool failures).
    """
    dev_path = f"/dev/{device}"

    def _failure(message: str) -> dict:
        # Always include the device so consumers can index results by it.
        return {"device": device, "error": message, "smart_supported": False}

    try:
        proc = await asyncio.create_subprocess_exec(
            "smartctl", "-a", "-j", dev_path,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except FileNotFoundError:
        return _failure("smartctl not found")

    try:
        stdout, _stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
    except asyncio.TimeoutError:
        # An unresponsive drive can stall smartctl indefinitely; do not let
        # it hold the request (and the child process) open forever.
        try:
            proc.kill()
        except ProcessLookupError:
            pass  # exited between the timeout and the kill
        await proc.wait()
        return _failure(f"smartctl timed out after {timeout:g}s")

    # smartctl returns non-zero for many non-fatal reasons (bit flags).
    # The JSON output is still valid even with non-zero exit codes.
    if not stdout:
        return _failure(f"smartctl produced no output (exit code {proc.returncode})")

    try:
        data = json.loads(stdout)
    except json.JSONDecodeError:
        return _failure("Failed to parse smartctl JSON output")
    if not isinstance(data, dict):
        return _failure("Failed to parse smartctl JSON output")

    # Exit-status bits 0 and 1 mean the command line did not parse or the
    # device could not be opened/identified: there is no SMART data to
    # report, even though smartctl still emitted a JSON document.
    if proc.returncode is not None and proc.returncode & 0b11:
        messages = data.get("smartctl", {}).get("messages") or []
        texts = [m.get("string") for m in messages if isinstance(m, dict) and m.get("string")]
        detail = "; ".join(texts) or f"smartctl failed (exit code {proc.returncode})"
        return _failure(detail)

    return _parse_smart_json(device, data)
Power-on hours + poh = data.get("power_on_time", {}) + result["power_on_hours"] = poh.get("hours") + + # SMART attributes (ATA) + attrs = data.get("ata_smart_attributes", {}).get("table", []) + result["smart_attributes"] = attrs + + result["reallocated_sectors"] = _get_attr_raw(attrs, ATTR_REALLOCATED) + result["pending_sectors"] = _get_attr_raw(attrs, ATTR_PENDING) + result["uncorrectable_errors"] = _get_attr_raw(attrs, ATTR_UNCORRECTABLE) + result["wear_leveling_percent"] = _get_attr_value(attrs, ATTR_WEAR_LEVELING) + + # Power-on hours fallback from attributes + if result["power_on_hours"] is None: + result["power_on_hours"] = _get_attr_raw(attrs, ATTR_POWER_ON_HOURS) + + # Temperature fallback from attributes + if result["temperature_c"] is None: + result["temperature_c"] = _get_attr_raw(attrs, ATTR_TEMPERATURE) + if result["temperature_c"] is None: + result["temperature_c"] = _get_attr_raw(attrs, ATTR_TEMPERATURE_ALT) + + # NVMe attributes (different structure) + nvme_health = data.get("nvme_smart_health_information_log") + if nvme_health: + result["smart_attributes"] = [{"nvme_health": nvme_health}] + if result["temperature_c"] is None: + result["temperature_c"] = nvme_health.get("temperature") + if result["power_on_hours"] is None: + result["power_on_hours"] = nvme_health.get("power_on_hours") + if result["wear_leveling_percent"] is None: + pct_used = nvme_health.get("percentage_used") + if pct_used is not None: + result["wear_leveling_percent"] = 100 - pct_used + + # SAS/SCSI drives use different error counters + scsi_errors = data.get("scsi_error_counter_log") + if scsi_errors and not attrs: + result["smart_attributes"] = [{"scsi_error_log": scsi_errors}] + + return result + + +def _get_attr_raw(attrs: list[dict], attr_id: int) -> int | None: + """Get the raw_value for a SMART attribute by ID.""" + for attr in attrs: + if attr.get("id") == attr_id: + raw = attr.get("raw", {}) + return raw.get("value") + return None + + +def _get_attr_value(attrs: 
list[dict], attr_id: int) -> int | None: + """Get the normalized value for a SMART attribute by ID.""" + for attr in attrs: + if attr.get("id") == attr_id: + return attr.get("value") + return None