Initial commit: FastAPI JBOD monitor backend

2026-03-07 02:14:17 +00:00
commit 9f918a3308
26 changed files with 651 additions and 0 deletions
--- a/services/__init__.py
+++ b/services/__init__.py
--- a/services/__pycache__/__init__.cpython-314.pyc
+++ b/services/__pycache__/__init__.cpython-314.pyc
--- a/services/__pycache__/cache.cpython-314.pyc
+++ b/services/__pycache__/cache.cpython-314.pyc
--- a/services/__pycache__/enclosure.cpython-314.pyc
+++ b/services/__pycache__/enclosure.cpython-314.pyc
--- a/services/__pycache__/smart.cpython-314.pyc
+++ b/services/__pycache__/smart.cpython-314.pyc
--- a/services/cache.py
+++ b/services/cache.py
@@ -0,0 +1,29 @@
+import time
+from typing import Any
+
+
+class TTLCache:
+    """In-process key/value store whose entries expire after a fixed lifetime.
+
+    Every value is stored together with the ``time.monotonic()`` reading
+    taken when it was written. An expired entry is removed only when it is
+    looked up; nothing is evicted in the background. Because a miss is
+    reported as ``None``, a stored ``None`` cannot be told apart from a miss.
+    """
+
+    def __init__(self, ttl_seconds: int = 60):
+        # Maps key -> (monotonic timestamp at write time, stored value).
+        self._store: dict[str, tuple[float, Any]] = {}
+        self._ttl = ttl_seconds
+
+    def get(self, key: str) -> Any | None:
+        """Return the live value for *key*, or ``None`` if absent or expired."""
+        record = self._store.get(key)
+        if record is not None:
+            written_at, payload = record
+            age = time.monotonic() - written_at
+            if age > self._ttl:
+                # Too old: drop it so the stale value is not kept around.
+                del self._store[key]
+            else:
+                return payload
+        return None
+
+    def set(self, key: str, value: Any) -> None:
+        """Store *value* under *key*, stamped with the current monotonic time."""
+        now = time.monotonic()
+        self._store[key] = (now, value)
+
+    def clear(self) -> None:
+        """Remove every entry."""
+        self._store.clear()
+
+
+# Shared module-level cache instance; its entries expire after 60 seconds.
+smart_cache = TTLCache(ttl_seconds=60)
--- a/services/enclosure.py
+++ b/services/enclosure.py
@@ -0,0 +1,118 @@
+import os
+import logging
+from pathlib import Path
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+# sysfs directory whose child directories are treated as enclosures.
+ENCLOSURE_BASE = Path("/sys/class/enclosure")
+
+
+def _read_sysfs(path: Path) -> str:
+    """Read a sysfs attribute file and return its stripped text.
+
+    Args:
+        path: Location of the attribute file.
+
+    Returns:
+        The file content with surrounding whitespace removed, or an empty
+        string when the file cannot be read (missing, a directory,
+        permission denied, ...). Bytes that are not valid UTF-8 are replaced
+        with U+FFFD rather than raising.
+    """
+    try:
+        # Decode leniently: an undecodable byte would otherwise raise
+        # UnicodeDecodeError (a ValueError, not an OSError) and escape the
+        # "empty string on failure" contract.
+        text = path.read_text(encoding="utf-8", errors="replace")
+    except OSError:  # IOError is just an alias of OSError on Python 3
+        return ""
+    return text.strip()
+
+
+def _find_sg_device(enclosure_path: Path) -> str | None:
+    """Return the ``/dev/<name>`` path of an enclosure's sg device, if any.
+
+    Looks in ``<enclosure_path>/device/scsi_generic`` and builds the device
+    path from the name of the first entry yielded for that directory.
+    Returns ``None`` when the directory is missing or has no entries.
+    """
+    generic_dir = enclosure_path / "device" / "scsi_generic"
+    if not generic_dir.is_dir():
+        return None
+    for child in generic_dir.iterdir():
+        # Only the first entry is used.
+        return f"/dev/{child.name}"
+    return None
+
+
+def discover_enclosures() -> list[dict]:
+    """Describe every enclosure directory found under ``ENCLOSURE_BASE``.
+
+    Returns:
+        One dict per enclosure directory, in sorted path order, holding the
+        enclosure id (directory name), its sg device path (or ``None``), the
+        vendor / model / revision strings read from sysfs, and the total and
+        populated slot counts. When the base directory does not exist a
+        warning is logged and an empty list is returned.
+    """
+    if not ENCLOSURE_BASE.is_dir():
+        logger.warning("No enclosure sysfs directory found at %s", ENCLOSURE_BASE)
+        return []
+
+    found = []
+    for candidate in sorted(ENCLOSURE_BASE.iterdir()):
+        if candidate.is_dir():
+            name = candidate.name
+            attr_root = candidate / "device"
+
+            # Identity strings come from attribute files under "device".
+            vendor, model, revision = (
+                _read_sysfs(attr_root / leaf) for leaf in ("vendor", "model", "rev")
+            )
+            sg_path = _find_sg_device(candidate)
+
+            bays = list_slots(name)
+            occupied = [bay for bay in bays if bay["populated"]]
+
+            summary = {
+                "id": name,
+                "sg_device": sg_path,
+                "vendor": vendor,
+                "model": model,
+                "revision": revision,
+                "total_slots": len(bays),
+                "populated_slots": len(occupied),
+            }
+            found.append(summary)
+
+    return found
+
+
+def list_slots(enclosure_id: str) -> list[dict]:
+    """Enumerate drive slots for an enclosure via sysfs.
+
+    Args:
+        enclosure_id: Name of the enclosure directory under
+            ``ENCLOSURE_BASE``.
+
+    Returns:
+        One dict per recognised slot directory with the keys ``"slot"``
+        (parsed slot number), ``"populated"`` (bool) and ``"device"`` (block
+        device name or ``None``), ordered by ascending slot number. An empty
+        list is returned when the enclosure directory does not exist.
+    """
+    enc_dir = ENCLOSURE_BASE / enclosure_id
+    if not enc_dir.is_dir():
+        return []
+
+    slots = []
+    for entry in sorted(enc_dir.iterdir()):
+        # Slot entries are directories like "Slot 00", "Slot 01", etc.
+        # Some enclosures use "Disk" or "ArrayDevice" prefixes.
+        if not entry.is_dir():
+            continue
+        slot_num = _parse_slot_number(entry.name)
+        if slot_num is None:
+            continue
+
+        # Check if a block device is linked in this slot
+        block_dir = entry / "device" / "block"
+        device = None
+        populated = False
+
+        if block_dir.is_dir():
+            devs = list(block_dir.iterdir())
+            if devs:
+                device = devs[0].name
+                populated = True
+        else:
+            # Also check the 'status' file — "not installed" means empty
+            status = _read_sysfs(entry / "status")
+            if status and status != "not installed":
+                populated = True
+
+        slots.append({
+            "slot": slot_num,
+            "populated": populated,
+            "device": device,
+        })
+
+    # Directory names sort lexicographically, so un-padded names come out as
+    # "Disk 1", "Disk 10", "Disk 2". Order by the parsed number instead; the
+    # sort is stable, so equal numbers keep their path order.
+    slots.sort(key=lambda slot: slot["slot"])
+    return slots
+
+
+def _parse_slot_number(name: str) -> int | None:
+    """Extract the slot number from a sysfs slot directory name.
+
+    Recognises the prefixes "Slot", "Disk" and "ArrayDevice" in any letter
+    case, optionally followed by whitespace, e.g. "Slot 00", "Slot00",
+    "SLOT03", "Disk 1", "ArrayDevice00".
+
+    Args:
+        name: Directory name to parse.
+
+    Returns:
+        The number following the prefix, or ``None`` when *name* has no
+        recognised prefix or the remainder is not an integer.
+    """
+    for prefix in ("slot", "disk", "arraydevice"):
+        head, tail = name[:len(prefix)], name[len(prefix):]
+        # Compare only the leading slice so the index used to split the name
+        # always refers to the original string.
+        if head.lower() != prefix:
+            continue
+        try:
+            return int(tail.strip())
+        except ValueError:
+            return None
+    return None
--- a/services/smart.py
+++ b/services/smart.py
@@ -0,0 +1,159 @@
+import asyncio
+import json
+import logging
+import re
+import shutil
+
+from services.cache import smart_cache
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+# SMART attribute IDs of interest (compared against the "id" field of the
+# entries in smartctl's ATA attribute table).
+ATTR_REALLOCATED = 5  # raw value feeds "reallocated_sectors"
+ATTR_POWER_ON_HOURS = 9  # raw value is the fallback for "power_on_hours"
+ATTR_TEMPERATURE = 194  # raw value is the first fallback for "temperature_c"
+ATTR_TEMPERATURE_ALT = 190  # raw value is the second fallback for "temperature_c"
+ATTR_PENDING = 197  # raw value feeds "pending_sectors"
+ATTR_UNCORRECTABLE = 198  # raw value feeds "uncorrectable_errors"
+ATTR_WEAR_LEVELING = 177  # SSD wear leveling
+
+
+def smartctl_available() -> bool:
+    """Report whether a ``smartctl`` executable can be found on ``PATH``."""
+    located = shutil.which("smartctl")
+    return located is not None
+
+
+def sg_ses_available() -> bool:
+    """Report whether an ``sg_ses`` executable can be found on ``PATH``."""
+    located = shutil.which("sg_ses")
+    return located is not None
+
+
+async def get_smart_data(device: str) -> dict:
+    """Return parsed SMART data for a device, served from cache when possible.
+
+    Args:
+        device: Bare device name without the ``/dev/`` prefix. Only ASCII
+            letters, digits and hyphens are accepted.
+
+    Returns:
+        The dict produced by ``_run_smartctl``. The result is stored in
+        ``smart_cache`` under the device name and reused until it expires.
+
+    Raises:
+        ValueError: If *device* is empty or contains any other character.
+    """
+    # fullmatch rather than match + "$": "$" also matches just before a
+    # trailing newline, which let names such as "sda\n" pass validation.
+    if not re.fullmatch(r"[a-zA-Z0-9\-]+", device):
+        raise ValueError(f"Invalid device name: {device}")
+
+    cached = smart_cache.get(device)
+    if cached is not None:
+        return cached
+
+    result = await _run_smartctl(device)
+    smart_cache.set(device, result)
+    return result
+
+
+async def _run_smartctl(device: str) -> dict:
+    """Execute ``smartctl -a -j /dev/<device>`` and parse its JSON output.
+
+    Args:
+        device: Bare device name; ``/dev/`` is prepended here.
+
+    Returns:
+        The dict built by ``_parse_smart_json`` on success. On failure, a
+        dict with an ``"error"`` message and ``"smart_supported": False``:
+        smartctl missing or not executable, no output produced, or output
+        that is not a JSON object.
+    """
+    dev_path = f"/dev/{device}"
+
+    try:
+        proc = await asyncio.create_subprocess_exec(
+            "smartctl", "-a", "-j", dev_path,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        stdout, stderr = await proc.communicate()
+    except FileNotFoundError:
+        return {"error": "smartctl not found", "smart_supported": False}
+    except OSError as exc:
+        # e.g. PermissionError when the binary exists but cannot be executed;
+        # report it like the other failures instead of letting it propagate.
+        logger.warning("Could not execute smartctl for %s: %s", dev_path, exc)
+        return {
+            "error": f"Failed to execute smartctl: {exc}",
+            "smart_supported": False,
+        }
+
+    # smartctl returns non-zero for many non-fatal reasons (bit flags).
+    # The JSON output is still valid even with non-zero exit codes.
+    if not stdout:
+        if stderr:
+            # stderr is the only hint about why nothing was printed.
+            logger.warning(
+                "smartctl stderr for %s: %s",
+                dev_path,
+                stderr.decode(errors="replace").strip(),
+            )
+        return {
+            "error": f"smartctl produced no output (exit code {proc.returncode})",
+            "smart_supported": False,
+        }
+
+    try:
+        data = json.loads(stdout)
+    except ValueError:
+        # Covers json.JSONDecodeError and the UnicodeDecodeError raised when
+        # the byte output cannot be decoded.
+        return {"error": "Failed to parse smartctl JSON output", "smart_supported": False}
+
+    if not isinstance(data, dict):
+        # The parser expects a JSON object; anything else is unusable.
+        return {"error": "Failed to parse smartctl JSON output", "smart_supported": False}
+
+    return _parse_smart_json(device, data)
+
+
+def _raw_to_celsius(raw_value: int | None) -> int | None:
+    """Reduce a temperature attribute's raw value to its lowest byte.
+
+    The raw value is a multi-byte integer. Drives commonly keep the current
+    temperature in the lowest byte and may pack other readings (such as
+    min/max) into the higher bytes, which would otherwise produce an
+    absurdly large number.
+    """
+    if raw_value is None:
+        return None
+    return raw_value & 0xFF
+
+
+def _parse_smart_json(device: str, data: dict) -> dict:
+    """Extract relevant fields from smartctl JSON output.
+
+    Args:
+        device: Device name, copied into the result under ``"device"``.
+        data: Decoded smartctl JSON document.
+
+    Returns:
+        A dict with identity fields (model, serial, firmware, capacity and,
+        when reported, WWN), overall health, temperature, power-on hours,
+        selected error counters, a wear indicator and the attribute data
+        under ``"smart_attributes"``. Fields the drive does not report are
+        ``None``.
+    """
+    result: dict = {"device": device, "smart_supported": True}
+
+    # Identity
+    result["model"] = data.get("model_name")
+    result["serial"] = data.get("serial_number")
+    result["firmware"] = data.get("firmware_version")
+    # "or {}" keeps a section that is present but null from breaking .get().
+    result["capacity_bytes"] = (data.get("user_capacity") or {}).get("bytes")
+
+    # WWN
+    wwn_data = data.get("wwn")
+    if wwn_data:
+        # Reconstruct WWN string from components
+        naa = wwn_data.get("naa", 0)
+        oui = wwn_data.get("oui", 0)
+        wwn_id = wwn_data.get("id", 0)
+        result["wwn"] = f"{naa:x}{oui:06x}{wwn_id:09x}"
+
+    # SMART health
+    smart_status = data.get("smart_status") or {}
+    result["smart_healthy"] = smart_status.get("passed")
+
+    # Temperature
+    temp = data.get("temperature") or {}
+    result["temperature_c"] = temp.get("current")
+
+    # Power-on hours
+    poh = data.get("power_on_time") or {}
+    result["power_on_hours"] = poh.get("hours")
+
+    # SMART attributes (ATA)
+    attrs = (data.get("ata_smart_attributes") or {}).get("table") or []
+    result["smart_attributes"] = attrs
+
+    result["reallocated_sectors"] = _get_attr_raw(attrs, ATTR_REALLOCATED)
+    result["pending_sectors"] = _get_attr_raw(attrs, ATTR_PENDING)
+    result["uncorrectable_errors"] = _get_attr_raw(attrs, ATTR_UNCORRECTABLE)
+    result["wear_leveling_percent"] = _get_attr_value(attrs, ATTR_WEAR_LEVELING)
+
+    # Power-on hours fallback from attributes
+    if result["power_on_hours"] is None:
+        result["power_on_hours"] = _get_attr_raw(attrs, ATTR_POWER_ON_HOURS)
+
+    # Temperature fallback from attributes; only the low byte is used
+    if result["temperature_c"] is None:
+        result["temperature_c"] = _raw_to_celsius(
+            _get_attr_raw(attrs, ATTR_TEMPERATURE)
+        )
+    if result["temperature_c"] is None:
+        result["temperature_c"] = _raw_to_celsius(
+            _get_attr_raw(attrs, ATTR_TEMPERATURE_ALT)
+        )
+
+    # NVMe attributes (different structure)
+    nvme_health = data.get("nvme_smart_health_information_log")
+    if nvme_health:
+        result["smart_attributes"] = [{"nvme_health": nvme_health}]
+        if result["temperature_c"] is None:
+            result["temperature_c"] = nvme_health.get("temperature")
+        if result["power_on_hours"] is None:
+            result["power_on_hours"] = nvme_health.get("power_on_hours")
+        if result["wear_leveling_percent"] is None:
+            pct_used = nvme_health.get("percentage_used")
+            if pct_used is not None:
+                # percentage_used may exceed 100 on a worn drive; clamp so
+                # the remaining-life figure never goes negative.
+                result["wear_leveling_percent"] = max(0, 100 - pct_used)
+
+    # SAS/SCSI drives use different error counters
+    scsi_errors = data.get("scsi_error_counter_log")
+    if scsi_errors and not attrs:
+        result["smart_attributes"] = [{"scsi_error_log": scsi_errors}]
+
+    return result
+
+
+def _get_attr_raw(attrs: list[dict], attr_id: int) -> int | None:
+    """Return ``raw.value`` of the first attribute whose id equals *attr_id*.
+
+    ``None`` is returned when no attribute matches, or when the first match
+    carries no raw value.
+    """
+    match = next((item for item in attrs if item.get("id") == attr_id), None)
+    if match is None:
+        return None
+    return match.get("raw", {}).get("value")
+
+
+def _get_attr_value(attrs: list[dict], attr_id: int) -> int | None:
+    """Return ``value`` of the first attribute whose id equals *attr_id*.
+
+    ``None`` is returned when no attribute matches, or when the first match
+    has no ``"value"`` key.
+    """
+    match = next((item for item in attrs if item.get("id") == attr_id), None)
+    return None if match is None else match.get("value")