Initial commit: FastAPI JBOD monitor backend

This commit is contained in:
2026-03-07 02:14:17 +00:00
commit 9f918a3308
26 changed files with 651 additions and 0 deletions

0
services/__init__.py Normal file
View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

29
services/cache.py Normal file
View File

@@ -0,0 +1,29 @@
import time
from typing import Any
class TTLCache:
"""Simple in-memory TTL cache."""
def __init__(self, ttl_seconds: int = 60):
self._ttl = ttl_seconds
self._store: dict[str, tuple[float, Any]] = {}
def get(self, key: str) -> Any | None:
entry = self._store.get(key)
if entry is None:
return None
ts, value = entry
if time.monotonic() - ts > self._ttl:
del self._store[key]
return None
return value
def set(self, key: str, value: Any) -> None:
self._store[key] = (time.monotonic(), value)
def clear(self) -> None:
self._store.clear()
# Shared module-level cache instance with a 60-second TTL; services/smart.py
# imports it to cache per-device SMART results.
smart_cache = TTLCache(ttl_seconds=60)

118
services/enclosure.py Normal file
View File

@@ -0,0 +1,118 @@
import os
import logging
from pathlib import Path
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Root sysfs directory scanned for enclosure entries; each subdirectory is
# treated as one enclosure by discover_enclosures() and list_slots().
ENCLOSURE_BASE = Path("/sys/class/enclosure")
def _read_sysfs(path: Path) -> str:
    """Read a sysfs attribute file and return its stripped text.

    Args:
        path: Location of the attribute file.

    Returns:
        The file content with surrounding whitespace removed, or an empty
        string when the file cannot be read (missing, not a regular file,
        permission denied, I/O error) or its bytes do not decode as text.
    """
    try:
        return path.read_text().strip()
    except (OSError, UnicodeDecodeError):
        # OSError already covers IOError (they are the same class).
        # UnicodeDecodeError is a ValueError, so it must be listed separately
        # to keep the "empty string on failure" contract for binary content.
        return ""
def _find_sg_device(enclosure_path: Path) -> str | None:
"""Resolve the sg device for an enclosure from its sysfs path."""
# The enclosure sysfs directory has a 'device' symlink. Under that,
# there's a scsi_generic directory containing the sg device name.
sg_dir = enclosure_path / "device" / "scsi_generic"
if sg_dir.is_dir():
entries = list(sg_dir.iterdir())
if entries:
return f"/dev/{entries[0].name}"
return None
def discover_enclosures() -> list[dict]:
    """Scan ENCLOSURE_BASE and describe every enclosure directory found.

    Returns:
        One dict per enclosure, in sorted directory order, with the keys
        "id", "sg_device", "vendor", "model", "revision", "total_slots" and
        "populated_slots". An empty list (after logging a warning) when
        ENCLOSURE_BASE is not a directory.
    """
    if not ENCLOSURE_BASE.is_dir():
        logger.warning("No enclosure sysfs directory found at %s", ENCLOSURE_BASE)
        return []

    found: list[dict] = []
    for enc_path in sorted(ENCLOSURE_BASE.iterdir()):
        if not enc_path.is_dir():
            continue

        # Identity strings are read from the enclosure's 'device' directory.
        attr_dir = enc_path / "device"
        vendor = _read_sysfs(attr_dir / "vendor")
        model = _read_sysfs(attr_dir / "model")
        revision = _read_sysfs(attr_dir / "rev")
        sg_device = _find_sg_device(enc_path)

        slot_records = list_slots(enc_path.name)
        in_use = [rec for rec in slot_records if rec["populated"]]

        summary = {
            "id": enc_path.name,
            "sg_device": sg_device,
            "vendor": vendor,
            "model": model,
            "revision": revision,
            "total_slots": len(slot_records),
            "populated_slots": len(in_use),
        }
        found.append(summary)
    return found
def list_slots(enclosure_id: str) -> list[dict]:
    """Enumerate drive slots for an enclosure via sysfs.

    Args:
        enclosure_id: Name of the enclosure directory under ENCLOSURE_BASE.

    Returns:
        One dict per recognised slot directory with the keys "slot" (int),
        "populated" (bool) and "device" (block device name, or None),
        ordered by slot number. An empty list when the enclosure directory
        does not exist.
    """
    enc_dir = ENCLOSURE_BASE / enclosure_id
    if not enc_dir.is_dir():
        return []

    slots = []
    # Name order is kept as the tie-breaker for the numeric sort below.
    for entry in sorted(enc_dir.iterdir()):
        # Slot entries are directories like "Slot 00", "Slot 01", etc.
        # Some enclosures use "Disk" or "ArrayDevice" prefixes.
        if not entry.is_dir():
            continue
        slot_num = _parse_slot_number(entry.name)
        if slot_num is None:
            continue

        # A block device linked under the slot means a drive is present.
        device = None
        block_dir = entry / "device" / "block"
        if block_dir.is_dir():
            # Sorted so the pick does not depend on directory listing order.
            names = sorted(child.name for child in block_dir.iterdir())
            if names:
                device = names[0]

        populated = device is not None
        if not populated:
            # No block device found (directory missing or empty): fall back
            # to the 'status' file, where "not installed" means empty.
            status = _read_sysfs(entry / "status")
            populated = bool(status) and status != "not installed"

        slots.append({
            "slot": slot_num,
            "populated": populated,
            "device": device,
        })

    # Directory names sort lexicographically ("Disk 10" before "Disk 2"), so
    # order by the parsed number; the sort is stable, preserving name order
    # among entries that share a slot number.
    slots.sort(key=lambda record: record["slot"])
    return slots
def _parse_slot_number(name: str) -> int | None:
"""Extract the slot number from a sysfs slot directory name."""
# Handles "Slot 00", "Slot00", "Disk 1", "ArrayDevice00", etc.
for prefix in ("Slot ", "Slot", "Disk ", "Disk", "ArrayDevice", "SLOT "):
if name.startswith(prefix):
num_str = name[len(prefix):].strip()
try:
return int(num_str)
except ValueError:
return None
return None

159
services/smart.py Normal file
View File

@@ -0,0 +1,159 @@
import asyncio
import json
import logging
import re
import shutil
from services.cache import smart_cache
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# SMART attribute IDs of interest (matched against the "id" field of entries
# in smartctl's ata_smart_attributes table by _parse_smart_json).
ATTR_REALLOCATED = 5  # raw value reported as "reallocated_sectors"
ATTR_POWER_ON_HOURS = 9  # raw value used as fallback for "power_on_hours"
ATTR_TEMPERATURE = 194  # first fallback source for "temperature_c"
ATTR_TEMPERATURE_ALT = 190  # second fallback source for "temperature_c"
ATTR_PENDING = 197  # raw value reported as "pending_sectors"
ATTR_UNCORRECTABLE = 198  # raw value reported as "uncorrectable_errors"
ATTR_WEAR_LEVELING = 177 # SSD wear leveling; normalized value reported as "wear_leveling_percent"
def smartctl_available() -> bool:
    """Report whether a ``smartctl`` executable can be located via PATH."""
    located = shutil.which("smartctl")
    return located is not None
def sg_ses_available() -> bool:
    """Report whether an ``sg_ses`` executable can be located via PATH."""
    located = shutil.which("sg_ses")
    return located is not None
async def get_smart_data(device: str) -> dict:
    """Return parsed SMART data for a device, served from cache when possible.

    Args:
        device: Bare device name such as "sda" (not a path). Only ASCII
            letters, digits and hyphens are accepted.

    Returns:
        The dict produced by _run_smartctl for the device. Results are stored
        in smart_cache under the device name and reused until they expire.

    Raises:
        ValueError: If *device* contains any character outside the allowed
            set, or is empty.
    """
    # fullmatch rather than match with "$": "$" also matches just before a
    # trailing newline, which would let a name like "sda\n" through.
    if not re.fullmatch(r"[a-zA-Z0-9\-]+", device):
        raise ValueError(f"Invalid device name: {device}")

    cached = smart_cache.get(device)
    if cached is not None:
        return cached

    result = await _run_smartctl(device)
    smart_cache.set(device, result)
    return result
async def _run_smartctl(device: str) -> dict:
    """Run ``smartctl -a -j /dev/<device>`` and return the parsed result.

    Returns the dict built by _parse_smart_json on success. When smartctl
    cannot be launched, prints nothing on stdout, or prints something that is
    not valid JSON, returns a dict with an "error" message and
    "smart_supported" set to False instead.
    """

    def failure(message: str) -> dict:
        # Common shape of every error result produced here.
        return {"error": message, "smart_supported": False}

    try:
        proc = await asyncio.create_subprocess_exec(
            "smartctl",
            "-a",
            "-j",
            f"/dev/{device}",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        raw_out, _raw_err = await proc.communicate()
    except FileNotFoundError:
        return failure("smartctl not found")

    # The exit status is deliberately not checked: smartctl returns non-zero
    # for many non-fatal reasons (bit flags) and the JSON is still valid.
    if not raw_out:
        return failure(f"smartctl produced no output (exit code {proc.returncode})")

    try:
        payload = json.loads(raw_out)
    except json.JSONDecodeError:
        return failure("Failed to parse smartctl JSON output")
    return _parse_smart_json(device, payload)
def _parse_smart_json(device: str, data: dict) -> dict:
    """Extract the fields of interest from smartctl's JSON output.

    Args:
        device: Device name the output belongs to; copied into the result.
        data: Decoded JSON object printed by ``smartctl -a -j``.

    Returns:
        A dict with "device", "smart_supported" (always True here), identity
        fields ("model", "serial", "firmware", "capacity_bytes", and "wwn"
        when present), "smart_healthy", "temperature_c", "power_on_hours",
        "smart_attributes", "reallocated_sectors", "pending_sectors",
        "uncorrectable_errors" and "wear_leveling_percent". Values that the
        input does not provide are None.
    """
    result: dict = {"device": device, "smart_supported": True}

    # Identity
    result["model"] = data.get("model_name")
    result["serial"] = data.get("serial_number")
    result["firmware"] = data.get("firmware_version")
    result["capacity_bytes"] = data.get("user_capacity", {}).get("bytes")

    # WWN
    wwn_data = data.get("wwn")
    if wwn_data:
        # Reconstruct the WWN string from its components: 1 hex digit of NAA,
        # 6 of OUI and 9 of vendor id.
        naa = wwn_data.get("naa", 0)
        oui = wwn_data.get("oui", 0)
        wwn_id = wwn_data.get("id", 0)
        result["wwn"] = f"{naa:x}{oui:06x}{wwn_id:09x}"

    # SMART health
    result["smart_healthy"] = data.get("smart_status", {}).get("passed")

    # Temperature and power-on hours as pre-digested by smartctl
    result["temperature_c"] = data.get("temperature", {}).get("current")
    result["power_on_hours"] = data.get("power_on_time", {}).get("hours")

    # SMART attributes (ATA)
    attrs = data.get("ata_smart_attributes", {}).get("table", [])
    result["smart_attributes"] = attrs
    result["reallocated_sectors"] = _get_attr_raw(attrs, ATTR_REALLOCATED)
    result["pending_sectors"] = _get_attr_raw(attrs, ATTR_PENDING)
    result["uncorrectable_errors"] = _get_attr_raw(attrs, ATTR_UNCORRECTABLE)
    result["wear_leveling_percent"] = _get_attr_value(attrs, ATTR_WEAR_LEVELING)

    # Power-on hours fallback from attributes
    if result["power_on_hours"] is None:
        result["power_on_hours"] = _get_attr_raw(attrs, ATTR_POWER_ON_HOURS)

    # Temperature fallback from attributes: try 194 first, then 190.
    if result["temperature_c"] is None:
        for temp_attr_id in (ATTR_TEMPERATURE, ATTR_TEMPERATURE_ALT):
            raw_temp = _get_attr_raw(attrs, temp_attr_id)
            if raw_temp is None:
                continue
            # The raw field of temperature attributes can carry min/max
            # readings in its higher bytes; the current temperature is the
            # lowest byte, so using the whole integer would give nonsense.
            if isinstance(raw_temp, int):
                raw_temp &= 0xFF
            result["temperature_c"] = raw_temp
            break

    # NVMe attributes (different structure)
    nvme_health = data.get("nvme_smart_health_information_log")
    if nvme_health:
        result["smart_attributes"] = [{"nvme_health": nvme_health}]
        if result["temperature_c"] is None:
            result["temperature_c"] = nvme_health.get("temperature")
        if result["power_on_hours"] is None:
            result["power_on_hours"] = nvme_health.get("power_on_hours")
        if result["wear_leveling_percent"] is None:
            pct_used = nvme_health.get("percentage_used")
            if pct_used is not None:
                # percentage_used may exceed 100 on a drive past its rated
                # endurance; clamp so remaining life never goes negative.
                result["wear_leveling_percent"] = max(0, 100 - pct_used)

    # SAS/SCSI drives use different error counters
    scsi_errors = data.get("scsi_error_counter_log")
    if scsi_errors and not attrs:
        result["smart_attributes"] = [{"scsi_error_log": scsi_errors}]

    return result
def _get_attr_raw(attrs: list[dict], attr_id: int) -> int | None:
"""Get the raw_value for a SMART attribute by ID."""
for attr in attrs:
if attr.get("id") == attr_id:
raw = attr.get("raw", {})
return raw.get("value")
return None
def _get_attr_value(attrs: list[dict], attr_id: int) -> int | None:
"""Get the normalized value for a SMART attribute by ID."""
for attr in attrs:
if attr.get("id") == attr_id:
return attr.get("value")
return None