Initial commit: FastAPI JBOD monitor backend
This commit is contained in:
0
services/__init__.py
Normal file
0
services/__init__.py
Normal file
BIN
services/__pycache__/__init__.cpython-314.pyc
Normal file
BIN
services/__pycache__/__init__.cpython-314.pyc
Normal file
Binary file not shown.
BIN
services/__pycache__/cache.cpython-314.pyc
Normal file
BIN
services/__pycache__/cache.cpython-314.pyc
Normal file
Binary file not shown.
BIN
services/__pycache__/enclosure.cpython-314.pyc
Normal file
BIN
services/__pycache__/enclosure.cpython-314.pyc
Normal file
Binary file not shown.
BIN
services/__pycache__/smart.cpython-314.pyc
Normal file
BIN
services/__pycache__/smart.cpython-314.pyc
Normal file
Binary file not shown.
29
services/cache.py
Normal file
29
services/cache.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
|
||||
class TTLCache:
|
||||
"""Simple in-memory TTL cache."""
|
||||
|
||||
def __init__(self, ttl_seconds: int = 60):
|
||||
self._ttl = ttl_seconds
|
||||
self._store: dict[str, tuple[float, Any]] = {}
|
||||
|
||||
def get(self, key: str) -> Any | None:
|
||||
entry = self._store.get(key)
|
||||
if entry is None:
|
||||
return None
|
||||
ts, value = entry
|
||||
if time.monotonic() - ts > self._ttl:
|
||||
del self._store[key]
|
||||
return None
|
||||
return value
|
||||
|
||||
def set(self, key: str, value: Any) -> None:
|
||||
self._store[key] = (time.monotonic(), value)
|
||||
|
||||
def clear(self) -> None:
|
||||
self._store.clear()
|
||||
|
||||
|
||||
# Module-level cache instance with a 60-second time-to-live; imported by
# services/smart.py to hold per-device SMART results.
smart_cache = TTLCache(ttl_seconds=60)
|
||||
118
services/enclosure.py
Normal file
118
services/enclosure.py
Normal file
@@ -0,0 +1,118 @@
|
||||
import os
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
# sysfs directory scanned for enclosures; each subdirectory is treated as one enclosure.
ENCLOSURE_BASE = Path("/sys/class/enclosure")
|
||||
|
||||
|
||||
def _read_sysfs(path: Path) -> str:
|
||||
"""Read a sysfs attribute file, return stripped content or empty string."""
|
||||
try:
|
||||
return path.read_text().strip()
|
||||
except (OSError, IOError):
|
||||
return ""
|
||||
|
||||
|
||||
def _find_sg_device(enclosure_path: Path) -> str | None:
|
||||
"""Resolve the sg device for an enclosure from its sysfs path."""
|
||||
# The enclosure sysfs directory has a 'device' symlink. Under that,
|
||||
# there's a scsi_generic directory containing the sg device name.
|
||||
sg_dir = enclosure_path / "device" / "scsi_generic"
|
||||
if sg_dir.is_dir():
|
||||
entries = list(sg_dir.iterdir())
|
||||
if entries:
|
||||
return f"/dev/{entries[0].name}"
|
||||
return None
|
||||
|
||||
|
||||
def discover_enclosures() -> list[dict]:
    """Scan ``ENCLOSURE_BASE`` and describe every enclosure directory found.

    Returns:
        One dict per enclosure, ordered by directory name, with the keys
        ``id``, ``sg_device``, ``vendor``, ``model``, ``revision``,
        ``total_slots`` and ``populated_slots``. An empty list (after logging
        a warning) when ``ENCLOSURE_BASE`` is not a directory.
    """
    if not ENCLOSURE_BASE.is_dir():
        logger.warning("No enclosure sysfs directory found at %s", ENCLOSURE_BASE)
        return []

    found: list[dict] = []
    for enclosure_path in sorted(ENCLOSURE_BASE.iterdir()):
        # Only directories are treated as enclosures.
        if enclosure_path.is_dir():
            identifier = enclosure_path.name
            attr_root = enclosure_path / "device"

            # Identity strings are read from the enclosure's device directory.
            vendor_text = _read_sysfs(attr_root / "vendor")
            model_text = _read_sysfs(attr_root / "model")
            rev_text = _read_sysfs(attr_root / "rev")
            sg_node = _find_sg_device(enclosure_path)

            slot_info = list_slots(identifier)
            occupied = [slot for slot in slot_info if slot["populated"]]

            found.append({
                "id": identifier,
                "sg_device": sg_node,
                "vendor": vendor_text,
                "model": model_text,
                "revision": rev_text,
                "total_slots": len(slot_info),
                "populated_slots": len(occupied),
            })

    return found
|
||||
|
||||
|
||||
def list_slots(enclosure_id: str) -> list[dict]:
    """Enumerate the drive slots of one enclosure via sysfs.

    Args:
        enclosure_id: Name of the enclosure's directory under
            ``ENCLOSURE_BASE``.

    Returns:
        One dict per recognised slot directory with the keys ``slot`` (int),
        ``populated`` (bool) and ``device`` (block device name, or ``None``),
        ordered by ascending slot number. An empty list when the enclosure
        directory does not exist.
    """
    # NOTE(review): enclosure_id is joined into the path unchecked; if it can
    # originate from a request, validate it before calling — TODO confirm.
    enc_dir = ENCLOSURE_BASE / enclosure_id
    if not enc_dir.is_dir():
        return []

    slots = []
    # Name order is kept as a tie-breaker for entries sharing a slot number.
    for entry in sorted(enc_dir.iterdir()):
        # Slot entries are directories like "Slot 00", "Slot 01", etc.
        # Some enclosures use "Disk" or "ArrayDevice" prefixes.
        if not entry.is_dir():
            continue
        slot_num = _parse_slot_number(entry.name)
        if slot_num is None:
            continue

        # Check if a block device is linked in this slot.
        block_dir = entry / "device" / "block"
        device = None
        populated = False

        if block_dir.is_dir():
            # iterdir() order is arbitrary; sort so the reported device is
            # deterministic.
            dev_names = sorted(child.name for child in block_dir.iterdir())
            if dev_names:
                device = dev_names[0]
                populated = True
        else:
            # Also check the 'status' file — "not installed" means empty.
            status = _read_sysfs(entry / "status")
            if status and status != "not installed":
                populated = True

        slots.append({
            "slot": slot_num,
            "populated": populated,
            "device": device,
        })

    # Directory names sort as strings ("Disk 10" before "Disk 2"), so order
    # the result by the parsed slot number instead. list.sort is stable.
    slots.sort(key=lambda item: item["slot"])
    return slots
|
||||
|
||||
|
||||
def _parse_slot_number(name: str) -> int | None:
|
||||
"""Extract the slot number from a sysfs slot directory name."""
|
||||
# Handles "Slot 00", "Slot00", "Disk 1", "ArrayDevice00", etc.
|
||||
for prefix in ("Slot ", "Slot", "Disk ", "Disk", "ArrayDevice", "SLOT "):
|
||||
if name.startswith(prefix):
|
||||
num_str = name[len(prefix):].strip()
|
||||
try:
|
||||
return int(num_str)
|
||||
except ValueError:
|
||||
return None
|
||||
return None
|
||||
159
services/smart.py
Normal file
159
services/smart.py
Normal file
@@ -0,0 +1,159 @@
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import shutil
|
||||
|
||||
from services.cache import smart_cache
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
# SMART attribute IDs looked up in smartctl's ATA attribute table.
ATTR_REALLOCATED = 5  # raw value reported as "reallocated_sectors"
ATTR_POWER_ON_HOURS = 9  # fallback source for "power_on_hours"
ATTR_TEMPERATURE = 194  # first fallback source for "temperature_c"
ATTR_TEMPERATURE_ALT = 190  # second fallback source for "temperature_c"
ATTR_PENDING = 197  # raw value reported as "pending_sectors"
ATTR_UNCORRECTABLE = 198  # raw value reported as "uncorrectable_errors"
ATTR_WEAR_LEVELING = 177  # SSD wear leveling
|
||||
|
||||
|
||||
def smartctl_available() -> bool:
    """Report whether a ``smartctl`` executable can be found on the search path."""
    located = shutil.which("smartctl")
    return located is not None
|
||||
|
||||
|
||||
def sg_ses_available() -> bool:
    """Report whether an ``sg_ses`` executable can be found on the search path."""
    located = shutil.which("sg_ses")
    return located is not None
|
||||
|
||||
|
||||
async def get_smart_data(device: str) -> dict:
    """Return SMART data for a device, served from the shared cache when fresh.

    On a cache miss, smartctl is run for the device and whatever it returns
    (including an error dict) is stored in the cache before being returned.

    Args:
        device: Bare device name without the ``/dev/`` prefix.

    Returns:
        The parsed SMART result dict, or an error dict from the smartctl
        runner.

    Raises:
        ValueError: If *device* contains anything other than ASCII letters,
            digits and hyphens, or is empty.
    """
    # Sanitize device name: only allow alphanumeric and hyphens.
    # fullmatch is required here: with re.match, a trailing "$" also matches
    # just before a final newline, so a name like "sda\n" would be accepted.
    if not re.fullmatch(r"[a-zA-Z0-9\-]+", device):
        raise ValueError(f"Invalid device name: {device}")

    cached = smart_cache.get(device)
    if cached is not None:
        return cached

    result = await _run_smartctl(device)
    smart_cache.set(device, result)
    return result
|
||||
|
||||
|
||||
async def _run_smartctl(device: str) -> dict:
    """Run ``smartctl -a -j`` on ``/dev/<device>`` and parse its JSON output.

    Returns:
        The dict produced by ``_parse_smart_json`` on success. Otherwise a
        dict with an ``error`` message and ``smart_supported`` set to
        ``False`` when the executable is not found, nothing is written to
        stdout, or stdout is not valid JSON.
    """
    argv = ("smartctl", "-a", "-j", f"/dev/{device}")
    pipe = asyncio.subprocess.PIPE

    try:
        child = await asyncio.create_subprocess_exec(*argv, stdout=pipe, stderr=pipe)
        out, _err = await child.communicate()
    except FileNotFoundError:
        return {"error": "smartctl not found", "smart_supported": False}

    # smartctl returns non-zero for many non-fatal reasons (bit flags), and
    # its JSON output is still valid then, so only stdout is inspected.
    if not out:
        message = f"smartctl produced no output (exit code {child.returncode})"
        return {"error": message, "smart_supported": False}

    try:
        parsed = json.loads(out)
    except json.JSONDecodeError:
        return {"error": "Failed to parse smartctl JSON output", "smart_supported": False}

    return _parse_smart_json(device, parsed)
|
||||
|
||||
|
||||
def _parse_smart_json(device: str, data: dict) -> dict:
    """Extract the fields of interest from smartctl's JSON output.

    Args:
        device: Device name, copied into the result under ``device``.
        data: Decoded smartctl JSON document.

    Returns:
        A dict with ``device``, ``smart_supported`` (always ``True`` here),
        ``model``, ``serial``, ``firmware``, ``capacity_bytes``, ``wwn`` (only
        when the document has a ``wwn`` section), ``smart_healthy``,
        ``temperature_c``, ``power_on_hours``, ``smart_attributes``,
        ``reallocated_sectors``, ``pending_sectors``, ``uncorrectable_errors``
        and ``wear_leveling_percent``. Values missing from the input are
        ``None``.
    """
    result: dict = {"device": device, "smart_supported": True}

    # Identity
    result["model"] = data.get("model_name")
    result["serial"] = data.get("serial_number")
    result["firmware"] = data.get("firmware_version")
    result["capacity_bytes"] = data.get("user_capacity", {}).get("bytes")

    # WWN
    wwn_data = data.get("wwn")
    if wwn_data:
        # Reconstruct the WWN string from its components: 1 hex digit of NAA,
        # 6 of OUI and 9 of vendor id.
        naa = wwn_data.get("naa", 0)
        oui = wwn_data.get("oui", 0)
        wwn_id = wwn_data.get("id", 0)
        result["wwn"] = f"{naa:x}{oui:06x}{wwn_id:09x}"

    # SMART health
    smart_status = data.get("smart_status", {})
    result["smart_healthy"] = smart_status.get("passed")

    # Temperature
    temp = data.get("temperature", {})
    result["temperature_c"] = temp.get("current")

    # Power-on hours
    poh = data.get("power_on_time", {})
    result["power_on_hours"] = poh.get("hours")

    # SMART attributes (ATA)
    attrs = data.get("ata_smart_attributes", {}).get("table", [])
    result["smart_attributes"] = attrs

    result["reallocated_sectors"] = _get_attr_raw(attrs, ATTR_REALLOCATED)
    result["pending_sectors"] = _get_attr_raw(attrs, ATTR_PENDING)
    result["uncorrectable_errors"] = _get_attr_raw(attrs, ATTR_UNCORRECTABLE)
    result["wear_leveling_percent"] = _get_attr_value(attrs, ATTR_WEAR_LEVELING)

    # Power-on hours fallback from attributes
    if result["power_on_hours"] is None:
        result["power_on_hours"] = _get_attr_raw(attrs, ATTR_POWER_ON_HOURS)

    # Temperature fallback from attributes, primary ID first.
    if result["temperature_c"] is None:
        for temp_attr_id in (ATTR_TEMPERATURE, ATTR_TEMPERATURE_ALT):
            raw_temp = _get_attr_raw(attrs, temp_attr_id)
            if raw_temp is not None:
                # Many drives pack min/max readings into the upper bytes of
                # this raw value; the current temperature is the low byte.
                result["temperature_c"] = raw_temp & 0xFF
                break

    # NVMe attributes (different structure)
    nvme_health = data.get("nvme_smart_health_information_log")
    if nvme_health:
        result["smart_attributes"] = [{"nvme_health": nvme_health}]
        if result["temperature_c"] is None:
            result["temperature_c"] = nvme_health.get("temperature")
        if result["power_on_hours"] is None:
            result["power_on_hours"] = nvme_health.get("power_on_hours")
        if result["wear_leveling_percent"] is None:
            pct_used = nvme_health.get("percentage_used")
            if pct_used is not None:
                # percentage_used may exceed 100 on a drive past its rated
                # endurance; clamp so the remaining-life figure is never
                # negative.
                result["wear_leveling_percent"] = max(0, 100 - pct_used)

    # SAS/SCSI drives use different error counters
    scsi_errors = data.get("scsi_error_counter_log")
    if scsi_errors and not attrs:
        result["smart_attributes"] = [{"scsi_error_log": scsi_errors}]

    return result
|
||||
|
||||
|
||||
def _get_attr_raw(attrs: list[dict], attr_id: int) -> int | None:
|
||||
"""Get the raw_value for a SMART attribute by ID."""
|
||||
for attr in attrs:
|
||||
if attr.get("id") == attr_id:
|
||||
raw = attr.get("raw", {})
|
||||
return raw.get("value")
|
||||
return None
|
||||
|
||||
|
||||
def _get_attr_value(attrs: list[dict], attr_id: int) -> int | None:
|
||||
"""Get the normalized value for a SMART attribute by ID."""
|
||||
for attr in attrs:
|
||||
if attr.get("id") == attr_id:
|
||||
return attr.get("value")
|
||||
return None
|
||||
Reference in New Issue
Block a user