Initial commit: FastAPI JBOD monitor backend
This commit is contained in:
4
.dockerignore
Normal file
4
.dockerignore
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
__pycache__
|
||||||
|
*.pyc
|
||||||
|
.git
|
||||||
|
.venv
|
||||||
16
Dockerfile
Normal file
16
Dockerfile
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
# Container image for the JBOD Monitor API.
FROM python:3.13-slim

# smartmontools ships smartctl and sg3-utils ships sg_ses, the two CLI tools
# the service probes for at startup. The apt package lists are removed in the
# same layer so they do not end up in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends smartmontools sg3-utils && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Dependencies are installed before the sources are copied so this layer can
# be reused when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 8000

# Serve the FastAPI application object `app` from main.py on all interfaces.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
45
README.md
Normal file
45
README.md
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
# JBOD Monitor
|
||||||
|
|
||||||
|
REST API for monitoring drive health in JBOD enclosures on Linux.
|
||||||
|
|
||||||
|
Auto-discovers SES enclosures via sysfs, maps drives to physical slots, and exposes SMART health data.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- Linux with SAS/SATA JBODs connected via HBA
|
||||||
|
- `smartmontools` — for `smartctl` (SMART data)
|
||||||
|
- `sg3-utils` — for `sg_ses` (SES enclosure data)
|
||||||
|
- Python 3.11+
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Debian/Ubuntu
|
||||||
|
apt install smartmontools sg3-utils
|
||||||
|
|
||||||
|
# RHEL/Fedora
|
||||||
|
dnf install smartmontools sg3_utils
|
||||||
|
```
|
||||||
|
|
||||||
|
## Install
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
## Run
|
||||||
|
|
||||||
|
The API needs root access for `smartctl` to query drives:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo uvicorn main:app --host 0.0.0.0 --port 8000
|
||||||
|
```
|
||||||
|
|
||||||
|
## API Endpoints
|
||||||
|
|
||||||
|
| Endpoint | Description |
|
||||||
|
|---|---|
|
||||||
|
| `GET /api/health` | Service health + tool availability |
|
||||||
|
| `GET /api/enclosures` | List all discovered SES enclosures |
|
||||||
|
| `GET /api/enclosures/{id}/drives` | List drive slots for an enclosure |
|
||||||
|
| `GET /api/drives/{device}` | SMART detail for a block device, given by its bare kernel name (e.g. `sda`, not `/dev/sda`) |
|
||||||
|
| `GET /api/overview` | Aggregate enclosure + drive health |
|
||||||
|
| `GET /docs` | Interactive API docs (Swagger UI) |
|
||||||
BIN
__pycache__/main.cpython-314.pyc
Normal file
BIN
__pycache__/main.cpython-314.pyc
Normal file
Binary file not shown.
52
main.py
Normal file
52
main.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
from fastapi import FastAPI
|
||||||
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
|
||||||
|
from models.schemas import HealthCheck
|
||||||
|
from routers import drives, enclosures, overview
|
||||||
|
from services.smart import sg_ses_available, smartctl_available
|
||||||
|
|
||||||
|
# Process-wide logging: INFO and above, with timestamp, level and logger name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
logger = logging.getLogger(__name__)

# The ASGI application object referenced as "main:app" by uvicorn.
app = FastAPI(
    title="JBOD Monitor",
    description="Drive health monitoring for JBOD enclosures",
    version="0.1.0",
)

# CORS is fully open: any origin, method and header is accepted.
# NOTE(review): presumably meant for a separately hosted dashboard; confirm
# this is acceptable before exposing the API beyond a trusted network.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Each router module carries its own /api/... prefix.
app.include_router(enclosures.router)
app.include_router(drives.router)
app.include_router(overview.router)

# Availability of the external CLI tools, keyed by tool name. Filled in by
# the startup hook and returned verbatim by GET /api/health.
_tool_status: dict[str, bool] = {}
|
||||||
|
|
||||||
|
|
||||||
|
@app.on_event("startup")
async def check_dependencies():
    """Probe for the external CLI tools once at startup and log what is missing.

    The probe results are stored in ``_tool_status`` so the health endpoint
    can report them. Nothing here aborts startup; every finding is only a
    warning in the log.
    """
    probes = (
        (
            "smartctl",
            smartctl_available,
            "smartctl not found — install smartmontools for SMART data",
        ),
        (
            "sg_ses",
            sg_ses_available,
            "sg_ses not found — install sg3-utils for enclosure SES data",
        ),
    )

    # Record every probe result first, then report the missing tools.
    for tool, probe, _hint in probes:
        _tool_status[tool] = probe()

    for tool, _probe, hint in probes:
        if not _tool_status[tool]:
            logger.warning(hint)

    running_as_root = os.geteuid() == 0
    if not running_as_root:
        logger.warning("Not running as root — smartctl may fail on some devices")
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/health", response_model=HealthCheck, tags=["health"])
async def health():
    """Report service liveness together with the tool probe results from startup."""
    report = HealthCheck(status="ok", tools=_tool_status)
    return report
|
||||||
0
models/__init__.py
Normal file
0
models/__init__.py
Normal file
BIN
models/__pycache__/__init__.cpython-314.pyc
Normal file
BIN
models/__pycache__/__init__.cpython-314.pyc
Normal file
Binary file not shown.
BIN
models/__pycache__/schemas.cpython-314.pyc
Normal file
BIN
models/__pycache__/schemas.cpython-314.pyc
Normal file
Binary file not shown.
76
models/schemas.py
Normal file
76
models/schemas.py
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
# One SES enclosure as returned by the enclosure listing endpoint.
class Enclosure(BaseModel):
    # Name of the enclosure's directory under /sys/class/enclosure.
    id: str
    # "/dev/<sg name>" of the enclosure's SCSI generic node, or None when no
    # scsi_generic entry is present in sysfs.
    sg_device: str | None = None
    # Contents of the sysfs device/vendor, device/model and device/rev
    # attributes; an empty string when the attribute could not be read.
    vendor: str
    model: str
    revision: str
    # Number of slot directories recognised for this enclosure.
    total_slots: int
    # How many of those slots were detected as populated.
    populated_slots: int
|
||||||
|
|
||||||
|
|
||||||
|
# A single drive slot of an enclosure, without any SMART information.
class SlotInfo(BaseModel):
    # Slot number parsed from the sysfs slot directory name.
    slot: int
    # True when a block device is linked to the slot, or the slot's status
    # attribute is non-empty and not "not installed".
    populated: bool
    # Kernel block device name (the entry under the slot's device/block
    # directory), or None when no block device is linked.
    device: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
# Full SMART detail for one block device, built from `smartctl -a -j` output.
class DriveDetail(BaseModel):
    # Device name as requested (without the /dev/ prefix).
    device: str
    # Identity fields copied from the smartctl JSON.
    model: str | None = None
    serial: str | None = None
    # Hex string assembled from the smartctl wwn naa/oui/id components.
    wwn: str | None = None
    firmware: str | None = None
    capacity_bytes: int | None = None
    # smart_status.passed from smartctl; None when smartctl did not report it.
    smart_healthy: bool | None = None
    smart_supported: bool = True
    temperature_c: int | None = None
    power_on_hours: int | None = None
    # Raw values of ATA attributes 5, 197 and 198 respectively.
    reallocated_sectors: int | None = None
    pending_sectors: int | None = None
    uncorrectable_errors: int | None = None
    # Normalized value of ATA attribute 177, or 100 minus the NVMe
    # percentage_used figure when only NVMe health data is available.
    wear_leveling_percent: int | None = None
    # ATA attribute table as emitted by smartctl, or a one-element list
    # wrapping the NVMe health log / SCSI error counter log.
    smart_attributes: list[dict] = []
|
||||||
|
|
||||||
|
|
||||||
|
# Condensed per-drive SMART fields embedded in each slot of the overview.
class DriveHealthSummary(BaseModel):
    device: str
    model: str | None = None
    serial: str | None = None
    # None means smartctl did not report a pass/fail verdict.
    smart_healthy: bool | None = None
    # False when the SMART query for this drive failed.
    smart_supported: bool = True
    temperature_c: int | None = None
    power_on_hours: int | None = None
|
||||||
|
|
||||||
|
|
||||||
|
# A drive slot as shown in the overview: the SlotInfo fields plus an optional
# health summary of the drive sitting in it.
class SlotWithDrive(BaseModel):
    slot: int
    populated: bool
    device: str | None = None
    # None when the slot has no block device for which SMART data was queried.
    drive: DriveHealthSummary | None = None
|
||||||
|
|
||||||
|
|
||||||
|
# An enclosure as shown in the overview: the same fields as Enclosure plus the
# per-slot drive summaries.
class EnclosureWithDrives(BaseModel):
    id: str
    sg_device: str | None = None
    vendor: str
    model: str
    revision: str
    total_slots: int
    populated_slots: int
    # One entry per recognised slot, populated or not.
    slots: list[SlotWithDrive]
|
||||||
|
|
||||||
|
|
||||||
|
# Aggregate response of GET /api/overview.
class Overview(BaseModel):
    # True when no drive reported a failed SMART status.
    healthy: bool
    # Number of populated slots across all enclosures.
    drive_count: int
    # Count of warning conditions (missing SMART verdict, reallocated /
    # pending / uncorrectable sectors); one drive can contribute several.
    warning_count: int
    # Number of drives whose SMART status is "failed".
    error_count: int
    enclosures: list[EnclosureWithDrives]
|
||||||
|
|
||||||
|
|
||||||
|
# Response of GET /api/health.
class HealthCheck(BaseModel):
    # Service status string; the health endpoint always sends "ok".
    status: str
    # Tool name ("smartctl", "sg_ses") mapped to whether it was found on PATH.
    tools: dict[str, bool]
|
||||||
3
requirements.txt
Normal file
3
requirements.txt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
fastapi>=0.115.0
|
||||||
|
uvicorn>=0.34.0
|
||||||
|
pydantic>=2.10.0
|
||||||
0
routers/__init__.py
Normal file
0
routers/__init__.py
Normal file
BIN
routers/__pycache__/__init__.cpython-314.pyc
Normal file
BIN
routers/__pycache__/__init__.cpython-314.pyc
Normal file
Binary file not shown.
BIN
routers/__pycache__/drives.cpython-314.pyc
Normal file
BIN
routers/__pycache__/drives.cpython-314.pyc
Normal file
Binary file not shown.
BIN
routers/__pycache__/enclosures.cpython-314.pyc
Normal file
BIN
routers/__pycache__/enclosures.cpython-314.pyc
Normal file
Binary file not shown.
BIN
routers/__pycache__/overview.cpython-314.pyc
Normal file
BIN
routers/__pycache__/overview.cpython-314.pyc
Normal file
Binary file not shown.
20
routers/drives.py
Normal file
20
routers/drives.py
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
|
||||||
|
from models.schemas import DriveDetail
|
||||||
|
from services.smart import get_smart_data
|
||||||
|
|
||||||
|
# All routes in this module are mounted under /api/drives.
router = APIRouter(prefix="/api/drives", tags=["drives"])
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/{device}", response_model=DriveDetail)
async def get_drive_detail(device: str):
    """Return the SMART detail of one block device.

    Responds with 400 when the device name is rejected by validation and
    with 502 when the SMART query itself reported an error.
    """
    try:
        smart = await get_smart_data(device)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc))
    else:
        if "error" not in smart:
            return DriveDetail(**smart)

    # Only reached when the query result carries an "error" entry.
    raise HTTPException(status_code=502, detail=smart["error"])
|
||||||
24
routers/enclosures.py
Normal file
24
routers/enclosures.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
|
||||||
|
from models.schemas import Enclosure, SlotInfo
|
||||||
|
from services.enclosure import discover_enclosures, list_slots
|
||||||
|
|
||||||
|
# All routes in this module are mounted under /api/enclosures.
router = APIRouter(prefix="/api/enclosures", tags=["enclosures"])
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("", response_model=list[Enclosure])
async def get_enclosures():
    """Return every SES enclosure found by the sysfs scan."""
    found = discover_enclosures()
    return found
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/{enclosure_id}/drives", response_model=list[SlotInfo])
async def get_enclosure_drives(enclosure_id: str):
    """Return the drive slots of one enclosure.

    An enclosure that exists but exposes no slots yields an empty list;
    an unknown enclosure id yields a 404.
    """
    slots = list_slots(enclosure_id)
    if slots:
        return slots

    # No slots: tell "enclosure without slots" apart from "no such enclosure".
    known_ids = {enc["id"] for enc in discover_enclosures()}
    if enclosure_id not in known_ids:
        raise HTTPException(status_code=404, detail=f"Enclosure '{enclosure_id}' not found")
    return slots
|
||||||
105
routers/overview.py
Normal file
105
routers/overview.py
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from fastapi import APIRouter
|
||||||
|
|
||||||
|
from models.schemas import (
|
||||||
|
DriveHealthSummary,
|
||||||
|
EnclosureWithDrives,
|
||||||
|
Overview,
|
||||||
|
SlotWithDrive,
|
||||||
|
)
|
||||||
|
from services.enclosure import discover_enclosures, list_slots
|
||||||
|
from services.smart import get_smart_data
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)

# The single route in this module is mounted at /api/overview.
router = APIRouter(prefix="/api/overview", tags=["overview"])
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("", response_model=Overview)
async def get_overview():
    """Aggregate view of all enclosures, slots, and drive health.

    For every discovered enclosure the SMART data of all populated slots is
    queried concurrently. A drive whose query raised, or came back as an
    error result, is still listed (with ``smart_supported`` false) instead of
    failing the whole request.

    Counting rules:
      * ``drive_count`` - every populated slot.
      * ``error_count`` - drives whose SMART status is explicitly failed.
      * ``warning_count`` - one per drive with SMART support but no verdict,
        plus one for each of reallocated / pending / uncorrectable counters
        that is above zero.
    """
    enc_results: list[EnclosureWithDrives] = []
    total_drives = 0
    warnings = 0
    errors = 0

    for enc in discover_enclosures():
        slots_raw = list_slots(enc["id"])

        # Gather SMART data for all populated slots concurrently.
        devices = [s["device"] for s in slots_raw if s["populated"] and s["device"]]
        smart_results = await asyncio.gather(
            *(get_smart_data(dev) for dev in devices),
            return_exceptions=True,
        )

        smart_map: dict[str, dict] = {}
        for dev, result in zip(devices, smart_results):
            # gather() may hand back any BaseException, not only Exception.
            if isinstance(result, BaseException):
                logger.warning("SMART query failed for %s: %s", dev, result)
                smart_map[dev] = {"device": dev, "smart_supported": False}
                continue
            if "error" in result:
                logger.warning("SMART query failed for %s: %s", dev, result["error"])
            smart_map[dev] = result

        slots_out: list[SlotWithDrive] = []
        for s in slots_raw:
            drive_summary = None
            sd = smart_map.get(s["device"]) if s["device"] else None

            if sd is not None:
                total_drives += 1

                healthy = sd.get("smart_healthy")
                if healthy is False:
                    errors += 1
                elif healthy is None and sd.get("smart_supported", True):
                    warnings += 1

                # Each non-zero sector/error counter is its own warning.
                for counter in ("reallocated_sectors", "pending_sectors", "uncorrectable_errors"):
                    if (sd.get(counter) or 0) > 0:
                        warnings += 1

                drive_summary = DriveHealthSummary(
                    # Error results carry no "device" key; fall back to the
                    # slot's device name instead of raising KeyError.
                    device=sd.get("device") or s["device"],
                    model=sd.get("model"),
                    serial=sd.get("serial"),
                    smart_healthy=healthy,
                    smart_supported=sd.get("smart_supported", True),
                    temperature_c=sd.get("temperature_c"),
                    power_on_hours=sd.get("power_on_hours"),
                )
            elif s["populated"]:
                # Populated according to the enclosure, but no block device
                # to query: count it, without a summary.
                total_drives += 1

            slots_out.append(SlotWithDrive(
                slot=s["slot"],
                populated=s["populated"],
                device=s["device"],
                drive=drive_summary,
            ))

        enc_results.append(EnclosureWithDrives(
            id=enc["id"],
            sg_device=enc.get("sg_device"),
            vendor=enc["vendor"],
            model=enc["model"],
            revision=enc["revision"],
            total_slots=enc["total_slots"],
            populated_slots=enc["populated_slots"],
            slots=slots_out,
        ))

    return Overview(
        healthy=errors == 0,
        drive_count=total_drives,
        warning_count=warnings,
        error_count=errors,
        enclosures=enc_results,
    )
|
||||||
0
services/__init__.py
Normal file
0
services/__init__.py
Normal file
BIN
services/__pycache__/__init__.cpython-314.pyc
Normal file
BIN
services/__pycache__/__init__.cpython-314.pyc
Normal file
Binary file not shown.
BIN
services/__pycache__/cache.cpython-314.pyc
Normal file
BIN
services/__pycache__/cache.cpython-314.pyc
Normal file
Binary file not shown.
BIN
services/__pycache__/enclosure.cpython-314.pyc
Normal file
BIN
services/__pycache__/enclosure.cpython-314.pyc
Normal file
Binary file not shown.
BIN
services/__pycache__/smart.cpython-314.pyc
Normal file
BIN
services/__pycache__/smart.cpython-314.pyc
Normal file
Binary file not shown.
29
services/cache.py
Normal file
29
services/cache.py
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
import time
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
class TTLCache:
|
||||||
|
"""Simple in-memory TTL cache."""
|
||||||
|
|
||||||
|
def __init__(self, ttl_seconds: int = 60):
|
||||||
|
self._ttl = ttl_seconds
|
||||||
|
self._store: dict[str, tuple[float, Any]] = {}
|
||||||
|
|
||||||
|
def get(self, key: str) -> Any | None:
|
||||||
|
entry = self._store.get(key)
|
||||||
|
if entry is None:
|
||||||
|
return None
|
||||||
|
ts, value = entry
|
||||||
|
if time.monotonic() - ts > self._ttl:
|
||||||
|
del self._store[key]
|
||||||
|
return None
|
||||||
|
return value
|
||||||
|
|
||||||
|
def set(self, key: str, value: Any) -> None:
|
||||||
|
self._store[key] = (time.monotonic(), value)
|
||||||
|
|
||||||
|
def clear(self) -> None:
|
||||||
|
self._store.clear()
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level cache instance shared by importers; entries live for 60 seconds.
smart_cache = TTLCache(ttl_seconds=60)
|
||||||
118
services/enclosure.py
Normal file
118
services/enclosure.py
Normal file
@@ -0,0 +1,118 @@
|
|||||||
|
import os
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)

# sysfs directory that holds one sub-directory per enclosure.
ENCLOSURE_BASE = Path("/sys/class/enclosure")
|
||||||
|
|
||||||
|
|
||||||
|
def _read_sysfs(path: Path) -> str:
    """Read a sysfs attribute file and return its stripped content.

    Returns an empty string when the file is missing, unreadable or is a
    directory. Bytes that are not valid UTF-8 are replaced rather than
    raising, so a garbled attribute can never abort enclosure discovery.
    """
    try:
        return path.read_text(encoding="utf-8", errors="replace").strip()
    except OSError:
        return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _find_sg_device(enclosure_path: Path) -> str | None:
|
||||||
|
"""Resolve the sg device for an enclosure from its sysfs path."""
|
||||||
|
# The enclosure sysfs directory has a 'device' symlink. Under that,
|
||||||
|
# there's a scsi_generic directory containing the sg device name.
|
||||||
|
sg_dir = enclosure_path / "device" / "scsi_generic"
|
||||||
|
if sg_dir.is_dir():
|
||||||
|
entries = list(sg_dir.iterdir())
|
||||||
|
if entries:
|
||||||
|
return f"/dev/{entries[0].name}"
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def discover_enclosures() -> list[dict]:
    """Scan /sys/class/enclosure/ and describe every enclosure found.

    Returns one dict per enclosure directory (in sorted order) with its id,
    sg device, vendor/model/revision strings and slot counts. Returns an
    empty list, after logging a warning, when the sysfs directory is absent.
    """
    if not ENCLOSURE_BASE.is_dir():
        logger.warning("No enclosure sysfs directory found at %s", ENCLOSURE_BASE)
        return []

    found: list[dict] = []
    for enc_path in sorted(ENCLOSURE_BASE.iterdir()):
        if enc_path.is_dir():
            scsi_dir = enc_path / "device"
            vendor, model, revision = (
                _read_sysfs(scsi_dir / attribute)
                for attribute in ("vendor", "model", "rev")
            )
            sg_device = _find_sg_device(enc_path)

            slot_list = list_slots(enc_path.name)
            occupied = [slot for slot in slot_list if slot["populated"]]

            found.append({
                "id": enc_path.name,
                "sg_device": sg_device,
                "vendor": vendor,
                "model": model,
                "revision": revision,
                "total_slots": len(slot_list),
                "populated_slots": len(occupied),
            })

    return found
|
||||||
|
|
||||||
|
|
||||||
|
def list_slots(enclosure_id: str) -> list[dict]:
    """Enumerate drive slots for an enclosure via sysfs.

    Args:
        enclosure_id: Name of the enclosure directory under ENCLOSURE_BASE.
            It may originate from a request URL, so anything that is not a
            plain directory name (empty, ".", "..", or containing "/") is
            treated as an unknown enclosure.

    Returns:
        One dict per recognised slot directory with the keys ``slot``
        (number), ``populated`` (bool) and ``device`` (block device name or
        None), ordered by slot number. An empty list when the enclosure
        does not exist.
    """
    # Never let a caller-supplied id resolve outside the enclosure base.
    if not enclosure_id or enclosure_id in {".", ".."} or "/" in enclosure_id:
        return []

    enc_dir = ENCLOSURE_BASE / enclosure_id
    if not enc_dir.is_dir():
        return []

    slots = []
    for entry in sorted(enc_dir.iterdir()):
        # Slot entries are directories like "Slot 00", "Slot 01", etc.
        # Some enclosures use "Disk" or "ArrayDevice" prefixes.
        if not entry.is_dir():
            continue
        slot_num = _parse_slot_number(entry.name)
        if slot_num is None:
            continue

        device = None
        populated = False

        # Check if a block device is linked in this slot.
        block_dir = entry / "device" / "block"
        if block_dir.is_dir():
            # Sorted so the chosen name is deterministic should the
            # directory ever hold more than one entry.
            devs = sorted(block_dir.iterdir())
            if devs:
                device = devs[0].name
                populated = True
        else:
            # Also check the 'status' file: "not installed" means empty.
            status = _read_sysfs(entry / "status")
            if status and status != "not installed":
                populated = True

        slots.append({
            "slot": slot_num,
            "populated": populated,
            "device": device,
        })

    # Directory names sort lexicographically ("Slot 10" < "Slot 2"), so order
    # by the parsed number; the sort is stable for duplicate numbers.
    slots.sort(key=lambda slot: slot["slot"])
    return slots
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_slot_number(name: str) -> int | None:
|
||||||
|
"""Extract the slot number from a sysfs slot directory name."""
|
||||||
|
# Handles "Slot 00", "Slot00", "Disk 1", "ArrayDevice00", etc.
|
||||||
|
for prefix in ("Slot ", "Slot", "Disk ", "Disk", "ArrayDevice", "SLOT "):
|
||||||
|
if name.startswith(prefix):
|
||||||
|
num_str = name[len(prefix):].strip()
|
||||||
|
try:
|
||||||
|
return int(num_str)
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
return None
|
||||||
159
services/smart.py
Normal file
159
services/smart.py
Normal file
@@ -0,0 +1,159 @@
|
|||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
from services.cache import smart_cache
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)

# SMART attribute IDs of interest (ATA attribute table "id" values).
ATTR_REALLOCATED = 5  # feeds reallocated_sectors (raw value)
ATTR_POWER_ON_HOURS = 9  # fallback source for power_on_hours (raw value)
ATTR_TEMPERATURE = 194  # first fallback source for temperature_c
ATTR_TEMPERATURE_ALT = 190  # second fallback source for temperature_c
ATTR_PENDING = 197  # feeds pending_sectors (raw value)
ATTR_UNCORRECTABLE = 198  # feeds uncorrectable_errors (raw value)
ATTR_WEAR_LEVELING = 177  # SSD wear leveling (normalized value is used)
|
||||||
|
|
||||||
|
|
||||||
|
def smartctl_available() -> bool:
    """Return True when a ``smartctl`` executable can be found on PATH."""
    located = shutil.which("smartctl")
    return located is not None
|
||||||
|
|
||||||
|
|
||||||
|
def sg_ses_available() -> bool:
    """Return True when an ``sg_ses`` executable can be found on PATH."""
    located = shutil.which("sg_ses")
    return located is not None
|
||||||
|
|
||||||
|
|
||||||
|
async def get_smart_data(device: str) -> dict:
    """Return SMART data for a device, served from the cache when fresh.

    Args:
        device: Bare device name without the /dev/ prefix (e.g. "sda").

    Returns:
        The parsed SMART dict, or an error dict containing an "error" key.
        Either kind of result is cached under the device name.

    Raises:
        ValueError: If ``device`` contains anything other than ASCII
            letters, digits and hyphens.
    """
    # Validate the whole string: re.match() with "$" would also accept a
    # trailing newline ("sda\n"), which must not reach the command line.
    if not re.fullmatch(r"[a-zA-Z0-9\-]+", device):
        raise ValueError(f"Invalid device name: {device}")

    cached = smart_cache.get(device)
    if cached is not None:
        return cached

    result = await _run_smartctl(device)
    smart_cache.set(device, result)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
async def _run_smartctl(device: str, timeout: float = 30.0) -> dict:
    """Execute ``smartctl -a -j /dev/<device>`` and parse its JSON output.

    Args:
        device: Bare device name; it is prefixed with ``/dev/``.
        timeout: Seconds to wait for smartctl before giving up, so that one
            unresponsive drive cannot stall the caller indefinitely.

    Returns:
        The dict produced by ``_parse_smart_json`` on success. On any failure
        a dict with the keys ``device``, ``error`` (human-readable reason)
        and ``smart_supported`` (False). This function does not raise for
        smartctl failures.
    """
    dev_path = f"/dev/{device}"

    def failure(message: str) -> dict:
        # Error results carry the device name too, so consumers can treat
        # them like any other per-drive record.
        return {"device": device, "error": message, "smart_supported": False}

    try:
        proc = await asyncio.create_subprocess_exec(
            "smartctl", "-a", "-j", dev_path,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except FileNotFoundError:
        return failure("smartctl not found")
    except OSError as exc:
        # e.g. the binary exists but is not executable by this user.
        return failure(f"Failed to start smartctl: {exc}")

    try:
        stdout, _stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
    except asyncio.TimeoutError:
        try:
            proc.kill()
        except ProcessLookupError:
            # The process exited between the timeout and the kill.
            pass
        # Deliberately not awaiting proc.wait(): a process blocked on
        # device I/O may not exit promptly even after SIGKILL.
        return failure(f"smartctl timed out after {timeout:g}s")

    # smartctl returns non-zero for many non-fatal reasons (bit flags).
    # The JSON output is still valid even with non-zero exit codes.
    if not stdout:
        return failure(f"smartctl produced no output (exit code {proc.returncode})")

    try:
        data = json.loads(stdout)
    except ValueError:
        # Covers json.JSONDecodeError and undecodable bytes alike.
        return failure("Failed to parse smartctl JSON output")
    if not isinstance(data, dict):
        return failure("Failed to parse smartctl JSON output")

    # Exit-status bit 0 (command line did not parse) and bit 1 (device open
    # failed) mean no device data was obtained at all; report that instead
    # of returning a record full of nulls.
    if (proc.returncode or 0) & 0b11:
        info = data.get("smartctl")
        messages = info.get("messages") if isinstance(info, dict) else None
        detail = None
        if isinstance(messages, list):
            detail = next(
                (m["string"] for m in messages if isinstance(m, dict) and m.get("string")),
                None,
            )
        return failure(detail or f"smartctl failed (exit code {proc.returncode})")

    return _parse_smart_json(device, data)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_smart_json(device: str, data: dict) -> dict:
    """Extract relevant fields from smartctl JSON output.

    Args:
        device: Device name to record in the result.
        data: Decoded JSON object produced by ``smartctl -j``.

    Returns:
        A dict with ``device``, ``smart_supported`` (always True here),
        identity fields, ``smart_healthy``, temperature, power-on hours, the
        sector/error counters, ``wear_leveling_percent`` and
        ``smart_attributes``. ``wwn`` is only present when smartctl reported
        one. Values smartctl did not provide are None.
    """
    result: dict = {"device": device, "smart_supported": True}

    # Identity. `or {}` also tolerates an explicit JSON null.
    result["model"] = data.get("model_name")
    result["serial"] = data.get("serial_number")
    result["firmware"] = data.get("firmware_version")
    result["capacity_bytes"] = (data.get("user_capacity") or {}).get("bytes")

    # WWN
    wwn_data = data.get("wwn")
    if wwn_data:
        # Reconstruct WWN string from components
        naa = wwn_data.get("naa", 0)
        oui = wwn_data.get("oui", 0)
        wwn_id = wwn_data.get("id", 0)
        result["wwn"] = f"{naa:x}{oui:06x}{wwn_id:09x}"

    # SMART health
    result["smart_healthy"] = (data.get("smart_status") or {}).get("passed")

    # Temperature
    result["temperature_c"] = (data.get("temperature") or {}).get("current")

    # Power-on hours
    result["power_on_hours"] = (data.get("power_on_time") or {}).get("hours")

    # SMART attributes (ATA)
    attrs = (data.get("ata_smart_attributes") or {}).get("table") or []
    result["smart_attributes"] = attrs

    result["reallocated_sectors"] = _get_attr_raw(attrs, ATTR_REALLOCATED)
    result["pending_sectors"] = _get_attr_raw(attrs, ATTR_PENDING)
    result["uncorrectable_errors"] = _get_attr_raw(attrs, ATTR_UNCORRECTABLE)
    result["wear_leveling_percent"] = _get_attr_value(attrs, ATTR_WEAR_LEVELING)

    # Power-on hours fallback from attributes
    if result["power_on_hours"] is None:
        result["power_on_hours"] = _get_attr_raw(attrs, ATTR_POWER_ON_HOURS)

    # Temperature fallback from attributes.
    # NOTE(review): the raw value is used as-is; some drives may pack extra
    # data (e.g. min/max) into this raw field - confirm against real output.
    if result["temperature_c"] is None:
        result["temperature_c"] = _get_attr_raw(attrs, ATTR_TEMPERATURE)
    if result["temperature_c"] is None:
        result["temperature_c"] = _get_attr_raw(attrs, ATTR_TEMPERATURE_ALT)

    # NVMe attributes (different structure)
    nvme_health = data.get("nvme_smart_health_information_log")
    if nvme_health:
        result["smart_attributes"] = [{"nvme_health": nvme_health}]
        if result["temperature_c"] is None:
            result["temperature_c"] = nvme_health.get("temperature")
        if result["power_on_hours"] is None:
            result["power_on_hours"] = nvme_health.get("power_on_hours")
        if result["wear_leveling_percent"] is None:
            pct_used = nvme_health.get("percentage_used")
            if pct_used is not None:
                # percentage_used may exceed 100 on a drive past its rated
                # endurance; never report a negative remaining percentage.
                result["wear_leveling_percent"] = max(0, 100 - pct_used)

    # SAS/SCSI drives use different error counters
    scsi_errors = data.get("scsi_error_counter_log")
    if scsi_errors and not attrs:
        result["smart_attributes"] = [{"scsi_error_log": scsi_errors}]

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def _get_attr_raw(attrs: list[dict], attr_id: int) -> int | None:
|
||||||
|
"""Get the raw_value for a SMART attribute by ID."""
|
||||||
|
for attr in attrs:
|
||||||
|
if attr.get("id") == attr_id:
|
||||||
|
raw = attr.get("raw", {})
|
||||||
|
return raw.get("value")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _get_attr_value(attrs: list[dict], attr_id: int) -> int | None:
|
||||||
|
"""Get the normalized value for a SMART attribute by ID."""
|
||||||
|
for attr in attrs:
|
||||||
|
if attr.get("id") == attr_id:
|
||||||
|
return attr.get("value")
|
||||||
|
return None
|
||||||
Reference in New Issue
Block a user