Files
jbod-monitor/routers/overview.py
adam 0112875894 Add enclosure health details (PSUs, fans, temps, voltages) via SES
Parse sg_ses --page=0x02 output to surface enclosure-level health data
including power supply status, fan RPMs, temperature sensors, and voltage
rails. Failed/critical components are reflected in the overview totals
and shown as status pills in the enclosure card header with an expandable
detail panel.
2026-03-07 06:03:26 +00:00

181 lines
6.6 KiB
Python

import asyncio
import logging
from fastapi import APIRouter
from models.schemas import (
DriveHealthSummary,
EnclosureHealth,
EnclosureWithDrives,
HostDrive,
Overview,
SlotWithDrive,
)
from services.enclosure import discover_enclosures, get_enclosure_status, list_slots
from services.host import get_host_drives
from services.smart import get_smart_data
from services.zfs import get_zfs_pool_map
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
# Router for the overview endpoint; every route registered on it is served
# under the /api/overview prefix and tagged "overview".
router = APIRouter(prefix="/api/overview", tags=["overview"])
# SMART counters that each raise a warning when they are above zero.
_SMART_WARNING_COUNTERS = (
    "reallocated_sectors",
    "pending_sectors",
    "uncorrectable_errors",
)


async def _gather_smart_data(slots_raw) -> dict:
    """Query SMART data concurrently for every populated slot with a device.

    Args:
        slots_raw: Slot dicts as returned by ``list_slots``; each is read for
            its ``"populated"`` and ``"device"`` keys.

    Returns:
        Mapping of device -> SMART data. A device whose query raised (or was
        cancelled) maps to ``{"device": dev, "smart_supported": False}``.
    """
    devices = [s["device"] for s in slots_raw if s["populated"] and s["device"]]
    results = await asyncio.gather(
        *(get_smart_data(dev) for dev in devices),
        return_exceptions=True,
    )

    smart_map: dict[str, dict] = {}
    for dev, result in zip(devices, results):
        # BaseException rather than Exception: with return_exceptions=True a
        # cancelled child surfaces as asyncio.CancelledError, which is not an
        # Exception subclass and would otherwise be stored as drive data.
        if isinstance(result, BaseException):
            logger.warning("SMART query failed for %s: %s", dev, result)
            smart_map[dev] = {"device": dev, "smart_supported": False}
        else:
            smart_map[dev] = result
    return smart_map


def _assess_drive(sd: dict) -> tuple[str, int, int]:
    """Classify one SMART record and report its effect on the totals.

    Args:
        sd: SMART data dict for a single drive.

    Returns:
        ``(health_status, error_delta, warning_delta)`` where
        ``health_status`` is ``"error"``, ``"warning"`` or ``"healthy"``.
    """
    healthy = sd.get("smart_healthy")
    # No verdict from a drive that is expected to support SMART is a warning.
    unknown = healthy is None and bool(sd.get("smart_supported", True))
    flagged = sum(1 for key in _SMART_WARNING_COUNTERS if (sd.get(key) or 0) > 0)

    if healthy is False:
        status = "error"
    elif flagged or unknown:
        status = "warning"
    else:
        status = "healthy"

    # Every non-zero counter contributes its own warning, including on a drive
    # that already counts as an error.
    # NOTE(review): host drives add at most one warning per drive, so
    # warning_count mixes "issues" and "drives" -- confirm this is intended.
    return status, int(healthy is False), flagged + int(unknown)


def _summarize_drive(sd: dict, zfs_info: dict, health_status: str):
    """Build the ``DriveHealthSummary`` for one drive.

    Args:
        sd: SMART data dict for the drive; must contain ``"device"``.
        zfs_info: ZFS pool-map entry for the drive (``{}`` when it has none).
        health_status: Status string computed by ``_assess_drive``.

    Returns:
        The populated ``DriveHealthSummary``.
    """
    return DriveHealthSummary(
        device=sd["device"],
        model=sd.get("model"),
        serial=sd.get("serial"),
        wwn=sd.get("wwn"),
        firmware=sd.get("firmware"),
        capacity_bytes=sd.get("capacity_bytes"),
        smart_healthy=sd.get("smart_healthy"),
        smart_supported=sd.get("smart_supported", True),
        temperature_c=sd.get("temperature_c"),
        power_on_hours=sd.get("power_on_hours"),
        reallocated_sectors=sd.get("reallocated_sectors"),
        pending_sectors=sd.get("pending_sectors"),
        uncorrectable_errors=sd.get("uncorrectable_errors"),
        zfs_pool=zfs_info.get("pool"),
        zfs_vdev=zfs_info.get("vdev"),
        zfs_state=zfs_info.get("state"),
        health_status=health_status,
    )


def _tally_host_drive(hd: dict) -> tuple[int, int, int]:
    """Count a host drive plus any physical drives listed beneath it.

    Args:
        hd: Host drive dict; ``"health_status"`` defaults to ``"healthy"`` and
            ``"physical_drives"`` (drives behind a RAID controller) to empty.

    Returns:
        ``(drive_count, error_count, warning_count)`` for this entry.
    """
    statuses = [hd.get("health_status", "healthy")]
    statuses.extend(
        pd.get("health_status", "healthy") for pd in hd.get("physical_drives", [])
    )
    return len(statuses), statuses.count("error"), statuses.count("warning")


@router.get("", response_model=Overview)
async def get_overview():
    """Aggregate view of all enclosures, slots, and drive health.

    Combines enclosure discovery, SES enclosure health, per-slot SMART data,
    the ZFS pool map and host (non-enclosure) drives into one ``Overview``.
    A failed SES or SMART query is logged and degrades to a missing ``health``
    entry or an unsupported-SMART record instead of failing the request.

    Returns:
        ``Overview`` with per-enclosure slot details, host drives, and the
        total drive, warning and error counts. ``healthy`` is true only when
        no error was counted.
    """
    enclosures_raw = discover_enclosures()
    pool_map = await get_zfs_pool_map()

    # Fetch SES health data for all enclosures concurrently; enclosures
    # without an sg device yield None.
    async def _get_health(enc):
        if enc.get("sg_device"):
            return await get_enclosure_status(enc["sg_device"])
        return None

    health_results = await asyncio.gather(
        *(_get_health(enc) for enc in enclosures_raw),
        return_exceptions=True,
    )

    enc_results: list[EnclosureWithDrives] = []
    total_drives = 0
    warnings = 0
    errors = 0

    # gather() preserves input order, so results pair up with enclosures.
    for enc, health_data in zip(enclosures_raw, health_results):
        slots_raw = list_slots(enc["id"])
        smart_map = await _gather_smart_data(slots_raw)

        slots_out: list[SlotWithDrive] = []
        for s in slots_raw:
            drive_summary = None
            device = s["device"]
            if device and device in smart_map:
                sd = smart_map[device]
                total_drives += 1
                health_status, error_delta, warning_delta = _assess_drive(sd)
                errors += error_delta
                warnings += warning_delta
                drive_summary = _summarize_drive(
                    sd,
                    pool_map.get(sd["device"], {}),
                    health_status,
                )
            elif s["populated"]:
                # Populated slot without a usable device still counts as a drive.
                total_drives += 1

            slots_out.append(SlotWithDrive(
                slot=s["slot"],
                populated=s["populated"],
                device=device,
                drive=drive_summary,
            ))

        # Attach enclosure health from SES and count enclosure-level issues.
        enc_health = None
        if isinstance(health_data, dict):
            enc_health = EnclosureHealth(**health_data)
            if enc_health.overall_status == "CRITICAL":
                errors += 1
            elif enc_health.overall_status == "WARNING":
                warnings += 1
        elif isinstance(health_data, BaseException):
            # Includes a cancelled query, which is not an Exception subclass.
            logger.warning("SES health failed for %s: %s", enc["id"], health_data)

        enc_results.append(EnclosureWithDrives(
            id=enc["id"],
            sg_device=enc.get("sg_device"),
            vendor=enc["vendor"],
            model=enc["model"],
            revision=enc["revision"],
            total_slots=enc["total_slots"],
            populated_slots=enc["populated_slots"],
            slots=slots_out,
            health=enc_health,
        ))

    # Host drives (non-enclosure), including physical drives behind RAID
    # controllers.
    host_drives_out: list[HostDrive] = []
    for hd in await get_host_drives():
        drive_count, hd_errors, hd_warnings = _tally_host_drive(hd)
        total_drives += drive_count
        errors += hd_errors
        warnings += hd_warnings
        host_drives_out.append(HostDrive(**hd))

    return Overview(
        # Every unhealthy condition above increments `errors`, so a zero
        # count is the whole health verdict.
        healthy=errors == 0,
        drive_count=total_drives,
        warning_count=warnings,
        error_count=errors,
        enclosures=enc_results,
        host_drives=host_drives_out,
    )