Add enclosure health details (PSUs, fans, temps, voltages) via SES

Parse the output of `sg_ses --page=0x02` (the SES Enclosure Status page) to
surface enclosure-level health data, including power supply status, fan RPMs,
temperature sensors, and voltage rails. Failed or critical components are
reflected in the overview totals and shown as status pills in the enclosure
card header, with an expandable detail panel.
This commit is contained in:
2026-03-07 06:03:26 +00:00
parent 8ea8fdef08
commit 0112875894
4 changed files with 379 additions and 4 deletions

View File

@@ -5,12 +5,13 @@ from fastapi import APIRouter
from models.schemas import (
DriveHealthSummary,
EnclosureHealth,
EnclosureWithDrives,
HostDrive,
Overview,
SlotWithDrive,
)
from services.enclosure import discover_enclosures, list_slots
from services.enclosure import discover_enclosures, get_enclosure_status, list_slots
from services.host import get_host_drives
from services.smart import get_smart_data
from services.zfs import get_zfs_pool_map
@@ -26,13 +27,24 @@ async def get_overview():
enclosures_raw = discover_enclosures()
pool_map = await get_zfs_pool_map()
# Fetch SES health data for all enclosures concurrently
async def _get_health(enc):
    """Return SES status for one enclosure, or None if it has no sg device.

    ``enc`` is one entry of the discovered-enclosure list; only its
    ``"sg_device"`` key is read here. A missing or falsy value means there
    is nothing to query, so the status lookup is skipped.
    """
    sg_device = enc.get("sg_device")
    if not sg_device:
        return None
    return await get_enclosure_status(sg_device)
health_results = await asyncio.gather(
*[_get_health(enc) for enc in enclosures_raw],
return_exceptions=True,
)
enc_results: list[EnclosureWithDrives] = []
total_drives = 0
warnings = 0
errors = 0
all_healthy = True
for enc in enclosures_raw:
for enc_idx, enc in enumerate(enclosures_raw):
slots_raw = list_slots(enc["id"])
# Gather SMART data for all populated slots concurrently
@@ -110,6 +122,20 @@ async def get_overview():
drive=drive_summary,
))
# Attach enclosure health from SES
health_data = health_results[enc_idx]
enc_health = None
if isinstance(health_data, dict):
enc_health = EnclosureHealth(**health_data)
# Count enclosure-level issues
if enc_health.overall_status == "CRITICAL":
errors += 1
all_healthy = False
elif enc_health.overall_status == "WARNING":
warnings += 1
elif isinstance(health_data, Exception):
logger.warning("SES health failed for %s: %s", enc["id"], health_data)
enc_results.append(EnclosureWithDrives(
id=enc["id"],
sg_device=enc.get("sg_device"),
@@ -119,6 +145,7 @@ async def get_overview():
total_slots=enc["total_slots"],
populated_slots=enc["populated_slots"],
slots=slots_out,
health=enc_health,
))
# Host drives (non-enclosure)