Parse sg_ses --page=0x02 output to surface enclosure-level health data including power supply status, fan RPMs, temperature sensors, and voltage rails. Failed/critical components are reflected in the overview totals and shown as status pills in the enclosure card header with an expandable detail panel.
181 lines
6.6 KiB
Python
import asyncio
|
|
import logging
|
|
|
|
from fastapi import APIRouter
|
|
|
|
from models.schemas import (
|
|
DriveHealthSummary,
|
|
EnclosureHealth,
|
|
EnclosureWithDrives,
|
|
HostDrive,
|
|
Overview,
|
|
SlotWithDrive,
|
|
)
|
|
from services.enclosure import discover_enclosures, get_enclosure_status, list_slots
|
|
from services.host import get_host_drives
|
|
from services.smart import get_smart_data
|
|
from services.zfs import get_zfs_pool_map
|
|
|
|
# Module-level logger named after this module, per the standard logging convention.
logger = logging.getLogger(__name__)

# All endpoints in this module are mounted under /api/overview.
router = APIRouter(prefix="/api/overview", tags=["overview"])
|
|
|
|
|
|
def _classify_drive(sd: dict) -> tuple[str, int, int]:
    """Classify one drive's SMART result dict.

    Returns ``(health_status, warning_delta, error_delta)`` where
    ``health_status`` is one of ``"error"``/``"warning"``/``"healthy"`` for the
    frontend, and the deltas are how much this drive contributes to the
    overview's warning/error counters.

    Counting rules (kept identical to the historical inline logic):
      - SMART reports failed  -> 1 error, status "error".
      - SMART status unknown on a drive that claims SMART support -> 1 warning.
      - Each non-zero sector/error attribute (reallocated, pending,
        uncorrectable) adds its own warning, even on an already-failed drive.
    """
    healthy = sd.get("smart_healthy")
    realloc = sd.get("reallocated_sectors") or 0
    pending = sd.get("pending_sectors") or 0
    unc = sd.get("uncorrectable_errors") or 0
    # "unknown" only applies when the drive claims SMART support but gave no verdict.
    unknown = healthy is None and sd.get("smart_supported", True)

    error_delta = 1 if healthy is False else 0
    warning_delta = (1 if unknown else 0) + sum(
        1 for value in (realloc, pending, unc) if value > 0
    )

    if healthy is False:
        status = "error"
    elif realloc > 0 or pending > 0 or unc > 0 or unknown:
        status = "warning"
    else:
        status = "healthy"
    return status, warning_delta, error_delta


def _build_drive_summary(sd: dict, pool_map: dict, health_status: str) -> DriveHealthSummary:
    """Map a raw SMART result dict (plus ZFS pool membership) onto the schema."""
    zfs = pool_map.get(sd["device"], {})
    return DriveHealthSummary(
        device=sd["device"],
        model=sd.get("model"),
        serial=sd.get("serial"),
        wwn=sd.get("wwn"),
        firmware=sd.get("firmware"),
        capacity_bytes=sd.get("capacity_bytes"),
        smart_healthy=sd.get("smart_healthy"),
        smart_supported=sd.get("smart_supported", True),
        temperature_c=sd.get("temperature_c"),
        power_on_hours=sd.get("power_on_hours"),
        reallocated_sectors=sd.get("reallocated_sectors"),
        pending_sectors=sd.get("pending_sectors"),
        uncorrectable_errors=sd.get("uncorrectable_errors"),
        zfs_pool=zfs.get("pool"),
        zfs_vdev=zfs.get("vdev"),
        zfs_state=zfs.get("state"),
        health_status=health_status,
    )


async def _collect_smart(slots_raw: list[dict]) -> dict[str, dict]:
    """Query SMART data concurrently for every populated slot with a device.

    A failed query is logged and replaced by a minimal "unsupported" stub so a
    single flaky drive cannot break the whole overview response.
    """
    devices = [s["device"] for s in slots_raw if s["populated"] and s["device"]]
    results = await asyncio.gather(
        *[get_smart_data(dev) for dev in devices],
        return_exceptions=True,
    )

    smart_map: dict[str, dict] = {}
    for dev, result in zip(devices, results):
        # BaseException (not just Exception): gather(return_exceptions=True)
        # can hand back e.g. CancelledError, which must not be stored as data.
        if isinstance(result, BaseException):
            logger.warning("SMART query failed for %s: %s", dev, result)
            smart_map[dev] = {"device": dev, "smart_supported": False}
        else:
            smart_map[dev] = result
    return smart_map


@router.get("", response_model=Overview)
async def get_overview():
    """Aggregate view of all enclosures, slots, and drive health.

    Combines SES enclosure status, per-drive SMART data, ZFS pool membership,
    and host (non-enclosure) drives into a single ``Overview`` response with
    rolled-up warning/error totals.
    """
    enclosures_raw = discover_enclosures()
    pool_map = await get_zfs_pool_map()

    # Fetch SES health data for all enclosures concurrently; per-enclosure
    # exceptions are captured so one bad expander does not fail the endpoint.
    async def _get_health(enc):
        if enc.get("sg_device"):
            return await get_enclosure_status(enc["sg_device"])
        return None

    health_results = await asyncio.gather(
        *[_get_health(enc) for enc in enclosures_raw],
        return_exceptions=True,
    )

    enc_results: list[EnclosureWithDrives] = []
    total_drives = 0
    warnings = 0
    errors = 0
    all_healthy = True

    for enc_idx, enc in enumerate(enclosures_raw):
        slots_raw = list_slots(enc["id"])
        smart_map = await _collect_smart(slots_raw)

        slots_out: list[SlotWithDrive] = []
        for s in slots_raw:
            drive_summary = None
            if s["device"] and s["device"] in smart_map:
                sd = smart_map[s["device"]]
                total_drives += 1

                health_status, warn_delta, err_delta = _classify_drive(sd)
                warnings += warn_delta
                errors += err_delta
                if err_delta:
                    all_healthy = False

                drive_summary = _build_drive_summary(sd, pool_map, health_status)
            elif s["populated"]:
                # A populated slot with no resolvable block device still
                # counts toward the drive total.
                total_drives += 1

            slots_out.append(SlotWithDrive(
                slot=s["slot"],
                populated=s["populated"],
                device=s["device"],
                drive=drive_summary,
            ))

        # Attach enclosure-level health from SES and fold it into the totals.
        health_data = health_results[enc_idx]
        enc_health = None
        if isinstance(health_data, dict):
            enc_health = EnclosureHealth(**health_data)
            if enc_health.overall_status == "CRITICAL":
                errors += 1
                all_healthy = False
            elif enc_health.overall_status == "WARNING":
                warnings += 1
        elif isinstance(health_data, BaseException):
            logger.warning("SES health failed for %s: %s", enc["id"], health_data)

        enc_results.append(EnclosureWithDrives(
            id=enc["id"],
            sg_device=enc.get("sg_device"),
            vendor=enc["vendor"],
            model=enc["model"],
            revision=enc["revision"],
            total_slots=enc["total_slots"],
            populated_slots=enc["populated_slots"],
            slots=slots_out,
            health=enc_health,
        ))

    # Host drives (outside any enclosure), including physical drives hidden
    # behind hardware RAID controllers.
    host_drives_raw = await get_host_drives()
    host_drives_out: list[HostDrive] = []
    for hd in host_drives_raw:
        total_drives += 1
        hs = hd.get("health_status", "healthy")
        if hs == "error":
            errors += 1
            all_healthy = False
        elif hs == "warning":
            warnings += 1

        # Count physical drives behind RAID controllers individually.
        for pd in hd.get("physical_drives", []):
            total_drives += 1
            pd_hs = pd.get("health_status", "healthy")
            if pd_hs == "error":
                errors += 1
                all_healthy = False
            elif pd_hs == "warning":
                warnings += 1

        host_drives_out.append(HostDrive(**hd))

    return Overview(
        healthy=all_healthy and errors == 0,
        drive_count=total_drives,
        warning_count=warnings,
        error_count=errors,
        enclosures=enc_results,
        host_drives=host_drives_out,
    )
|