diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx index c9b18ba..e2908b2 100644 --- a/frontend/src/App.jsx +++ b/frontend/src/App.jsx @@ -709,7 +709,181 @@ function HostDrivesCard({ drives, onSelect, t }) { ); } +function EnclosureHealthSummary({ health, t }) { + if (!health) return null; + + const statusColors = { + CRITICAL: t.health.error, + WARNING: t.health.warning, + OK: t.health.healthy, + }; + const sc = statusColors[health.overall_status] || statusColors.OK; + + const failedPsus = health.psus.filter((p) => p.fail || p.status.toLowerCase() === "critical"); + const failedFans = health.fans.filter((f) => f.fail); + const temps = health.temps.filter((s) => s.temperature_c != null); + const tempMin = temps.length > 0 ? Math.min(...temps.map((s) => s.temperature_c)) : null; + const tempMax = temps.length > 0 ? Math.max(...temps.map((s) => s.temperature_c)) : null; + + return ( +
+ {/* Overall badge */} + + + {health.overall_status} + + + {/* PSU pills */} + {health.psus.map((psu) => { + const bad = psu.fail || psu.status.toLowerCase() === "critical"; + const pc = bad ? t.health.error : t.health.healthy; + return ( + + + PSU {psu.index} {bad ? "FAIL" : "OK"} + + ); + })} + + {/* Fans summary */} + {health.fans.length > 0 && ( + 0 ? t.health.error.text : t.textSecondary, + fontWeight: 600, + }}> + {failedFans.length > 0 + ? `${failedFans.length}/${health.fans.length} fans failed` + : `${health.fans.length} fans OK`} + + )} + + {/* Temp range */} + {tempMin != null && ( + = 45 ? t.health.warning.text : t.textSecondary, + fontWeight: 600, fontFamily: "'JetBrains Mono', monospace", + }}> + {tempMin === tempMax ? `${tempMin}\u00B0C` : `${tempMin}\u2013${tempMax}\u00B0C`} + + )} +
+ ); +} + +function EnclosureHealthDetail({ health, t }) { + if (!health) return null; + + const sectionStyle = { marginBottom: 12 }; + const headerStyle = { + fontSize: 10, fontWeight: 700, color: t.textMuted, + textTransform: "uppercase", letterSpacing: 1, marginBottom: 6, + }; + const rowStyle = { + display: "flex", justifyContent: "space-between", alignItems: "center", + padding: "4px 0", borderBottom: `1px solid ${t.divider}`, fontSize: 12, + }; + + return ( +
+
+ {/* PSUs */} + {health.psus.length > 0 && ( +
+
Power Supplies
+ {health.psus.map((psu) => { + const bad = psu.fail || psu.status.toLowerCase() === "critical"; + return ( +
+ PSU {psu.index} + + {psu.status}{psu.ac_fail ? " (AC fail)" : ""}{psu.dc_fail ? " (DC fail)" : ""} + +
+ ); + })} +
+ )} + + {/* Fans */} + {health.fans.length > 0 && ( +
+
Fans
+ {health.fans.map((fan) => ( +
+ Fan {fan.index} + + {fan.rpm != null ? `${fan.rpm} RPM` : fan.status} + {fan.fail ? " FAIL" : ""} + +
+ ))} +
+ )} + + {/* Temps */} + {health.temps.length > 0 && ( +
+
Temperature Sensors
+ {health.temps.map((ts) => ( +
+ Sensor {ts.index} + = 45 ? t.health.warning.text : t.text, + fontFamily: "'JetBrains Mono', monospace", + }}> + {ts.temperature_c != null ? `${ts.temperature_c}\u00B0C` : ts.status} + +
+ ))} +
+ )} + + {/* Voltages */} + {health.voltages.length > 0 && ( +
+
Voltage Rails
+ {health.voltages.map((vs) => ( +
+ Rail {vs.index} + + {vs.voltage != null ? `${vs.voltage} V` : vs.status} + +
+ ))} +
+ )} +
+
+ ); +} + function EnclosureCard({ enclosure, view, onSelect, selectedSerial, t }) { + const [healthExpanded, setHealthExpanded] = useState(false); + return (
@@ -730,11 +904,29 @@ function EnclosureCard({ enclosure, view, onSelect, selectedSerial, t }) {
{enclosure.sg_device} · {enclosure.populated_slots}/{enclosure.total_slots} slots populated
+ {enclosure.health && ( +
+ + +
+ )}
ID {enclosure.id}
+ {healthExpanded && enclosure.health && ( + + )}
{view === "grid" ? ( diff --git a/models/schemas.py b/models/schemas.py index 6f45a68..f73fde9 100644 --- a/models/schemas.py +++ b/models/schemas.py @@ -65,6 +65,41 @@ class SlotWithDrive(BaseModel): drive: DriveHealthSummary | None = None +class PsuStatus(BaseModel): + index: int + status: str + fail: bool = False + ac_fail: bool = False + dc_fail: bool = False + + +class FanStatus(BaseModel): + index: int + status: str + rpm: int | None = None + fail: bool = False + + +class TempSensor(BaseModel): + index: int + status: str + temperature_c: float | None = None + + +class VoltageSensor(BaseModel): + index: int + status: str + voltage: float | None = None + + +class EnclosureHealth(BaseModel): + overall_status: str = "OK" + psus: list[PsuStatus] = [] + fans: list[FanStatus] = [] + temps: list[TempSensor] = [] + voltages: list[VoltageSensor] = [] + + class EnclosureWithDrives(BaseModel): id: str sg_device: str | None = None @@ -74,6 +109,7 @@ class EnclosureWithDrives(BaseModel): total_slots: int populated_slots: int slots: list[SlotWithDrive] + health: EnclosureHealth | None = None class HostDrive(BaseModel): diff --git a/routers/overview.py b/routers/overview.py index 28c884d..8240b90 100644 --- a/routers/overview.py +++ b/routers/overview.py @@ -5,12 +5,13 @@ from fastapi import APIRouter from models.schemas import ( DriveHealthSummary, + EnclosureHealth, EnclosureWithDrives, HostDrive, Overview, SlotWithDrive, ) -from services.enclosure import discover_enclosures, list_slots +from services.enclosure import discover_enclosures, get_enclosure_status, list_slots from services.host import get_host_drives from services.smart import get_smart_data from services.zfs import get_zfs_pool_map @@ -26,13 +27,24 @@ async def get_overview(): enclosures_raw = discover_enclosures() pool_map = await get_zfs_pool_map() + # Fetch SES health data for all enclosures concurrently + async def _get_health(enc): + if enc.get("sg_device"): + return await get_enclosure_status(enc["sg_device"]) + return None + + health_results = await asyncio.gather( + *[_get_health(enc) for enc in enclosures_raw], + return_exceptions=True, + ) + enc_results: list[EnclosureWithDrives] = [] total_drives = 0 warnings = 0 errors = 0 all_healthy = True - for enc in enclosures_raw: + for enc_idx, enc in enumerate(enclosures_raw): slots_raw = list_slots(enc["id"]) # Gather SMART data for all populated slots concurrently @@ -110,6 +122,20 @@ async def get_overview(): drive=drive_summary, )) + # Attach enclosure health from SES + health_data = health_results[enc_idx] + enc_health = None + if isinstance(health_data, dict): + enc_health = EnclosureHealth(**health_data) + # Count enclosure-level issues + if enc_health.overall_status == "CRITICAL": + errors += 1 + all_healthy = False + elif enc_health.overall_status == "WARNING": + warnings += 1 + elif isinstance(health_data, Exception): + logger.warning("SES health failed for %s: %s", enc["id"], health_data) + enc_results.append(EnclosureWithDrives( id=enc["id"], sg_device=enc.get("sg_device"), @@ -119,6 +145,7 @@ async def get_overview(): total_slots=enc["total_slots"], populated_slots=enc["populated_slots"], slots=slots_out, + health=enc_health, )) # Host drives (non-enclosure) diff --git a/services/enclosure.py b/services/enclosure.py index b5beebb..ad2b8bf 100644 --- a/services/enclosure.py +++ b/services/enclosure.py @@ -1,5 +1,7 @@ -import os +import asyncio import logging +import os +import re from pathlib import Path logger = logging.getLogger(__name__) @@ -136,3 +138,121 @@ def _parse_slot_number(entry: Path) -> int | None: except ValueError: return None return None + + +async def get_enclosure_status(sg_device: str) -> dict | None: + """Run sg_ses --page=0x02 and parse enclosure health data.""" + try: + proc = await asyncio.create_subprocess_exec( + "sg_ses", "--page=0x02", sg_device, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + if proc.returncode != 0: + logger.warning("sg_ses failed for %s: %s", sg_device, stderr.decode().strip()) + return None + return _parse_ses_page02(stdout.decode(errors="replace")) + except FileNotFoundError: + logger.warning("sg_ses not found") + return None + except Exception as e: + logger.warning("sg_ses error for %s: %s", sg_device, e) + return None + + +def _parse_ses_page02(text: str) -> dict: + """Parse sg_ses --page=0x02 text output into structured health data.""" + result = { + "overall_status": "OK", + "psus": [], + "fans": [], + "temps": [], + "voltages": [], + } + + # Parse header line for overall status: + # INVOP=0, INFO=0, NON-CRIT=0, CRIT=1, UNRECOV=0 + header_match = re.search( + r"INVOP=\d+,\s*INFO=\d+,\s*NON-CRIT=(\d+),\s*CRIT=(\d+),\s*UNRECOV=(\d+)", + text, + ) + if header_match: + non_crit = int(header_match.group(1)) + crit = int(header_match.group(2)) + unrecov = int(header_match.group(3)) + if crit > 0 or unrecov > 0: + result["overall_status"] = "CRITICAL" + elif non_crit > 0: + result["overall_status"] = "WARNING" + + # Split into element type sections. + # Each section starts with "Element type: " + sections = re.split(r"(?=\s*Element type:)", text) + + for section in sections: + type_match = re.match(r"\s*Element type:\s*(.+)", section) + if not type_match: + continue + element_type = type_match.group(1).strip().rstrip(",").lower() + + # Find individual element blocks (skip "Overall descriptor") + elements = re.split(r"(?=\s*Element \d+ descriptor:)", section) + + for elem_text in elements: + desc_match = re.match(r"\s*Element (\d+) descriptor:", elem_text) + if not desc_match: + continue + idx = int(desc_match.group(1)) + + # Extract status line + status_match = re.search(r"status:\s*(.+?)(?:,|\n|$)", elem_text, re.IGNORECASE) + status = status_match.group(1).strip() if status_match else "Unknown" + + if status.lower() == "not installed": + continue + + if "power supply" in element_type: + fail = "Fail=1" in elem_text + ac_fail = "AC fail=1" in elem_text + dc_fail = "DC fail=1" in elem_text + result["psus"].append({ + "index": idx, + "status": status, + "fail": fail, + "ac_fail": ac_fail, + "dc_fail": dc_fail, + }) + + elif "cooling" in element_type or "fan" in element_type: + fail = "Fail=1" in elem_text + rpm_match = re.search(r"Actual speed[=:]\s*(\d+)\s*rpm", elem_text, re.IGNORECASE) + rpm = int(rpm_match.group(1)) if rpm_match else None + result["fans"].append({ + "index": idx, + "status": status, + "rpm": rpm, + "fail": fail, + }) + + elif "temperature" in element_type: + temp_match = re.search(r"Temperature=\s*([\d.]+)\s*C", elem_text) + temp = float(temp_match.group(1)) if temp_match else None + result["temps"].append({ + "index": idx, + "status": status, + "temperature_c": temp, + }) + + elif "voltage" in element_type: + volt_match = re.search(r"Voltage:\s*([\d.]+)\s*V", elem_text, re.IGNORECASE) + if not volt_match: + volt_match = re.search(r"([\d.]+)\s*V", elem_text) + voltage = float(volt_match.group(1)) if volt_match else None + result["voltages"].append({ + "index": idx, + "status": status, + "voltage": voltage, + }) + + return result