Add enclosure health details (PSUs, fans, temps, voltages) via SES

Parse sg_ses --page=0x02 output to surface enclosure-level health data
including power supply status, fan RPMs, temperature sensors, and voltage
rails. Failed/critical components are reflected in the overview totals
and shown as status pills in the enclosure card header with an expandable
detail panel.
This commit is contained in:
2026-03-07 06:03:26 +00:00
parent 8ea8fdef08
commit 0112875894
4 changed files with 379 additions and 4 deletions

View File

@@ -709,7 +709,181 @@ function HostDrivesCard({ drives, onSelect, t }) {
);
}
function EnclosureHealthSummary({ health, t }) {
if (!health) return null;
const statusColors = {
CRITICAL: t.health.error,
WARNING: t.health.warning,
OK: t.health.healthy,
};
const sc = statusColors[health.overall_status] || statusColors.OK;
const failedPsus = health.psus.filter((p) => p.fail || p.status.toLowerCase() === "critical");
const failedFans = health.fans.filter((f) => f.fail);
const temps = health.temps.filter((s) => s.temperature_c != null);
const tempMin = temps.length > 0 ? Math.min(...temps.map((s) => s.temperature_c)) : null;
const tempMax = temps.length > 0 ? Math.max(...temps.map((s) => s.temperature_c)) : null;
return (
<div style={{ display: "flex", alignItems: "center", gap: 8, flexWrap: "wrap", marginTop: 6 }}>
{/* Overall badge */}
<span style={{
display: "inline-flex", alignItems: "center", gap: 5,
padding: "2px 10px", borderRadius: 99,
background: sc.bg, border: `1px solid ${sc.border}`,
fontSize: 11, fontWeight: 700, color: sc.text, letterSpacing: 0.3,
}}>
<span style={{ width: 6, height: 6, borderRadius: "50%", background: sc.dot }} />
{health.overall_status}
</span>
{/* PSU pills */}
{health.psus.map((psu) => {
const bad = psu.fail || psu.status.toLowerCase() === "critical";
const pc = bad ? t.health.error : t.health.healthy;
return (
<span key={psu.index} style={{
display: "inline-flex", alignItems: "center", gap: 4,
padding: "2px 8px", borderRadius: 99,
background: pc.bg, border: `1px solid ${pc.border}`,
fontSize: 10, fontWeight: 600, color: pc.text,
}}>
<span style={{ width: 5, height: 5, borderRadius: "50%", background: pc.dot }} />
PSU {psu.index} {bad ? "FAIL" : "OK"}
</span>
);
})}
{/* Fans summary */}
{health.fans.length > 0 && (
<span style={{
fontSize: 11, color: failedFans.length > 0 ? t.health.error.text : t.textSecondary,
fontWeight: 600,
}}>
{failedFans.length > 0
? `${failedFans.length}/${health.fans.length} fans failed`
: `${health.fans.length} fans OK`}
</span>
)}
{/* Temp range */}
{tempMin != null && (
<span style={{
fontSize: 11, color: tempMax >= 45 ? t.health.warning.text : t.textSecondary,
fontWeight: 600, fontFamily: "'JetBrains Mono', monospace",
}}>
{tempMin === tempMax ? `${tempMin}\u00B0C` : `${tempMin}\u2013${tempMax}\u00B0C`}
</span>
)}
</div>
);
}
function EnclosureHealthDetail({ health, t }) {
if (!health) return null;
const sectionStyle = { marginBottom: 12 };
const headerStyle = {
fontSize: 10, fontWeight: 700, color: t.textMuted,
textTransform: "uppercase", letterSpacing: 1, marginBottom: 6,
};
const rowStyle = {
display: "flex", justifyContent: "space-between", alignItems: "center",
padding: "4px 0", borderBottom: `1px solid ${t.divider}`, fontSize: 12,
};
return (
<div style={{
padding: "12px 16px", background: t.surface,
borderTop: `1px solid ${t.divider}`, borderBottom: `1px solid ${t.divider}`,
}}>
<div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fit, minmax(220px, 1fr))", gap: 16 }}>
{/* PSUs */}
{health.psus.length > 0 && (
<div style={sectionStyle}>
<div style={headerStyle}>Power Supplies</div>
{health.psus.map((psu) => {
const bad = psu.fail || psu.status.toLowerCase() === "critical";
return (
<div key={psu.index} style={rowStyle}>
<span style={{ color: t.textSecondary }}>PSU {psu.index}</span>
<span style={{
fontWeight: 600, color: bad ? t.health.error.text : t.health.healthy.text,
fontFamily: "'JetBrains Mono', monospace",
}}>
{psu.status}{psu.ac_fail ? " (AC fail)" : ""}{psu.dc_fail ? " (DC fail)" : ""}
</span>
</div>
);
})}
</div>
)}
{/* Fans */}
{health.fans.length > 0 && (
<div style={sectionStyle}>
<div style={headerStyle}>Fans</div>
{health.fans.map((fan) => (
<div key={fan.index} style={rowStyle}>
<span style={{ color: t.textSecondary }}>Fan {fan.index}</span>
<span style={{
fontWeight: 600,
color: fan.fail ? t.health.error.text : t.health.healthy.text,
fontFamily: "'JetBrains Mono', monospace",
}}>
{fan.rpm != null ? `${fan.rpm} RPM` : fan.status}
{fan.fail ? " FAIL" : ""}
</span>
</div>
))}
</div>
)}
{/* Temps */}
{health.temps.length > 0 && (
<div style={sectionStyle}>
<div style={headerStyle}>Temperature Sensors</div>
{health.temps.map((ts) => (
<div key={ts.index} style={rowStyle}>
<span style={{ color: t.textSecondary }}>Sensor {ts.index}</span>
<span style={{
fontWeight: 600,
color: ts.temperature_c >= 45 ? t.health.warning.text : t.text,
fontFamily: "'JetBrains Mono', monospace",
}}>
{ts.temperature_c != null ? `${ts.temperature_c}\u00B0C` : ts.status}
</span>
</div>
))}
</div>
)}
{/* Voltages */}
{health.voltages.length > 0 && (
<div style={sectionStyle}>
<div style={headerStyle}>Voltage Rails</div>
{health.voltages.map((vs) => (
<div key={vs.index} style={rowStyle}>
<span style={{ color: t.textSecondary }}>Rail {vs.index}</span>
<span style={{
fontWeight: 600, color: t.text,
fontFamily: "'JetBrains Mono', monospace",
}}>
{vs.voltage != null ? `${vs.voltage} V` : vs.status}
</span>
</div>
))}
</div>
)}
</div>
</div>
);
}
function EnclosureCard({ enclosure, view, onSelect, selectedSerial, t }) {
const [healthExpanded, setHealthExpanded] = useState(false);
return (
<div style={{
background: t.cardBg, borderRadius: 16,
@@ -720,7 +894,7 @@ function EnclosureCard({ enclosure, view, onSelect, selectedSerial, t }) {
<div style={{
padding: "16px 20px",
borderBottom: `1px solid ${t.divider}`,
display: "flex", alignItems: "center", justifyContent: "space-between",
display: "flex", alignItems: "flex-start", justifyContent: "space-between",
flexWrap: "wrap", gap: 8,
}}>
<div>
@@ -730,11 +904,29 @@ function EnclosureCard({ enclosure, view, onSelect, selectedSerial, t }) {
<div style={{ fontSize: 12, color: t.textSecondary, marginTop: 2 }}>
{enclosure.sg_device} &middot; {enclosure.populated_slots}/{enclosure.total_slots} slots populated
</div>
{enclosure.health && (
<div style={{ display: "flex", alignItems: "center", gap: 6 }}>
<EnclosureHealthSummary health={enclosure.health} t={t} />
<button
onClick={() => setHealthExpanded(!healthExpanded)}
style={{
background: "none", border: "none", cursor: "pointer",
fontSize: 11, color: t.accent, fontWeight: 600,
padding: "2px 6px", marginTop: 6,
}}
>
{healthExpanded ? "Hide details" : "Details"}
</button>
</div>
)}
</div>
<div style={{ fontSize: 11, color: t.textMuted, fontFamily: "'JetBrains Mono', monospace" }}>
ID {enclosure.id}
</div>
</div>
{healthExpanded && enclosure.health && (
<EnclosureHealthDetail health={enclosure.health} t={t} />
)}
<div style={{ padding: 16 }}>
{view === "grid" ? (
<GridView enclosure={enclosure} onSelect={onSelect} t={t} />

View File

@@ -65,6 +65,41 @@ class SlotWithDrive(BaseModel):
drive: DriveHealthSummary | None = None
class PsuStatus(BaseModel):
    """Health record for a single enclosure power supply element."""

    # Element number taken from the "Element N descriptor" heading.
    index: int
    # Status text reported for the element (e.g. "OK", "Critical").
    status: str
    # True when the element text reports "Fail=1".
    fail: bool = False
    # True when the element text reports "AC fail=1".
    ac_fail: bool = False
    # True when the element text reports "DC fail=1".
    dc_fail: bool = False
class FanStatus(BaseModel):
    """Health record for a single enclosure cooling/fan element."""

    # Element number taken from the "Element N descriptor" heading.
    index: int
    # Status text reported for the element.
    status: str
    # Reported fan speed in RPM; None when no speed was found in the output.
    rpm: int | None = None
    # True when the element text reports "Fail=1".
    fail: bool = False
class TempSensor(BaseModel):
    """Reading from a single enclosure temperature sensor element."""

    # Element number taken from the "Element N descriptor" heading.
    index: int
    # Status text reported for the element.
    status: str
    # Temperature in degrees Celsius; None when no reading was found.
    temperature_c: float | None = None
class VoltageSensor(BaseModel):
    """Reading from a single enclosure voltage sensor element."""

    # Element number taken from the "Element N descriptor" heading.
    index: int
    # Status text reported for the element.
    status: str
    # Measured voltage in volts; None when no reading was found.
    voltage: float | None = None
class EnclosureHealth(BaseModel):
    """Enclosure-level health data gathered via SES (sg_ses page 0x02)."""

    # One of "OK", "WARNING" or "CRITICAL" as produced by the SES parser;
    # defaults to "OK" when nothing worse was reported.
    overall_status: str = "OK"
    # Per-component records; each list is empty when the enclosure reports
    # no elements of that kind.
    psus: list[PsuStatus] = []
    fans: list[FanStatus] = []
    temps: list[TempSensor] = []
    voltages: list[VoltageSensor] = []
class EnclosureWithDrives(BaseModel):
id: str
sg_device: str | None = None
@@ -74,6 +109,7 @@ class EnclosureWithDrives(BaseModel):
total_slots: int
populated_slots: int
slots: list[SlotWithDrive]
health: EnclosureHealth | None = None
class HostDrive(BaseModel):

View File

@@ -5,12 +5,13 @@ from fastapi import APIRouter
from models.schemas import (
DriveHealthSummary,
EnclosureHealth,
EnclosureWithDrives,
HostDrive,
Overview,
SlotWithDrive,
)
from services.enclosure import discover_enclosures, list_slots
from services.enclosure import discover_enclosures, get_enclosure_status, list_slots
from services.host import get_host_drives
from services.smart import get_smart_data
from services.zfs import get_zfs_pool_map
@@ -26,13 +27,24 @@ async def get_overview():
enclosures_raw = discover_enclosures()
pool_map = await get_zfs_pool_map()
# Fetch SES health data for all enclosures concurrently
async def _get_health(enc):
if enc.get("sg_device"):
return await get_enclosure_status(enc["sg_device"])
return None
health_results = await asyncio.gather(
*[_get_health(enc) for enc in enclosures_raw],
return_exceptions=True,
)
enc_results: list[EnclosureWithDrives] = []
total_drives = 0
warnings = 0
errors = 0
all_healthy = True
for enc in enclosures_raw:
for enc_idx, enc in enumerate(enclosures_raw):
slots_raw = list_slots(enc["id"])
# Gather SMART data for all populated slots concurrently
@@ -110,6 +122,20 @@ async def get_overview():
drive=drive_summary,
))
# Attach enclosure health from SES
health_data = health_results[enc_idx]
enc_health = None
if isinstance(health_data, dict):
enc_health = EnclosureHealth(**health_data)
# Count enclosure-level issues
if enc_health.overall_status == "CRITICAL":
errors += 1
all_healthy = False
elif enc_health.overall_status == "WARNING":
warnings += 1
elif isinstance(health_data, Exception):
logger.warning("SES health failed for %s: %s", enc["id"], health_data)
enc_results.append(EnclosureWithDrives(
id=enc["id"],
sg_device=enc.get("sg_device"),
@@ -119,6 +145,7 @@ async def get_overview():
total_slots=enc["total_slots"],
populated_slots=enc["populated_slots"],
slots=slots_out,
health=enc_health,
))
# Host drives (non-enclosure)

View File

@@ -1,5 +1,7 @@
import os
import asyncio
import logging
import os
import re
from pathlib import Path
logger = logging.getLogger(__name__)
@@ -136,3 +138,121 @@ def _parse_slot_number(entry: Path) -> int | None:
except ValueError:
return None
return None
async def get_enclosure_status(sg_device: str, timeout: float = 10.0) -> dict | None:
    """Run ``sg_ses --page=0x02`` on *sg_device* and parse the enclosure health data.

    Args:
        sg_device: Device node handed to ``sg_ses`` (e.g. an ``/dev/sg*`` path).
        timeout: Seconds to wait for ``sg_ses`` to finish. Without a limit a
            wedged enclosure would stall every caller awaiting this coroutine.

    Returns:
        The dict produced by ``_parse_ses_page02`` on success, or ``None`` when
        ``sg_ses`` is missing, exits non-zero, times out, or any other error
        occurs. Failures are logged as warnings and never raised.
    """
    proc = None
    try:
        proc = await asyncio.create_subprocess_exec(
            "sg_ses", "--page=0x02", sg_device,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
        if proc.returncode != 0:
            # Decode leniently: a stray non-UTF-8 byte must not mask the real error.
            logger.warning(
                "sg_ses failed for %s: %s",
                sg_device, stderr.decode(errors="replace").strip(),
            )
            return None
        return _parse_ses_page02(stdout.decode(errors="replace"))
    except FileNotFoundError:
        logger.warning("sg_ses not found")
        return None
    except asyncio.TimeoutError:
        logger.warning("sg_ses timed out after %.1fs for %s", timeout, sg_device)
        return None
    except Exception as e:
        logger.warning("sg_ses error for %s: %s", sg_device, e)
        return None
    finally:
        # On timeout or cancellation the child is still running; do not leave it
        # behind. We deliberately do not await its exit, since a process stuck
        # in the kernel may never terminate.
        if proc is not None and proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass
def _parse_ses_page02(text: str) -> dict:
"""Parse sg_ses --page=0x02 text output into structured health data."""
result = {
"overall_status": "OK",
"psus": [],
"fans": [],
"temps": [],
"voltages": [],
}
# Parse header line for overall status:
# INVOP=0, INFO=0, NON-CRIT=0, CRIT=1, UNRECOV=0
header_match = re.search(
r"INVOP=\d+,\s*INFO=\d+,\s*NON-CRIT=(\d+),\s*CRIT=(\d+),\s*UNRECOV=(\d+)",
text,
)
if header_match:
non_crit = int(header_match.group(1))
crit = int(header_match.group(2))
unrecov = int(header_match.group(3))
if crit > 0 or unrecov > 0:
result["overall_status"] = "CRITICAL"
elif non_crit > 0:
result["overall_status"] = "WARNING"
# Split into element type sections.
# Each section starts with "Element type: <type>"
sections = re.split(r"(?=\s*Element type:)", text)
for section in sections:
type_match = re.match(r"\s*Element type:\s*(.+)", section)
if not type_match:
continue
element_type = type_match.group(1).strip().rstrip(",").lower()
# Find individual element blocks (skip "Overall descriptor")
elements = re.split(r"(?=\s*Element \d+ descriptor:)", section)
for elem_text in elements:
desc_match = re.match(r"\s*Element (\d+) descriptor:", elem_text)
if not desc_match:
continue
idx = int(desc_match.group(1))
# Extract status line
status_match = re.search(r"status:\s*(.+?)(?:,|\n|$)", elem_text, re.IGNORECASE)
status = status_match.group(1).strip() if status_match else "Unknown"
if status.lower() == "not installed":
continue
if "power supply" in element_type:
fail = "Fail=1" in elem_text
ac_fail = "AC fail=1" in elem_text
dc_fail = "DC fail=1" in elem_text
result["psus"].append({
"index": idx,
"status": status,
"fail": fail,
"ac_fail": ac_fail,
"dc_fail": dc_fail,
})
elif "cooling" in element_type or "fan" in element_type:
fail = "Fail=1" in elem_text
rpm_match = re.search(r"Actual speed[=:]\s*(\d+)\s*rpm", elem_text, re.IGNORECASE)
rpm = int(rpm_match.group(1)) if rpm_match else None
result["fans"].append({
"index": idx,
"status": status,
"rpm": rpm,
"fail": fail,
})
elif "temperature" in element_type:
temp_match = re.search(r"Temperature=\s*([\d.]+)\s*C", elem_text)
temp = float(temp_match.group(1)) if temp_match else None
result["temps"].append({
"index": idx,
"status": status,
"temperature_c": temp,
})
elif "voltage" in element_type:
volt_match = re.search(r"Voltage:\s*([\d.]+)\s*V", elem_text, re.IGNORECASE)
if not volt_match:
volt_match = re.search(r"([\d.]+)\s*V", elem_text)
voltage = float(volt_match.group(1)) if volt_match else None
result["voltages"].append({
"index": idx,
"status": status,
"voltage": voltage,
})
return result