Compare commits

..

3 Commits

Author SHA1 Message Date
842f733638 Add tree-line connector for RAID child drives in host drives card 2026-03-07 19:07:14 +00:00
b11c1bdf98 Replace in-memory TTL cache with Redis 2026-03-07 18:45:15 +00:00
0112875894 Add enclosure health details (PSUs, fans, temps, voltages) via SES
Parse sg_ses --page=0x02 output to surface enclosure-level health data
including power supply status, fan RPMs, temperature sensors, and voltage
rails. Failed/critical components are reflected in the overview totals
and shown as status pills in the enclosure card header with an expandable
detail panel.
2026-03-07 06:03:26 +00:00
13 changed files with 631 additions and 61 deletions

25
build.sh Executable file
View File

@@ -0,0 +1,25 @@
#!/usr/bin/env bash
# Commit any pending changes, build the jbod-monitor image tagged with the
# current short git SHA (plus :latest), and push both tags.
# Usage: ./build.sh [commit message]   (defaults to "Build and push image")
set -euo pipefail

IMAGE="docker.adamksmith.xyz/jbod-monitor"
cd "$(dirname "$0")"

# Stage everything; commit only when the index actually changed.
git add -A
if ! git diff --cached --quiet; then
    git commit -m "${1:-Build and push image}"
else
    echo "No changes to commit, using HEAD"
fi

SHA=$(git rev-parse --short HEAD)
echo "Building ${IMAGE}:${SHA}"
docker build -t "${IMAGE}:${SHA}" -t "${IMAGE}:latest" .

echo "Pushing ${IMAGE}:${SHA}"
for tag in "${IMAGE}:${SHA}" "${IMAGE}:latest"; do
    docker push "$tag"
done
echo "Done: ${IMAGE}:${SHA}"

View File

@@ -15,3 +15,22 @@ services:
- TZ=America/Denver
- UVICORN_LOG_LEVEL=info
- ZFS_USE_NSENTER=true
- REDIS_HOST=127.0.0.1
- REDIS_PORT=6379
- REDIS_DB=0
- SMART_CACHE_TTL=120
- SMART_POLL_INTERVAL=90
depends_on:
- redis
redis:
image: redis:7-alpine
container_name: jbod-redis
restart: unless-stopped
network_mode: host
volumes:
- redis-data:/data
command: redis-server --save 60 1 --loglevel warning
volumes:
redis-data:

View File

@@ -698,9 +698,20 @@ function HostDrivesCard({ drives, onSelect, t }) {
</span>
)}
</div>
{d.physical_drives?.map((pd, i) => (
<HostDriveRow key={pd.serial || i} d={pd} onSelect={onSelect} t={t} indent />
))}
{d.physical_drives?.length > 0 && (
<div style={{ marginLeft: 20, borderLeft: `2px solid ${t.cardBorder}`, paddingLeft: 0 }}>
{d.physical_drives.map((pd, i) => (
<div key={pd.serial || i} style={{ display: "flex", alignItems: "center", marginTop: 4 }}>
<div style={{
width: 16, height: 1, background: t.cardBorder, flexShrink: 0,
}} />
<div style={{ flex: 1 }}>
<HostDriveRow d={pd} onSelect={onSelect} t={t} indent />
</div>
</div>
))}
</div>
)}
</React.Fragment>
))}
</div>
@@ -709,7 +720,181 @@ function HostDrivesCard({ drives, onSelect, t }) {
);
}
function EnclosureHealthSummary({ health, t }) {
if (!health) return null;
const statusColors = {
CRITICAL: t.health.error,
WARNING: t.health.warning,
OK: t.health.healthy,
};
const sc = statusColors[health.overall_status] || statusColors.OK;
const failedPsus = health.psus.filter((p) => p.fail || p.status.toLowerCase() === "critical");
const failedFans = health.fans.filter((f) => f.fail);
const temps = health.temps.filter((s) => s.temperature_c != null);
const tempMin = temps.length > 0 ? Math.min(...temps.map((s) => s.temperature_c)) : null;
const tempMax = temps.length > 0 ? Math.max(...temps.map((s) => s.temperature_c)) : null;
return (
<div style={{ display: "flex", alignItems: "center", gap: 8, flexWrap: "wrap", marginTop: 6 }}>
{/* Overall badge */}
<span style={{
display: "inline-flex", alignItems: "center", gap: 5,
padding: "2px 10px", borderRadius: 99,
background: sc.bg, border: `1px solid ${sc.border}`,
fontSize: 11, fontWeight: 700, color: sc.text, letterSpacing: 0.3,
}}>
<span style={{ width: 6, height: 6, borderRadius: "50%", background: sc.dot }} />
{health.overall_status}
</span>
{/* PSU pills */}
{health.psus.map((psu) => {
const bad = psu.fail || psu.status.toLowerCase() === "critical";
const pc = bad ? t.health.error : t.health.healthy;
return (
<span key={psu.index} style={{
display: "inline-flex", alignItems: "center", gap: 4,
padding: "2px 8px", borderRadius: 99,
background: pc.bg, border: `1px solid ${pc.border}`,
fontSize: 10, fontWeight: 600, color: pc.text,
}}>
<span style={{ width: 5, height: 5, borderRadius: "50%", background: pc.dot }} />
PSU {psu.index} {bad ? "FAIL" : "OK"}
</span>
);
})}
{/* Fans summary */}
{health.fans.length > 0 && (
<span style={{
fontSize: 11, color: failedFans.length > 0 ? t.health.error.text : t.textSecondary,
fontWeight: 600,
}}>
{failedFans.length > 0
? `${failedFans.length}/${health.fans.length} fans failed`
: `${health.fans.length} fans OK`}
</span>
)}
{/* Temp range */}
{tempMin != null && (
<span style={{
fontSize: 11, color: tempMax >= 45 ? t.health.warning.text : t.textSecondary,
fontWeight: 600, fontFamily: "'JetBrains Mono', monospace",
}}>
{tempMin === tempMax ? `${tempMin}\u00B0C` : `${tempMin}\u2013${tempMax}\u00B0C`}
</span>
)}
</div>
);
}
function EnclosureHealthDetail({ health, t }) {
if (!health) return null;
const sectionStyle = { marginBottom: 12 };
const headerStyle = {
fontSize: 10, fontWeight: 700, color: t.textMuted,
textTransform: "uppercase", letterSpacing: 1, marginBottom: 6,
};
const rowStyle = {
display: "flex", justifyContent: "space-between", alignItems: "center",
padding: "4px 0", borderBottom: `1px solid ${t.divider}`, fontSize: 12,
};
return (
<div style={{
padding: "12px 16px", background: t.surface,
borderTop: `1px solid ${t.divider}`, borderBottom: `1px solid ${t.divider}`,
}}>
<div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fit, minmax(220px, 1fr))", gap: 16 }}>
{/* PSUs */}
{health.psus.length > 0 && (
<div style={sectionStyle}>
<div style={headerStyle}>Power Supplies</div>
{health.psus.map((psu) => {
const bad = psu.fail || psu.status.toLowerCase() === "critical";
return (
<div key={psu.index} style={rowStyle}>
<span style={{ color: t.textSecondary }}>PSU {psu.index}</span>
<span style={{
fontWeight: 600, color: bad ? t.health.error.text : t.health.healthy.text,
fontFamily: "'JetBrains Mono', monospace",
}}>
{psu.status}{psu.ac_fail ? " (AC fail)" : ""}{psu.dc_fail ? " (DC fail)" : ""}
</span>
</div>
);
})}
</div>
)}
{/* Fans */}
{health.fans.length > 0 && (
<div style={sectionStyle}>
<div style={headerStyle}>Fans</div>
{health.fans.map((fan) => (
<div key={fan.index} style={rowStyle}>
<span style={{ color: t.textSecondary }}>Fan {fan.index}</span>
<span style={{
fontWeight: 600,
color: fan.fail ? t.health.error.text : t.health.healthy.text,
fontFamily: "'JetBrains Mono', monospace",
}}>
{fan.rpm != null ? `${fan.rpm} RPM` : fan.status}
{fan.fail ? " FAIL" : ""}
</span>
</div>
))}
</div>
)}
{/* Temps */}
{health.temps.length > 0 && (
<div style={sectionStyle}>
<div style={headerStyle}>Temperature Sensors</div>
{health.temps.map((ts) => (
<div key={ts.index} style={rowStyle}>
<span style={{ color: t.textSecondary }}>Sensor {ts.index}</span>
<span style={{
fontWeight: 600,
color: ts.temperature_c >= 45 ? t.health.warning.text : t.text,
fontFamily: "'JetBrains Mono', monospace",
}}>
{ts.temperature_c != null ? `${ts.temperature_c}\u00B0C` : ts.status}
</span>
</div>
))}
</div>
)}
{/* Voltages */}
{health.voltages.length > 0 && (
<div style={sectionStyle}>
<div style={headerStyle}>Voltage Rails</div>
{health.voltages.map((vs) => (
<div key={vs.index} style={rowStyle}>
<span style={{ color: t.textSecondary }}>Rail {vs.index}</span>
<span style={{
fontWeight: 600, color: t.text,
fontFamily: "'JetBrains Mono', monospace",
}}>
{vs.voltage != null ? `${vs.voltage} V` : vs.status}
</span>
</div>
))}
</div>
)}
</div>
</div>
);
}
function EnclosureCard({ enclosure, view, onSelect, selectedSerial, t }) {
const [healthExpanded, setHealthExpanded] = useState(false);
return (
<div style={{
background: t.cardBg, borderRadius: 16,
@@ -720,7 +905,7 @@ function EnclosureCard({ enclosure, view, onSelect, selectedSerial, t }) {
<div style={{
padding: "16px 20px",
borderBottom: `1px solid ${t.divider}`,
display: "flex", alignItems: "center", justifyContent: "space-between",
display: "flex", alignItems: "flex-start", justifyContent: "space-between",
flexWrap: "wrap", gap: 8,
}}>
<div>
@@ -730,11 +915,29 @@ function EnclosureCard({ enclosure, view, onSelect, selectedSerial, t }) {
<div style={{ fontSize: 12, color: t.textSecondary, marginTop: 2 }}>
{enclosure.sg_device} &middot; {enclosure.populated_slots}/{enclosure.total_slots} slots populated
</div>
{enclosure.health && (
<div style={{ display: "flex", alignItems: "center", gap: 6 }}>
<EnclosureHealthSummary health={enclosure.health} t={t} />
<button
onClick={() => setHealthExpanded(!healthExpanded)}
style={{
background: "none", border: "none", cursor: "pointer",
fontSize: 11, color: t.accent, fontWeight: 600,
padding: "2px 6px", marginTop: 6,
}}
>
{healthExpanded ? "Hide details" : "Details"}
</button>
</div>
)}
</div>
<div style={{ fontSize: 11, color: t.textMuted, fontFamily: "'JetBrains Mono', monospace" }}>
ID {enclosure.id}
</div>
</div>
{healthExpanded && enclosure.health && (
<EnclosureHealthDetail health={enclosure.health} t={t} />
)}
<div style={{ padding: 16 }}>
{view === "grid" ? (
<GridView enclosure={enclosure} onSelect={onSelect} t={t} />

111
main.py
View File

@@ -1,15 +1,20 @@
import asyncio
import json
import logging
import os
from contextlib import asynccontextmanager
from pathlib import Path
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from models.schemas import HealthCheck
from routers import drives, enclosures, leds, overview
from services.smart import sg_ses_available, smartctl_available
from services.cache import cache_set, close_cache, init_cache, redis_available
from services.enclosure import discover_enclosures, list_slots
from services.smart import SMART_CACHE_TTL, _run_smartctl, sg_ses_available, smartctl_available
from services.zfs import get_zfs_pool_map
logging.basicConfig(
level=logging.INFO,
@@ -17,10 +22,96 @@ logging.basicConfig(
)
logger = logging.getLogger(__name__)
SMART_POLL_INTERVAL = int(os.environ.get("SMART_POLL_INTERVAL", "90"))
_tool_status: dict[str, bool] = {}
_poll_task: asyncio.Task | None = None
async def _discover_poll_devices() -> set[str]:
    """Collect block-device names to poll: enclosure slot devices plus host disks.

    Host disks come from `lsblk -d -J`; lsblk failures are logged and leave
    the enclosure-derived set intact.
    """
    devices: set[str] = set()
    for enc in discover_enclosures():
        for slot in list_slots(enc["id"]):
            if slot["device"]:
                devices.add(slot["device"])
    try:
        proc = await asyncio.create_subprocess_exec(
            "lsblk", "-d", "-o", "NAME,TYPE", "-J",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, _ = await proc.communicate()
        if stdout:
            for dev in json.loads(stdout).get("blockdevices", []):
                # Set membership makes an explicit "not in" check redundant.
                if dev.get("type") == "disk" and dev.get("name"):
                    devices.add(dev["name"])
    except Exception as e:
        logger.warning("lsblk discovery failed in poller: %s", e)
    return devices


async def smart_poll_loop():
    """Background task: periodically refresh SMART data for all drives into Redis.

    Runs forever. Each cycle re-discovers devices, runs smartctl per device and
    stores the result under ``jbod:smart:<device>`` so API requests hit the
    cache. Cancelled on shutdown by the lifespan handler.
    """
    await asyncio.sleep(2)  # let app finish starting
    while True:
        try:
            devices = await _discover_poll_devices()
            # Poll each drive and cache result; per-device failures don't
            # abort the rest of the cycle.
            for device in sorted(devices):
                try:
                    result = await _run_smartctl(device)
                    await cache_set(f"jbod:smart:{device}", result, SMART_CACHE_TTL)
                except Exception as e:
                    logger.warning("Poll failed for %s: %s", device, e)
            # Refresh the ZFS map too. NOTE: get_zfs_pool_map() serves from its
            # own cache until the TTL lapses, so this only repopulates on miss.
            await get_zfs_pool_map()
            logger.info("SMART poll complete: %d devices", len(devices))
        except Exception as e:
            logger.error("SMART poll loop error: %s", e)
        await asyncio.sleep(SMART_POLL_INTERVAL)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """App lifespan: probe external tools, connect Redis, run the SMART poller.

    On shutdown the poller task is cancelled and awaited before the Redis
    connection is closed.
    """
    global _poll_task
    # Startup: record tool availability for the /api/health endpoint.
    _tool_status["smartctl"] = smartctl_available()
    _tool_status["sg_ses"] = sg_ses_available()
    if not _tool_status["smartctl"]:
        logger.warning("smartctl not found — install smartmontools for SMART data")
    if not _tool_status["sg_ses"]:
        logger.warning("sg_ses not found — install sg3-utils for enclosure SES data")
    if os.geteuid() != 0:
        logger.warning("Not running as root — smartctl may fail on some devices")
    await init_cache()
    # Evaluate once (was called twice: once for the status dict, once for the if).
    redis_up = redis_available()
    _tool_status["redis"] = redis_up
    if redis_up:
        # The background pre-warm only makes sense when a cache is available.
        _poll_task = asyncio.create_task(smart_poll_loop())
    yield
    # Shutdown: cancel the poller and wait for it to unwind, then drop Redis.
    if _poll_task is not None:
        _poll_task.cancel()
        try:
            await _poll_task
        except asyncio.CancelledError:
            pass
    await close_cache()
app = FastAPI(
title="JBOD Monitor",
description="Drive health monitoring for JBOD enclosures",
version="0.1.0",
lifespan=lifespan,
)
app.add_middleware(
@@ -35,24 +126,10 @@ app.include_router(drives.router)
app.include_router(leds.router)
app.include_router(overview.router)
_tool_status: dict[str, bool] = {}
@app.on_event("startup")
async def check_dependencies():
_tool_status["smartctl"] = smartctl_available()
_tool_status["sg_ses"] = sg_ses_available()
if not _tool_status["smartctl"]:
logger.warning("smartctl not found — install smartmontools for SMART data")
if not _tool_status["sg_ses"]:
logger.warning("sg_ses not found — install sg3-utils for enclosure SES data")
if os.geteuid() != 0:
logger.warning("Not running as root — smartctl may fail on some devices")
@app.get("/api/health", response_model=HealthCheck, tags=["health"])
async def health():
    """Health probe reporting availability of smartctl, sg_ses and Redis."""
    # Refresh the Redis flag on every call — the connection can drop at runtime.
    _tool_status["redis"] = redis_available()
    return HealthCheck(status="ok", tools=_tool_status)

View File

@@ -65,6 +65,41 @@ class SlotWithDrive(BaseModel):
drive: DriveHealthSummary | None = None
class PsuStatus(BaseModel):
    """SES status for one power-supply element."""

    index: int  # element index within the SES page
    status: str  # raw status text reported by sg_ses
    fail: bool = False  # general failure flag (Fail=1)
    ac_fail: bool = False  # AC input failure flag
    dc_fail: bool = False  # DC output failure flag
class FanStatus(BaseModel):
    """SES status for one cooling/fan element."""

    index: int  # element index within the SES page
    status: str  # raw status text reported by sg_ses
    rpm: int | None = None  # actual speed, when the enclosure reports it
    fail: bool = False  # failure flag (Fail=1)
class TempSensor(BaseModel):
    """SES reading for one temperature-sensor element."""

    index: int  # element index within the SES page
    status: str  # raw status text reported by sg_ses
    temperature_c: float | None = None  # degrees Celsius, when reported
class VoltageSensor(BaseModel):
    """SES reading for one voltage-sensor element."""

    index: int  # element index within the SES page
    status: str  # raw status text reported by sg_ses
    voltage: float | None = None  # volts, when reported
class EnclosureHealth(BaseModel):
    """Aggregated enclosure health parsed from SES status page 0x02."""

    overall_status: str = "OK"  # "OK" | "WARNING" | "CRITICAL"
    psus: list[PsuStatus] = []
    fans: list[FanStatus] = []
    temps: list[TempSensor] = []
    voltages: list[VoltageSensor] = []
class EnclosureWithDrives(BaseModel):
id: str
sg_device: str | None = None
@@ -74,6 +109,7 @@ class EnclosureWithDrives(BaseModel):
total_slots: int
populated_slots: int
slots: list[SlotWithDrive]
health: EnclosureHealth | None = None
class HostDrive(BaseModel):

View File

@@ -1,3 +1,4 @@
fastapi>=0.115.0
uvicorn>=0.34.0
pydantic>=2.10.0
redis>=5.0.0

View File

@@ -1,4 +1,4 @@
from fastapi import APIRouter, HTTPException
from fastapi import APIRouter, HTTPException, Response
from models.schemas import DriveDetail
from services.smart import get_smart_data
@@ -8,10 +8,10 @@ router = APIRouter(prefix="/api/drives", tags=["drives"])
@router.get("/{device}", response_model=DriveDetail)
async def get_drive_detail(device: str):
async def get_drive_detail(device: str, response: Response):
"""Get SMART detail for a specific block device."""
try:
data = await get_smart_data(device)
data, cache_hit = await get_smart_data(device)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
@@ -25,4 +25,6 @@ async def get_drive_detail(device: str):
data["zfs_vdev"] = zfs_info["vdev"]
data["zfs_state"] = zfs_info.get("state")
response.headers["X-Cache"] = "HIT" if cache_hit else "MISS"
return DriveDetail(**data)

View File

@@ -1,16 +1,17 @@
import asyncio
import logging
from fastapi import APIRouter
from fastapi import APIRouter, Response
from models.schemas import (
DriveHealthSummary,
EnclosureHealth,
EnclosureWithDrives,
HostDrive,
Overview,
SlotWithDrive,
)
from services.enclosure import discover_enclosures, list_slots
from services.enclosure import discover_enclosures, get_enclosure_status, list_slots
from services.host import get_host_drives
from services.smart import get_smart_data
from services.zfs import get_zfs_pool_map
@@ -21,18 +22,29 @@ router = APIRouter(prefix="/api/overview", tags=["overview"])
@router.get("", response_model=Overview)
async def get_overview():
async def get_overview(response: Response):
"""Aggregate view of all enclosures, slots, and drive health."""
enclosures_raw = discover_enclosures()
pool_map = await get_zfs_pool_map()
# Fetch SES health data for all enclosures concurrently
async def _get_health(enc):
if enc.get("sg_device"):
return await get_enclosure_status(enc["sg_device"])
return None
health_results = await asyncio.gather(
*[_get_health(enc) for enc in enclosures_raw],
return_exceptions=True,
)
enc_results: list[EnclosureWithDrives] = []
total_drives = 0
warnings = 0
errors = 0
all_healthy = True
for enc in enclosures_raw:
for enc_idx, enc in enumerate(enclosures_raw):
slots_raw = list_slots(enc["id"])
# Gather SMART data for all populated slots concurrently
@@ -41,12 +53,19 @@ async def get_overview():
smart_results = await asyncio.gather(*smart_tasks, return_exceptions=True)
smart_map: dict[str, dict] = {}
all_cache_hits = True
any_lookups = False
for (slot_info, dev), result in zip(populated, smart_results):
if isinstance(result, Exception):
logger.warning("SMART query failed for %s: %s", dev, result)
smart_map[dev] = {"device": dev, "smart_supported": False}
all_cache_hits = False
else:
smart_map[dev] = result
data, hit = result
smart_map[dev] = data
any_lookups = True
if not hit:
all_cache_hits = False
slots_out: list[SlotWithDrive] = []
for s in slots_raw:
@@ -110,6 +129,20 @@ async def get_overview():
drive=drive_summary,
))
# Attach enclosure health from SES
health_data = health_results[enc_idx]
enc_health = None
if isinstance(health_data, dict):
enc_health = EnclosureHealth(**health_data)
# Count enclosure-level issues
if enc_health.overall_status == "CRITICAL":
errors += 1
all_healthy = False
elif enc_health.overall_status == "WARNING":
warnings += 1
elif isinstance(health_data, Exception):
logger.warning("SES health failed for %s: %s", enc["id"], health_data)
enc_results.append(EnclosureWithDrives(
id=enc["id"],
sg_device=enc.get("sg_device"),
@@ -119,6 +152,7 @@ async def get_overview():
total_slots=enc["total_slots"],
populated_slots=enc["populated_slots"],
slots=slots_out,
health=enc_health,
))
# Host drives (non-enclosure)
@@ -143,6 +177,8 @@ async def get_overview():
warnings += 1
host_drives_out.append(HostDrive(**hd))
response.headers["X-Cache"] = "HIT" if (any_lookups and all_cache_hits) else "MISS"
return Overview(
healthy=all_healthy and errors == 0,
drive_count=total_drives,

View File

@@ -1,29 +1,62 @@
import time
import json
import logging
import os
from typing import Any
import redis.asyncio as redis
class TTLCache:
"""Simple in-memory TTL cache."""
logger = logging.getLogger(__name__)
def __init__(self, ttl_seconds: int = 60):
self._ttl = ttl_seconds
self._store: dict[str, tuple[float, Any]] = {}
_redis: redis.Redis | None = None
def get(self, key: str) -> Any | None:
entry = self._store.get(key)
if entry is None:
async def init_cache() -> None:
    """Create the module-level Redis connection from environment variables.

    Reads REDIS_HOST / REDIS_PORT / REDIS_DB. On any failure the app keeps
    running without a cache: ``_redis`` stays None and cache ops become no-ops.
    """
    global _redis
    host = os.environ.get("REDIS_HOST", "localhost")
    port = int(os.environ.get("REDIS_PORT", "6379"))
    db = int(os.environ.get("REDIS_DB", "0"))
    try:
        _redis = redis.Redis(
            host=host,
            port=port,
            db=db,
            decode_responses=True,
            # Bound connect/IO so startup can't hang on an unreachable host
            # (e.g. a firewalled port that silently drops packets).
            socket_connect_timeout=3,
            socket_timeout=5,
        )
        await _redis.ping()
        logger.info("Redis connected at %s:%d/%d", host, port, db)
    except Exception as e:
        logger.warning("Redis connection failed: %s — running without cache", e)
        _redis = None
async def close_cache() -> None:
    """Close the Redis connection (if any) and reset module state."""
    global _redis
    if _redis is not None:
        # aclose() is the async close API of redis>=5 (per requirements pin).
        await _redis.aclose()
        _redis = None
def redis_available() -> bool:
    """Return whether Redis connection is live."""
    # Only reflects whether init_cache() succeeded — does not re-ping the server.
    return _redis is not None
async def cache_get(key: str) -> Any | None:
    """GET *key* from Redis and JSON-decode the stored value.

    Returns None on cache miss, when Redis is unavailable, or on any error
    (errors are logged, never raised, so callers can't be broken by the cache).
    """
    # NOTE(review): this span previously contained interleaved remnants of the
    # removed in-memory TTLCache; reconstructed as the single coherent function.
    if _redis is None:
        return None
    try:
        raw = await _redis.get(key)
        if raw is None:
            return None
        return json.loads(raw)
    except Exception as e:
        logger.warning("Redis GET %s failed: %s", key, e)
        return None
smart_cache = TTLCache(ttl_seconds=60)
async def cache_set(key: str, value: Any, ttl: int = 120) -> None:
    """SET key in Redis with expiry, silently catches errors."""
    # No-op when Redis was never connected.
    if _redis is None:
        return
    try:
        # Values are stored JSON-serialized; cache_get decodes symmetrically.
        await _redis.set(key, json.dumps(value), ex=ttl)
    except Exception as e:
        # Cache failures must never break request handling — log and move on.
        logger.warning("Redis SET %s failed: %s", key, e)

View File

@@ -1,5 +1,7 @@
import os
import asyncio
import logging
import os
import re
from pathlib import Path
logger = logging.getLogger(__name__)
@@ -136,3 +138,121 @@ def _parse_slot_number(entry: Path) -> int | None:
except ValueError:
return None
return None
async def get_enclosure_status(sg_device: str) -> dict | None:
    """Run ``sg_ses --page=0x02`` against *sg_device* and parse the output.

    Returns the structured health dict, or None when sg_ses is absent, exits
    nonzero, or anything else goes wrong (every failure path is logged).
    """
    command = ("sg_ses", "--page=0x02", sg_device)
    try:
        process = await asyncio.create_subprocess_exec(
            *command,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        out, err = await process.communicate()
        if process.returncode == 0:
            return _parse_ses_page02(out.decode(errors="replace"))
        logger.warning("sg_ses failed for %s: %s", sg_device, err.decode().strip())
        return None
    except FileNotFoundError:
        logger.warning("sg_ses not found")
        return None
    except Exception as e:
        logger.warning("sg_ses error for %s: %s", sg_device, e)
        return None
def _parse_ses_page02(text: str) -> dict:
"""Parse sg_ses --page=0x02 text output into structured health data."""
result = {
"overall_status": "OK",
"psus": [],
"fans": [],
"temps": [],
"voltages": [],
}
# Parse header line for overall status:
# INVOP=0, INFO=0, NON-CRIT=0, CRIT=1, UNRECOV=0
header_match = re.search(
r"INVOP=\d+,\s*INFO=\d+,\s*NON-CRIT=(\d+),\s*CRIT=(\d+),\s*UNRECOV=(\d+)",
text,
)
if header_match:
non_crit = int(header_match.group(1))
crit = int(header_match.group(2))
unrecov = int(header_match.group(3))
if crit > 0 or unrecov > 0:
result["overall_status"] = "CRITICAL"
elif non_crit > 0:
result["overall_status"] = "WARNING"
# Split into element type sections.
# Each section starts with "Element type: <type>"
sections = re.split(r"(?=\s*Element type:)", text)
for section in sections:
type_match = re.match(r"\s*Element type:\s*(.+)", section)
if not type_match:
continue
element_type = type_match.group(1).strip().rstrip(",").lower()
# Find individual element blocks (skip "Overall descriptor")
elements = re.split(r"(?=\s*Element \d+ descriptor:)", section)
for elem_text in elements:
desc_match = re.match(r"\s*Element (\d+) descriptor:", elem_text)
if not desc_match:
continue
idx = int(desc_match.group(1))
# Extract status line
status_match = re.search(r"status:\s*(.+?)(?:,|\n|$)", elem_text, re.IGNORECASE)
status = status_match.group(1).strip() if status_match else "Unknown"
if status.lower() == "not installed":
continue
if "power supply" in element_type:
fail = "Fail=1" in elem_text
ac_fail = "AC fail=1" in elem_text
dc_fail = "DC fail=1" in elem_text
result["psus"].append({
"index": idx,
"status": status,
"fail": fail,
"ac_fail": ac_fail,
"dc_fail": dc_fail,
})
elif "cooling" in element_type or "fan" in element_type:
fail = "Fail=1" in elem_text
rpm_match = re.search(r"Actual speed[=:]\s*(\d+)\s*rpm", elem_text, re.IGNORECASE)
rpm = int(rpm_match.group(1)) if rpm_match else None
result["fans"].append({
"index": idx,
"status": status,
"rpm": rpm,
"fail": fail,
})
elif "temperature" in element_type:
temp_match = re.search(r"Temperature=\s*([\d.]+)\s*C", elem_text)
temp = float(temp_match.group(1)) if temp_match else None
result["temps"].append({
"index": idx,
"status": status,
"temperature_c": temp,
})
elif "voltage" in element_type:
volt_match = re.search(r"Voltage:\s*([\d.]+)\s*V", elem_text, re.IGNORECASE)
if not volt_match:
volt_match = re.search(r"([\d.]+)\s*V", elem_text)
voltage = float(volt_match.group(1)) if volt_match else None
result["voltages"].append({
"index": idx,
"status": status,
"voltage": voltage,
})
return result

View File

@@ -65,12 +65,14 @@ async def get_host_drives() -> list[dict]:
smart_results = await asyncio.gather(*smart_tasks, return_exceptions=True)
results: list[dict] = []
for dev_info, smart in zip(host_devices, smart_results):
for dev_info, smart_result in zip(host_devices, smart_results):
name = dev_info["name"]
if isinstance(smart, Exception):
logger.warning("SMART query failed for host drive %s: %s", name, smart)
if isinstance(smart_result, Exception):
logger.warning("SMART query failed for host drive %s: %s", name, smart_result)
smart = {"device": name, "smart_supported": False}
else:
smart, _ = smart_result
# Compute health_status (same logic as overview.py)
healthy = smart.get("smart_healthy")

View File

@@ -1,10 +1,11 @@
import asyncio
import json
import logging
import os
import re
import shutil
from services.cache import smart_cache
from services.cache import cache_get, cache_set
logger = logging.getLogger(__name__)
@@ -17,6 +18,8 @@ ATTR_PENDING = 197
ATTR_UNCORRECTABLE = 198
ATTR_WEAR_LEVELING = 177 # SSD wear leveling
SMART_CACHE_TTL = int(os.environ.get("SMART_CACHE_TTL", "120"))
def smartctl_available() -> bool:
return shutil.which("smartctl") is not None
@@ -26,19 +29,22 @@ def sg_ses_available() -> bool:
return shutil.which("sg_ses") is not None
async def get_smart_data(device: str) -> dict:
"""Run smartctl -a -j against a device, with caching."""
async def get_smart_data(device: str) -> tuple[dict, bool]:
"""Run smartctl -a -j against a device, with caching.
Returns (data, cache_hit) tuple.
"""
# Sanitize device name: only allow alphanumeric and hyphens
if not re.match(r"^[a-zA-Z0-9\-]+$", device):
raise ValueError(f"Invalid device name: {device}")
cached = smart_cache.get(device)
cached = await cache_get(f"jbod:smart:{device}")
if cached is not None:
return cached
return (cached, True)
result = await _run_smartctl(device)
smart_cache.set(device, result)
return result
await cache_set(f"jbod:smart:{device}", result, SMART_CACHE_TTL)
return (result, False)
async def _run_smartctl(device: str) -> dict:

View File

@@ -4,11 +4,15 @@ import logging
import re
from pathlib import Path
from services.cache import cache_get, cache_set
logger = logging.getLogger(__name__)
# Allow overriding the zpool binary path via env (for bind-mounted host tools)
ZPOOL_BIN = os.environ.get("ZPOOL_BIN", "zpool")
ZFS_CACHE_TTL = 300
async def get_zfs_pool_map() -> dict[str, dict]:
"""Return a dict mapping device names to ZFS pool and vdev info.
@@ -16,6 +20,10 @@ async def get_zfs_pool_map() -> dict[str, dict]:
e.g. {"sda": {"pool": "tank", "vdev": "raidz2-0"},
"sdb": {"pool": "fast", "vdev": "mirror-0"}}
"""
cached = await cache_get("jbod:zfs_map")
if cached is not None:
return cached
pool_map = {}
try:
# When running in a container with pid:host, use nsenter to run
@@ -94,6 +102,8 @@ async def get_zfs_pool_map() -> dict[str, dict]:
pass
except FileNotFoundError:
logger.debug("zpool not available")
await cache_set("jbod:zfs_map", pool_map, ZFS_CACHE_TTL)
return pool_map