Compare commits
3 Commits
8ea8fdef08
...
842f733638
| Author | SHA1 | Date | |
|---|---|---|---|
| 842f733638 | |||
| b11c1bdf98 | |||
| 0112875894 |
25
build.sh
Executable file
25
build.sh
Executable file
@@ -0,0 +1,25 @@
|
||||
#!/usr/bin/env bash
#
# Commit any pending work, then build the image and push it under two tags:
# the short commit SHA and "latest".
#
# Usage: build.sh [commit message]
set -euo pipefail

IMAGE="docker.adamksmith.xyz/jbod-monitor"

# Run from the directory that holds this script so "." is the build context.
cd "$(dirname "$0")"

# Stage everything; only create a commit when the index differs from HEAD.
git add -A
if ! git diff --cached --quiet; then
  git commit -m "${1:-Build and push image}"
else
  echo "No changes to commit, using HEAD"
fi

SHA=$(git rev-parse --short HEAD)
versioned="${IMAGE}:${SHA}"
floating="${IMAGE}:latest"

echo "Building ${versioned}"
docker build -t "${versioned}" -t "${floating}" .

echo "Pushing ${versioned}"
for ref in "${versioned}" "${floating}"; do
  docker push "${ref}"
done

echo "Done: ${versioned}"
|
||||
@@ -15,3 +15,22 @@ services:
|
||||
- TZ=America/Denver
|
||||
- UVICORN_LOG_LEVEL=info
|
||||
- ZFS_USE_NSENTER=true
|
||||
- REDIS_HOST=127.0.0.1
|
||||
- REDIS_PORT=6379
|
||||
- REDIS_DB=0
|
||||
- SMART_CACHE_TTL=120
|
||||
- SMART_POLL_INTERVAL=90
|
||||
depends_on:
|
||||
- redis
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
container_name: jbod-redis
|
||||
restart: unless-stopped
|
||||
network_mode: host
|
||||
volumes:
|
||||
- redis-data:/data
|
||||
command: redis-server --save 60 1 --loglevel warning
|
||||
|
||||
volumes:
|
||||
redis-data:
|
||||
|
||||
@@ -698,9 +698,20 @@ function HostDrivesCard({ drives, onSelect, t }) {
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
{d.physical_drives?.map((pd, i) => (
|
||||
<HostDriveRow key={pd.serial || i} d={pd} onSelect={onSelect} t={t} indent />
|
||||
{d.physical_drives?.length > 0 && (
|
||||
<div style={{ marginLeft: 20, borderLeft: `2px solid ${t.cardBorder}`, paddingLeft: 0 }}>
|
||||
{d.physical_drives.map((pd, i) => (
|
||||
<div key={pd.serial || i} style={{ display: "flex", alignItems: "center", marginTop: 4 }}>
|
||||
<div style={{
|
||||
width: 16, height: 1, background: t.cardBorder, flexShrink: 0,
|
||||
}} />
|
||||
<div style={{ flex: 1 }}>
|
||||
<HostDriveRow d={pd} onSelect={onSelect} t={t} indent />
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
</React.Fragment>
|
||||
))}
|
||||
</div>
|
||||
@@ -709,7 +720,181 @@ function HostDrivesCard({ drives, onSelect, t }) {
|
||||
);
|
||||
}
|
||||
|
||||
function EnclosureHealthSummary({ health, t }) {
|
||||
if (!health) return null;
|
||||
|
||||
const statusColors = {
|
||||
CRITICAL: t.health.error,
|
||||
WARNING: t.health.warning,
|
||||
OK: t.health.healthy,
|
||||
};
|
||||
const sc = statusColors[health.overall_status] || statusColors.OK;
|
||||
|
||||
const failedPsus = health.psus.filter((p) => p.fail || p.status.toLowerCase() === "critical");
|
||||
const failedFans = health.fans.filter((f) => f.fail);
|
||||
const temps = health.temps.filter((s) => s.temperature_c != null);
|
||||
const tempMin = temps.length > 0 ? Math.min(...temps.map((s) => s.temperature_c)) : null;
|
||||
const tempMax = temps.length > 0 ? Math.max(...temps.map((s) => s.temperature_c)) : null;
|
||||
|
||||
return (
|
||||
<div style={{ display: "flex", alignItems: "center", gap: 8, flexWrap: "wrap", marginTop: 6 }}>
|
||||
{/* Overall badge */}
|
||||
<span style={{
|
||||
display: "inline-flex", alignItems: "center", gap: 5,
|
||||
padding: "2px 10px", borderRadius: 99,
|
||||
background: sc.bg, border: `1px solid ${sc.border}`,
|
||||
fontSize: 11, fontWeight: 700, color: sc.text, letterSpacing: 0.3,
|
||||
}}>
|
||||
<span style={{ width: 6, height: 6, borderRadius: "50%", background: sc.dot }} />
|
||||
{health.overall_status}
|
||||
</span>
|
||||
|
||||
{/* PSU pills */}
|
||||
{health.psus.map((psu) => {
|
||||
const bad = psu.fail || psu.status.toLowerCase() === "critical";
|
||||
const pc = bad ? t.health.error : t.health.healthy;
|
||||
return (
|
||||
<span key={psu.index} style={{
|
||||
display: "inline-flex", alignItems: "center", gap: 4,
|
||||
padding: "2px 8px", borderRadius: 99,
|
||||
background: pc.bg, border: `1px solid ${pc.border}`,
|
||||
fontSize: 10, fontWeight: 600, color: pc.text,
|
||||
}}>
|
||||
<span style={{ width: 5, height: 5, borderRadius: "50%", background: pc.dot }} />
|
||||
PSU {psu.index} {bad ? "FAIL" : "OK"}
|
||||
</span>
|
||||
);
|
||||
})}
|
||||
|
||||
{/* Fans summary */}
|
||||
{health.fans.length > 0 && (
|
||||
<span style={{
|
||||
fontSize: 11, color: failedFans.length > 0 ? t.health.error.text : t.textSecondary,
|
||||
fontWeight: 600,
|
||||
}}>
|
||||
{failedFans.length > 0
|
||||
? `${failedFans.length}/${health.fans.length} fans failed`
|
||||
: `${health.fans.length} fans OK`}
|
||||
</span>
|
||||
)}
|
||||
|
||||
{/* Temp range */}
|
||||
{tempMin != null && (
|
||||
<span style={{
|
||||
fontSize: 11, color: tempMax >= 45 ? t.health.warning.text : t.textSecondary,
|
||||
fontWeight: 600, fontFamily: "'JetBrains Mono', monospace",
|
||||
}}>
|
||||
{tempMin === tempMax ? `${tempMin}\u00B0C` : `${tempMin}\u2013${tempMax}\u00B0C`}
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function EnclosureHealthDetail({ health, t }) {
|
||||
if (!health) return null;
|
||||
|
||||
const sectionStyle = { marginBottom: 12 };
|
||||
const headerStyle = {
|
||||
fontSize: 10, fontWeight: 700, color: t.textMuted,
|
||||
textTransform: "uppercase", letterSpacing: 1, marginBottom: 6,
|
||||
};
|
||||
const rowStyle = {
|
||||
display: "flex", justifyContent: "space-between", alignItems: "center",
|
||||
padding: "4px 0", borderBottom: `1px solid ${t.divider}`, fontSize: 12,
|
||||
};
|
||||
|
||||
return (
|
||||
<div style={{
|
||||
padding: "12px 16px", background: t.surface,
|
||||
borderTop: `1px solid ${t.divider}`, borderBottom: `1px solid ${t.divider}`,
|
||||
}}>
|
||||
<div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fit, minmax(220px, 1fr))", gap: 16 }}>
|
||||
{/* PSUs */}
|
||||
{health.psus.length > 0 && (
|
||||
<div style={sectionStyle}>
|
||||
<div style={headerStyle}>Power Supplies</div>
|
||||
{health.psus.map((psu) => {
|
||||
const bad = psu.fail || psu.status.toLowerCase() === "critical";
|
||||
return (
|
||||
<div key={psu.index} style={rowStyle}>
|
||||
<span style={{ color: t.textSecondary }}>PSU {psu.index}</span>
|
||||
<span style={{
|
||||
fontWeight: 600, color: bad ? t.health.error.text : t.health.healthy.text,
|
||||
fontFamily: "'JetBrains Mono', monospace",
|
||||
}}>
|
||||
{psu.status}{psu.ac_fail ? " (AC fail)" : ""}{psu.dc_fail ? " (DC fail)" : ""}
|
||||
</span>
|
||||
</div>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Fans */}
|
||||
{health.fans.length > 0 && (
|
||||
<div style={sectionStyle}>
|
||||
<div style={headerStyle}>Fans</div>
|
||||
{health.fans.map((fan) => (
|
||||
<div key={fan.index} style={rowStyle}>
|
||||
<span style={{ color: t.textSecondary }}>Fan {fan.index}</span>
|
||||
<span style={{
|
||||
fontWeight: 600,
|
||||
color: fan.fail ? t.health.error.text : t.health.healthy.text,
|
||||
fontFamily: "'JetBrains Mono', monospace",
|
||||
}}>
|
||||
{fan.rpm != null ? `${fan.rpm} RPM` : fan.status}
|
||||
{fan.fail ? " FAIL" : ""}
|
||||
</span>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Temps */}
|
||||
{health.temps.length > 0 && (
|
||||
<div style={sectionStyle}>
|
||||
<div style={headerStyle}>Temperature Sensors</div>
|
||||
{health.temps.map((ts) => (
|
||||
<div key={ts.index} style={rowStyle}>
|
||||
<span style={{ color: t.textSecondary }}>Sensor {ts.index}</span>
|
||||
<span style={{
|
||||
fontWeight: 600,
|
||||
color: ts.temperature_c >= 45 ? t.health.warning.text : t.text,
|
||||
fontFamily: "'JetBrains Mono', monospace",
|
||||
}}>
|
||||
{ts.temperature_c != null ? `${ts.temperature_c}\u00B0C` : ts.status}
|
||||
</span>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Voltages */}
|
||||
{health.voltages.length > 0 && (
|
||||
<div style={sectionStyle}>
|
||||
<div style={headerStyle}>Voltage Rails</div>
|
||||
{health.voltages.map((vs) => (
|
||||
<div key={vs.index} style={rowStyle}>
|
||||
<span style={{ color: t.textSecondary }}>Rail {vs.index}</span>
|
||||
<span style={{
|
||||
fontWeight: 600, color: t.text,
|
||||
fontFamily: "'JetBrains Mono', monospace",
|
||||
}}>
|
||||
{vs.voltage != null ? `${vs.voltage} V` : vs.status}
|
||||
</span>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function EnclosureCard({ enclosure, view, onSelect, selectedSerial, t }) {
|
||||
const [healthExpanded, setHealthExpanded] = useState(false);
|
||||
|
||||
return (
|
||||
<div style={{
|
||||
background: t.cardBg, borderRadius: 16,
|
||||
@@ -720,7 +905,7 @@ function EnclosureCard({ enclosure, view, onSelect, selectedSerial, t }) {
|
||||
<div style={{
|
||||
padding: "16px 20px",
|
||||
borderBottom: `1px solid ${t.divider}`,
|
||||
display: "flex", alignItems: "center", justifyContent: "space-between",
|
||||
display: "flex", alignItems: "flex-start", justifyContent: "space-between",
|
||||
flexWrap: "wrap", gap: 8,
|
||||
}}>
|
||||
<div>
|
||||
@@ -730,11 +915,29 @@ function EnclosureCard({ enclosure, view, onSelect, selectedSerial, t }) {
|
||||
<div style={{ fontSize: 12, color: t.textSecondary, marginTop: 2 }}>
|
||||
{enclosure.sg_device} · {enclosure.populated_slots}/{enclosure.total_slots} slots populated
|
||||
</div>
|
||||
{enclosure.health && (
|
||||
<div style={{ display: "flex", alignItems: "center", gap: 6 }}>
|
||||
<EnclosureHealthSummary health={enclosure.health} t={t} />
|
||||
<button
|
||||
onClick={() => setHealthExpanded(!healthExpanded)}
|
||||
style={{
|
||||
background: "none", border: "none", cursor: "pointer",
|
||||
fontSize: 11, color: t.accent, fontWeight: 600,
|
||||
padding: "2px 6px", marginTop: 6,
|
||||
}}
|
||||
>
|
||||
{healthExpanded ? "Hide details" : "Details"}
|
||||
</button>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
<div style={{ fontSize: 11, color: t.textMuted, fontFamily: "'JetBrains Mono', monospace" }}>
|
||||
ID {enclosure.id}
|
||||
</div>
|
||||
</div>
|
||||
{healthExpanded && enclosure.health && (
|
||||
<EnclosureHealthDetail health={enclosure.health} t={t} />
|
||||
)}
|
||||
<div style={{ padding: 16 }}>
|
||||
{view === "grid" ? (
|
||||
<GridView enclosure={enclosure} onSelect={onSelect} t={t} />
|
||||
|
||||
111
main.py
111
main.py
@@ -1,15 +1,20 @@
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from contextlib import asynccontextmanager
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import FileResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
|
||||
from models.schemas import HealthCheck
|
||||
from routers import drives, enclosures, leds, overview
|
||||
from services.smart import sg_ses_available, smartctl_available
|
||||
from services.cache import cache_set, close_cache, init_cache, redis_available
|
||||
from services.enclosure import discover_enclosures, list_slots
|
||||
from services.smart import SMART_CACHE_TTL, _run_smartctl, sg_ses_available, smartctl_available
|
||||
from services.zfs import get_zfs_pool_map
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
@@ -17,10 +22,96 @@ logging.basicConfig(
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
SMART_POLL_INTERVAL = int(os.environ.get("SMART_POLL_INTERVAL", "90"))
|
||||
|
||||
_tool_status: dict[str, bool] = {}
|
||||
_poll_task: asyncio.Task | None = None
|
||||
|
||||
|
||||
async def _lsblk_disk_names() -> set[str]:
    """Return the names of whole-disk block devices reported by ``lsblk``."""
    proc = await asyncio.create_subprocess_exec(
        "lsblk", "-d", "-o", "NAME,TYPE", "-J",
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, _ = await proc.communicate()
    if not stdout:
        return set()
    return {
        dev["name"]
        for dev in json.loads(stdout).get("blockdevices", [])
        if dev.get("type") == "disk" and dev.get("name")
    }


async def smart_poll_loop():
    """Periodically refresh the Redis cache with SMART data for every drive.

    Each pass collects device names from the enclosure slots and from
    ``lsblk``, runs smartctl on each one, stores the result under
    ``jbod:smart:<device>`` and then asks for the ZFS pool map. The loop
    sleeps ``SMART_POLL_INTERVAL`` seconds between passes and runs until the
    task is cancelled; ordinary exceptions are logged and the loop continues.
    """
    await asyncio.sleep(2)  # let app finish starting
    while True:
        try:
            # Devices that sit in enclosure slots
            devices: set[str] = set()
            for enc in discover_enclosures():
                for slot in list_slots(enc["id"]):
                    if slot["device"]:
                        devices.add(slot["device"])

            # Host block devices; best effort, enclosure drives are still
            # polled if lsblk is missing or returns something unparsable.
            try:
                devices |= await _lsblk_disk_names()
            except Exception as e:
                logger.warning("lsblk discovery failed in poller: %s", e)

            # Poll each drive and cache the result; one bad drive must not
            # stop the others from being refreshed.
            for device in sorted(devices):
                try:
                    result = await _run_smartctl(device)
                    await cache_set(f"jbod:smart:{device}", result, SMART_CACHE_TTL)
                except Exception as e:
                    logger.warning("Poll failed for %s: %s", device, e)

            # Keep the ZFS map warm. get_zfs_pool_map() consults its own
            # cache first, so this only recomputes the map once the cached
            # copy has expired.
            await get_zfs_pool_map()

            logger.info("SMART poll complete: %d devices", len(devices))
        except Exception as e:
            # logger.exception keeps the traceback, which a background task
            # has no other way to surface.
            logger.exception("SMART poll loop error: %s", e)
        await asyncio.sleep(SMART_POLL_INTERVAL)
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: probe dependencies, start the poller, clean up.

    Startup records in ``_tool_status`` whether smartctl, sg_ses and Redis
    are available, warns about anything missing, and starts the SMART poll
    task when Redis is reachable. Shutdown cancels that task and closes the
    cache; it runs in ``finally`` so it also happens when an exception or a
    cancellation is thrown in at the ``yield``.
    """
    global _poll_task

    # Startup
    _tool_status["smartctl"] = smartctl_available()
    _tool_status["sg_ses"] = sg_ses_available()

    if not _tool_status["smartctl"]:
        logger.warning("smartctl not found — install smartmontools for SMART data")
    if not _tool_status["sg_ses"]:
        logger.warning("sg_ses not found — install sg3-utils for enclosure SES data")
    if os.geteuid() != 0:
        logger.warning("Not running as root — smartctl may fail on some devices")

    await init_cache()
    cache_ready = redis_available()
    _tool_status["redis"] = cache_ready

    # The poller only fills the cache, so it is pointless without Redis.
    if cache_ready:
        _poll_task = asyncio.create_task(smart_poll_loop())

    try:
        yield
    finally:
        # Shutdown
        try:
            if _poll_task is not None:
                _poll_task.cancel()
                try:
                    await _poll_task
                except asyncio.CancelledError:
                    pass
                _poll_task = None
        finally:
            await close_cache()
|
||||
|
||||
|
||||
app = FastAPI(
|
||||
title="JBOD Monitor",
|
||||
description="Drive health monitoring for JBOD enclosures",
|
||||
version="0.1.0",
|
||||
lifespan=lifespan,
|
||||
)
|
||||
|
||||
app.add_middleware(
|
||||
@@ -35,24 +126,10 @@ app.include_router(drives.router)
|
||||
app.include_router(leds.router)
|
||||
app.include_router(overview.router)
|
||||
|
||||
_tool_status: dict[str, bool] = {}
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def check_dependencies():
|
||||
_tool_status["smartctl"] = smartctl_available()
|
||||
_tool_status["sg_ses"] = sg_ses_available()
|
||||
|
||||
if not _tool_status["smartctl"]:
|
||||
logger.warning("smartctl not found — install smartmontools for SMART data")
|
||||
if not _tool_status["sg_ses"]:
|
||||
logger.warning("sg_ses not found — install sg3-utils for enclosure SES data")
|
||||
if os.geteuid() != 0:
|
||||
logger.warning("Not running as root — smartctl may fail on some devices")
|
||||
|
||||
|
||||
@app.get("/api/health", response_model=HealthCheck, tags=["health"])
async def health():
    """Report that the service is up, along with per-dependency availability."""
    # Redis availability is re-read on every request.
    _tool_status.update(redis=redis_available())
    return HealthCheck(status="ok", tools=_tool_status)
|
||||
|
||||
|
||||
|
||||
@@ -65,6 +65,41 @@ class SlotWithDrive(BaseModel):
|
||||
drive: DriveHealthSummary | None = None
|
||||
|
||||
|
||||
class PsuStatus(BaseModel):
    """State of one power-supply element from the enclosure's SES status."""

    index: int  # element number from the SES descriptor
    status: str  # status text as reported, e.g. "OK" or "Critical"
    fail: bool = False  # general failure flag
    ac_fail: bool = False  # AC input failure flag
    dc_fail: bool = False  # DC output failure flag
|
||||
|
||||
|
||||
class FanStatus(BaseModel):
    """State of one cooling element from the enclosure's SES status."""

    index: int  # element number from the SES descriptor
    status: str  # status text as reported
    rpm: int | None = None  # measured speed; None when not reported
    fail: bool = False  # failure flag
|
||||
|
||||
|
||||
class TempSensor(BaseModel):
    """Reading of one temperature-sensor element from the enclosure."""

    index: int  # element number from the SES descriptor
    status: str  # status text as reported
    temperature_c: float | None = None  # degrees Celsius; None when not reported
|
||||
|
||||
|
||||
class VoltageSensor(BaseModel):
    """Reading of one voltage-sensor element from the enclosure."""

    index: int  # element number from the SES descriptor
    status: str  # status text as reported
    voltage: float | None = None  # volts; None when not reported
|
||||
|
||||
|
||||
class EnclosureHealth(BaseModel):
    """Aggregated SES health for one enclosure."""

    # "OK", "WARNING" or "CRITICAL" as produced by the SES page parser.
    overall_status: str = "OK"
    psus: list[PsuStatus] = []
    fans: list[FanStatus] = []
    temps: list[TempSensor] = []
    voltages: list[VoltageSensor] = []
|
||||
|
||||
|
||||
class EnclosureWithDrives(BaseModel):
|
||||
id: str
|
||||
sg_device: str | None = None
|
||||
@@ -74,6 +109,7 @@ class EnclosureWithDrives(BaseModel):
|
||||
total_slots: int
|
||||
populated_slots: int
|
||||
slots: list[SlotWithDrive]
|
||||
health: EnclosureHealth | None = None
|
||||
|
||||
|
||||
class HostDrive(BaseModel):
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
fastapi>=0.115.0
|
||||
uvicorn>=0.34.0
|
||||
pydantic>=2.10.0
|
||||
redis>=5.0.0
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from fastapi import APIRouter, HTTPException, Response
|
||||
|
||||
from models.schemas import DriveDetail
|
||||
from services.smart import get_smart_data
|
||||
@@ -8,10 +8,10 @@ router = APIRouter(prefix="/api/drives", tags=["drives"])
|
||||
|
||||
|
||||
@router.get("/{device}", response_model=DriveDetail)
|
||||
async def get_drive_detail(device: str):
|
||||
async def get_drive_detail(device: str, response: Response):
|
||||
"""Get SMART detail for a specific block device."""
|
||||
try:
|
||||
data = await get_smart_data(device)
|
||||
data, cache_hit = await get_smart_data(device)
|
||||
except ValueError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
|
||||
@@ -25,4 +25,6 @@ async def get_drive_detail(device: str):
|
||||
data["zfs_vdev"] = zfs_info["vdev"]
|
||||
data["zfs_state"] = zfs_info.get("state")
|
||||
|
||||
response.headers["X-Cache"] = "HIT" if cache_hit else "MISS"
|
||||
|
||||
return DriveDetail(**data)
|
||||
|
||||
@@ -1,16 +1,17 @@
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
from fastapi import APIRouter
|
||||
from fastapi import APIRouter, Response
|
||||
|
||||
from models.schemas import (
|
||||
DriveHealthSummary,
|
||||
EnclosureHealth,
|
||||
EnclosureWithDrives,
|
||||
HostDrive,
|
||||
Overview,
|
||||
SlotWithDrive,
|
||||
)
|
||||
from services.enclosure import discover_enclosures, list_slots
|
||||
from services.enclosure import discover_enclosures, get_enclosure_status, list_slots
|
||||
from services.host import get_host_drives
|
||||
from services.smart import get_smart_data
|
||||
from services.zfs import get_zfs_pool_map
|
||||
@@ -21,18 +22,29 @@ router = APIRouter(prefix="/api/overview", tags=["overview"])
|
||||
|
||||
|
||||
@router.get("", response_model=Overview)
|
||||
async def get_overview():
|
||||
async def get_overview(response: Response):
|
||||
"""Aggregate view of all enclosures, slots, and drive health."""
|
||||
enclosures_raw = discover_enclosures()
|
||||
pool_map = await get_zfs_pool_map()
|
||||
|
||||
# Fetch SES health data for all enclosures concurrently
|
||||
async def _get_health(enc):
|
||||
if enc.get("sg_device"):
|
||||
return await get_enclosure_status(enc["sg_device"])
|
||||
return None
|
||||
|
||||
health_results = await asyncio.gather(
|
||||
*[_get_health(enc) for enc in enclosures_raw],
|
||||
return_exceptions=True,
|
||||
)
|
||||
|
||||
enc_results: list[EnclosureWithDrives] = []
|
||||
total_drives = 0
|
||||
warnings = 0
|
||||
errors = 0
|
||||
all_healthy = True
|
||||
|
||||
for enc in enclosures_raw:
|
||||
for enc_idx, enc in enumerate(enclosures_raw):
|
||||
slots_raw = list_slots(enc["id"])
|
||||
|
||||
# Gather SMART data for all populated slots concurrently
|
||||
@@ -41,12 +53,19 @@ async def get_overview():
|
||||
smart_results = await asyncio.gather(*smart_tasks, return_exceptions=True)
|
||||
|
||||
smart_map: dict[str, dict] = {}
|
||||
all_cache_hits = True
|
||||
any_lookups = False
|
||||
for (slot_info, dev), result in zip(populated, smart_results):
|
||||
if isinstance(result, Exception):
|
||||
logger.warning("SMART query failed for %s: %s", dev, result)
|
||||
smart_map[dev] = {"device": dev, "smart_supported": False}
|
||||
all_cache_hits = False
|
||||
else:
|
||||
smart_map[dev] = result
|
||||
data, hit = result
|
||||
smart_map[dev] = data
|
||||
any_lookups = True
|
||||
if not hit:
|
||||
all_cache_hits = False
|
||||
|
||||
slots_out: list[SlotWithDrive] = []
|
||||
for s in slots_raw:
|
||||
@@ -110,6 +129,20 @@ async def get_overview():
|
||||
drive=drive_summary,
|
||||
))
|
||||
|
||||
# Attach enclosure health from SES
|
||||
health_data = health_results[enc_idx]
|
||||
enc_health = None
|
||||
if isinstance(health_data, dict):
|
||||
enc_health = EnclosureHealth(**health_data)
|
||||
# Count enclosure-level issues
|
||||
if enc_health.overall_status == "CRITICAL":
|
||||
errors += 1
|
||||
all_healthy = False
|
||||
elif enc_health.overall_status == "WARNING":
|
||||
warnings += 1
|
||||
elif isinstance(health_data, Exception):
|
||||
logger.warning("SES health failed for %s: %s", enc["id"], health_data)
|
||||
|
||||
enc_results.append(EnclosureWithDrives(
|
||||
id=enc["id"],
|
||||
sg_device=enc.get("sg_device"),
|
||||
@@ -119,6 +152,7 @@ async def get_overview():
|
||||
total_slots=enc["total_slots"],
|
||||
populated_slots=enc["populated_slots"],
|
||||
slots=slots_out,
|
||||
health=enc_health,
|
||||
))
|
||||
|
||||
# Host drives (non-enclosure)
|
||||
@@ -143,6 +177,8 @@ async def get_overview():
|
||||
warnings += 1
|
||||
host_drives_out.append(HostDrive(**hd))
|
||||
|
||||
response.headers["X-Cache"] = "HIT" if (any_lookups and all_cache_hits) else "MISS"
|
||||
|
||||
return Overview(
|
||||
healthy=all_healthy and errors == 0,
|
||||
drive_count=total_drives,
|
||||
|
||||
@@ -1,29 +1,62 @@
|
||||
import time
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
import redis.asyncio as redis
|
||||
|
||||
class TTLCache:
|
||||
"""Simple in-memory TTL cache."""
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def __init__(self, ttl_seconds: int = 60):
|
||||
self._ttl = ttl_seconds
|
||||
self._store: dict[str, tuple[float, Any]] = {}
|
||||
_redis: redis.Redis | None = None
|
||||
|
||||
def get(self, key: str) -> Any | None:
|
||||
entry = self._store.get(key)
|
||||
if entry is None:
|
||||
|
||||
async def init_cache() -> None:
    """Connect to Redis using ``REDIS_HOST``, ``REDIS_PORT`` and ``REDIS_DB``.

    The module-level client is only published after a successful ``PING``.
    Any failure (unreachable server, malformed port or db value) is logged
    and leaves the service running without a cache; nothing is raised.
    """
    global _redis
    client: redis.Redis | None = None
    try:
        host = os.environ.get("REDIS_HOST", "localhost")
        port = int(os.environ.get("REDIS_PORT", "6379"))
        db = int(os.environ.get("REDIS_DB", "0"))
        client = redis.Redis(host=host, port=port, db=db, decode_responses=True)
        await client.ping()
    except Exception as e:
        logger.warning("Redis connection failed: %s — running without cache", e)
        _redis = None
        # Release the half-open client instead of leaving it to the GC.
        if client is not None:
            try:
                await client.aclose()
            except Exception as close_err:
                logger.debug("Closing failed Redis client raised: %s", close_err)
        return

    _redis = client
    logger.info("Redis connected at %s:%d/%d", host, port, db)
|
||||
|
||||
|
||||
async def close_cache() -> None:
    """Close the Redis connection, if one is open.

    The module-level reference is cleared before closing, so
    ``redis_available()`` reports False even if ``aclose()`` raises.
    """
    global _redis
    client, _redis = _redis, None
    if client is not None:
        await client.aclose()
|
||||
|
||||
|
||||
def redis_available() -> bool:
    """Return whether a Redis client is currently configured.

    This only checks that the module-level client exists; it does not
    contact the server, so a connection lost after startup is not detected.
    """
    return _redis is not None
|
||||
|
||||
|
||||
async def cache_get(key: str) -> Any | None:
|
||||
"""GET key from Redis, return deserialized value or None on miss/error."""
|
||||
if _redis is None:
|
||||
return None
|
||||
ts, value = entry
|
||||
if time.monotonic() - ts > self._ttl:
|
||||
del self._store[key]
|
||||
try:
|
||||
raw = await _redis.get(key)
|
||||
if raw is None:
|
||||
return None
|
||||
return json.loads(raw)
|
||||
except Exception as e:
|
||||
logger.warning("Redis GET %s failed: %s", key, e)
|
||||
return None
|
||||
return value
|
||||
|
||||
def set(self, key: str, value: Any) -> None:
|
||||
self._store[key] = (time.monotonic(), value)
|
||||
|
||||
def clear(self) -> None:
|
||||
self._store.clear()
|
||||
|
||||
|
||||
smart_cache = TTLCache(ttl_seconds=60)
|
||||
async def cache_set(key: str, value: Any, ttl: int = 120) -> None:
    """Store *value* as JSON under *key* with an expiry of *ttl* seconds.

    Does nothing when no Redis client is configured. Serialization and
    Redis errors are logged as warnings and never raised.
    """
    if _redis is not None:
        try:
            await _redis.set(key, json.dumps(value), ex=ttl)
        except Exception as exc:
            logger.warning("Redis SET %s failed: %s", key, exc)
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import os
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -136,3 +138,121 @@ def _parse_slot_number(entry: Path) -> int | None:
|
||||
except ValueError:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
async def get_enclosure_status(sg_device: str) -> dict | None:
    """Run ``sg_ses --page=0x02`` on *sg_device* and parse the health data.

    Returns the dict built by ``_parse_ses_page02``, or ``None`` when sg_ses
    is not installed, exits non-zero, or anything else goes wrong. Failures
    are logged as warnings and never raised.
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            "sg_ses", "--page=0x02", sg_device,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await proc.communicate()
        if proc.returncode != 0:
            # Decode leniently, like stdout below: a stray non-UTF-8 byte in
            # the error output must not hide the real sg_ses failure.
            logger.warning(
                "sg_ses failed for %s: %s",
                sg_device,
                stderr.decode(errors="replace").strip(),
            )
            return None
        return _parse_ses_page02(stdout.decode(errors="replace"))
    except FileNotFoundError:
        logger.warning("sg_ses not found")
        return None
    except Exception as e:
        logger.warning("sg_ses error for %s: %s", sg_device, e)
        return None
|
||||
|
||||
|
||||
def _parse_ses_page02(text: str) -> dict:
|
||||
"""Parse sg_ses --page=0x02 text output into structured health data."""
|
||||
result = {
|
||||
"overall_status": "OK",
|
||||
"psus": [],
|
||||
"fans": [],
|
||||
"temps": [],
|
||||
"voltages": [],
|
||||
}
|
||||
|
||||
# Parse header line for overall status:
|
||||
# INVOP=0, INFO=0, NON-CRIT=0, CRIT=1, UNRECOV=0
|
||||
header_match = re.search(
|
||||
r"INVOP=\d+,\s*INFO=\d+,\s*NON-CRIT=(\d+),\s*CRIT=(\d+),\s*UNRECOV=(\d+)",
|
||||
text,
|
||||
)
|
||||
if header_match:
|
||||
non_crit = int(header_match.group(1))
|
||||
crit = int(header_match.group(2))
|
||||
unrecov = int(header_match.group(3))
|
||||
if crit > 0 or unrecov > 0:
|
||||
result["overall_status"] = "CRITICAL"
|
||||
elif non_crit > 0:
|
||||
result["overall_status"] = "WARNING"
|
||||
|
||||
# Split into element type sections.
|
||||
# Each section starts with "Element type: <type>"
|
||||
sections = re.split(r"(?=\s*Element type:)", text)
|
||||
|
||||
for section in sections:
|
||||
type_match = re.match(r"\s*Element type:\s*(.+)", section)
|
||||
if not type_match:
|
||||
continue
|
||||
element_type = type_match.group(1).strip().rstrip(",").lower()
|
||||
|
||||
# Find individual element blocks (skip "Overall descriptor")
|
||||
elements = re.split(r"(?=\s*Element \d+ descriptor:)", section)
|
||||
|
||||
for elem_text in elements:
|
||||
desc_match = re.match(r"\s*Element (\d+) descriptor:", elem_text)
|
||||
if not desc_match:
|
||||
continue
|
||||
idx = int(desc_match.group(1))
|
||||
|
||||
# Extract status line
|
||||
status_match = re.search(r"status:\s*(.+?)(?:,|\n|$)", elem_text, re.IGNORECASE)
|
||||
status = status_match.group(1).strip() if status_match else "Unknown"
|
||||
|
||||
if status.lower() == "not installed":
|
||||
continue
|
||||
|
||||
if "power supply" in element_type:
|
||||
fail = "Fail=1" in elem_text
|
||||
ac_fail = "AC fail=1" in elem_text
|
||||
dc_fail = "DC fail=1" in elem_text
|
||||
result["psus"].append({
|
||||
"index": idx,
|
||||
"status": status,
|
||||
"fail": fail,
|
||||
"ac_fail": ac_fail,
|
||||
"dc_fail": dc_fail,
|
||||
})
|
||||
|
||||
elif "cooling" in element_type or "fan" in element_type:
|
||||
fail = "Fail=1" in elem_text
|
||||
rpm_match = re.search(r"Actual speed[=:]\s*(\d+)\s*rpm", elem_text, re.IGNORECASE)
|
||||
rpm = int(rpm_match.group(1)) if rpm_match else None
|
||||
result["fans"].append({
|
||||
"index": idx,
|
||||
"status": status,
|
||||
"rpm": rpm,
|
||||
"fail": fail,
|
||||
})
|
||||
|
||||
elif "temperature" in element_type:
|
||||
temp_match = re.search(r"Temperature=\s*([\d.]+)\s*C", elem_text)
|
||||
temp = float(temp_match.group(1)) if temp_match else None
|
||||
result["temps"].append({
|
||||
"index": idx,
|
||||
"status": status,
|
||||
"temperature_c": temp,
|
||||
})
|
||||
|
||||
elif "voltage" in element_type:
|
||||
volt_match = re.search(r"Voltage:\s*([\d.]+)\s*V", elem_text, re.IGNORECASE)
|
||||
if not volt_match:
|
||||
volt_match = re.search(r"([\d.]+)\s*V", elem_text)
|
||||
voltage = float(volt_match.group(1)) if volt_match else None
|
||||
result["voltages"].append({
|
||||
"index": idx,
|
||||
"status": status,
|
||||
"voltage": voltage,
|
||||
})
|
||||
|
||||
return result
|
||||
|
||||
@@ -65,12 +65,14 @@ async def get_host_drives() -> list[dict]:
|
||||
smart_results = await asyncio.gather(*smart_tasks, return_exceptions=True)
|
||||
|
||||
results: list[dict] = []
|
||||
for dev_info, smart in zip(host_devices, smart_results):
|
||||
for dev_info, smart_result in zip(host_devices, smart_results):
|
||||
name = dev_info["name"]
|
||||
|
||||
if isinstance(smart, Exception):
|
||||
logger.warning("SMART query failed for host drive %s: %s", name, smart)
|
||||
if isinstance(smart_result, Exception):
|
||||
logger.warning("SMART query failed for host drive %s: %s", name, smart_result)
|
||||
smart = {"device": name, "smart_supported": False}
|
||||
else:
|
||||
smart, _ = smart_result
|
||||
|
||||
# Compute health_status (same logic as overview.py)
|
||||
healthy = smart.get("smart_healthy")
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
|
||||
from services.cache import smart_cache
|
||||
from services.cache import cache_get, cache_set
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -17,6 +18,8 @@ ATTR_PENDING = 197
|
||||
ATTR_UNCORRECTABLE = 198
|
||||
ATTR_WEAR_LEVELING = 177 # SSD wear leveling
|
||||
|
||||
SMART_CACHE_TTL = int(os.environ.get("SMART_CACHE_TTL", "120"))
|
||||
|
||||
|
||||
def smartctl_available() -> bool:
|
||||
return shutil.which("smartctl") is not None
|
||||
@@ -26,19 +29,22 @@ def sg_ses_available() -> bool:
|
||||
return shutil.which("sg_ses") is not None
|
||||
|
||||
|
||||
async def get_smart_data(device: str) -> dict:
|
||||
"""Run smartctl -a -j against a device, with caching."""
|
||||
async def get_smart_data(device: str) -> tuple[dict, bool]:
|
||||
"""Run smartctl -a -j against a device, with caching.
|
||||
|
||||
Returns (data, cache_hit) tuple.
|
||||
"""
|
||||
# Sanitize device name: only allow alphanumeric and hyphens
|
||||
if not re.match(r"^[a-zA-Z0-9\-]+$", device):
|
||||
raise ValueError(f"Invalid device name: {device}")
|
||||
|
||||
cached = smart_cache.get(device)
|
||||
cached = await cache_get(f"jbod:smart:{device}")
|
||||
if cached is not None:
|
||||
return cached
|
||||
return (cached, True)
|
||||
|
||||
result = await _run_smartctl(device)
|
||||
smart_cache.set(device, result)
|
||||
return result
|
||||
await cache_set(f"jbod:smart:{device}", result, SMART_CACHE_TTL)
|
||||
return (result, False)
|
||||
|
||||
|
||||
async def _run_smartctl(device: str) -> dict:
|
||||
|
||||
@@ -4,11 +4,15 @@ import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
from services.cache import cache_get, cache_set
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Allow overriding the zpool binary path via env (for bind-mounted host tools)
|
||||
ZPOOL_BIN = os.environ.get("ZPOOL_BIN", "zpool")
|
||||
|
||||
ZFS_CACHE_TTL = 300
|
||||
|
||||
|
||||
async def get_zfs_pool_map() -> dict[str, dict]:
|
||||
"""Return a dict mapping device names to ZFS pool and vdev info.
|
||||
@@ -16,6 +20,10 @@ async def get_zfs_pool_map() -> dict[str, dict]:
|
||||
e.g. {"sda": {"pool": "tank", "vdev": "raidz2-0"},
|
||||
"sdb": {"pool": "fast", "vdev": "mirror-0"}}
|
||||
"""
|
||||
cached = await cache_get("jbod:zfs_map")
|
||||
if cached is not None:
|
||||
return cached
|
||||
|
||||
pool_map = {}
|
||||
try:
|
||||
# When running in a container with pid:host, use nsenter to run
|
||||
@@ -94,6 +102,8 @@ async def get_zfs_pool_map() -> dict[str, dict]:
|
||||
pass
|
||||
except FileNotFoundError:
|
||||
logger.debug("zpool not available")
|
||||
|
||||
await cache_set("jbod:zfs_map", pool_map, ZFS_CACHE_TTL)
|
||||
return pool_map
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user