190 lines
6.9 KiB
Python
190 lines
6.9 KiB
Python
import asyncio
|
|
import logging
|
|
|
|
from fastapi import APIRouter, Response
|
|
|
|
from models.schemas import (
|
|
DriveHealthSummary,
|
|
EnclosureHealth,
|
|
EnclosureWithDrives,
|
|
HostDrive,
|
|
Overview,
|
|
SlotWithDrive,
|
|
)
|
|
from services.enclosure import discover_enclosures, get_enclosure_status, list_slots
|
|
from services.host import get_host_drives
|
|
from services.smart import get_smart_data
|
|
from services.zfs import get_zfs_pool_map
|
|
|
|
# Module-level logger, namespaced to this module per the logging convention.
logger = logging.getLogger(__name__)

# All routes in this module are mounted under /api/overview and grouped
# under the "overview" tag in the generated OpenAPI docs.
router = APIRouter(prefix="/api/overview", tags=["overview"])
|
|
|
|
|
|
def _classify_smart(sd: dict) -> tuple[str, int, int]:
    """Classify one drive's SMART data.

    Args:
        sd: Raw SMART dict as produced by ``get_smart_data`` (or the
            ``{"device": ..., "smart_supported": False}`` placeholder used
            when the SMART query failed).

    Returns:
        ``(health_status, warning_delta, error_delta)`` where
        ``health_status`` is one of ``"error"``/``"warning"``/``"healthy"``
        and the deltas are the amounts to add to the overview counters.

    NOTE: counting intentionally differs from ``health_status`` — each
    concerning attribute (reallocated / pending / uncorrectable) adds its
    own warning, while ``health_status`` collapses them into a single
    "warning" level for the frontend.
    """
    healthy = sd.get("smart_healthy")
    supported = sd.get("smart_supported", True)
    # Missing attributes count as 0 (treated as "not concerning").
    realloc = sd.get("reallocated_sectors") or 0
    pending = sd.get("pending_sectors") or 0
    unc = sd.get("uncorrectable_errors") or 0

    warning_delta = 0
    error_delta = 0
    if healthy is False:
        error_delta += 1
    elif healthy is None and supported:
        # SMART is supported but reported no verdict — flag as a warning.
        warning_delta += 1
    warning_delta += sum(1 for v in (realloc, pending, unc) if v > 0)

    if healthy is False:
        status = "error"
    elif realloc > 0 or pending > 0 or unc > 0 or (healthy is None and supported):
        status = "warning"
    else:
        status = "healthy"
    return status, warning_delta, error_delta


def _tally_status(health_status: str) -> tuple[int, int]:
    """Map a precomputed ``health_status`` string to (warning_delta, error_delta)."""
    if health_status == "error":
        return 0, 1
    if health_status == "warning":
        return 1, 0
    return 0, 0


def _make_drive_summary(sd: dict, pool_map: dict, health_status: str) -> DriveHealthSummary:
    """Build the per-drive summary model from raw SMART data and the ZFS pool map."""
    zfs = pool_map.get(sd["device"], {})
    return DriveHealthSummary(
        device=sd["device"],
        model=sd.get("model"),
        serial=sd.get("serial"),
        wwn=sd.get("wwn"),
        firmware=sd.get("firmware"),
        capacity_bytes=sd.get("capacity_bytes"),
        smart_healthy=sd.get("smart_healthy"),
        smart_supported=sd.get("smart_supported", True),
        temperature_c=sd.get("temperature_c"),
        power_on_hours=sd.get("power_on_hours"),
        reallocated_sectors=sd.get("reallocated_sectors"),
        pending_sectors=sd.get("pending_sectors"),
        uncorrectable_errors=sd.get("uncorrectable_errors"),
        zfs_pool=zfs.get("pool"),
        zfs_vdev=zfs.get("vdev"),
        zfs_state=zfs.get("state"),
        health_status=health_status,
    )


@router.get("", response_model=Overview)
async def get_overview(response: Response):
    """Aggregate view of all enclosures, slots, and drive health.

    Fans out SES enclosure-status and per-drive SMART queries concurrently,
    folds the results into warning/error counters, and returns the combined
    ``Overview`` model. Sets an ``X-Cache`` response header: ``HIT`` only
    when at least one SMART lookup happened and every lookup was served
    from cache.
    """
    enclosures_raw = discover_enclosures()
    pool_map = await get_zfs_pool_map()

    # Fetch SES health data for all enclosures concurrently; exceptions are
    # captured per-enclosure so one bad enclosure doesn't fail the endpoint.
    async def _get_health(enc):
        if enc.get("sg_device"):
            return await get_enclosure_status(enc["sg_device"])
        return None

    health_results = await asyncio.gather(
        *[_get_health(enc) for enc in enclosures_raw],
        return_exceptions=True,
    )

    enc_results: list[EnclosureWithDrives] = []
    total_drives = 0
    warnings = 0
    errors = 0
    all_healthy = True
    all_cache_hits = True
    any_lookups = False

    for enc_idx, enc in enumerate(enclosures_raw):
        slots_raw = list_slots(enc["id"])

        # Gather SMART data for all populated slots concurrently.
        populated = [(s, s["device"]) for s in slots_raw if s["populated"] and s["device"]]
        smart_results = await asyncio.gather(
            *(get_smart_data(dev) for _, dev in populated),
            return_exceptions=True,
        )

        smart_map: dict[str, dict] = {}
        for (_slot, dev), result in zip(populated, smart_results):
            if isinstance(result, Exception):
                logger.warning("SMART query failed for %s: %s", dev, result)
                # Placeholder so the slot still renders; a failed query also
                # disqualifies the response from being a cache HIT.
                smart_map[dev] = {"device": dev, "smart_supported": False}
                all_cache_hits = False
            else:
                data, hit = result
                smart_map[dev] = data
                any_lookups = True
                if not hit:
                    all_cache_hits = False

        slots_out: list[SlotWithDrive] = []
        for s in slots_raw:
            drive_summary = None
            if s["device"] and s["device"] in smart_map:
                sd = smart_map[s["device"]]
                total_drives += 1
                health_status, warn_d, err_d = _classify_smart(sd)
                warnings += warn_d
                errors += err_d
                if err_d:
                    all_healthy = False
                drive_summary = _make_drive_summary(sd, pool_map, health_status)
            elif s["populated"]:
                # Populated slot with no usable block device still counts as a drive.
                total_drives += 1

            slots_out.append(SlotWithDrive(
                slot=s["slot"],
                populated=s["populated"],
                device=s["device"],
                drive=drive_summary,
            ))

        # Attach enclosure health from SES and fold in enclosure-level issues.
        health_data = health_results[enc_idx]
        enc_health = None
        if isinstance(health_data, dict):
            enc_health = EnclosureHealth(**health_data)
            if enc_health.overall_status == "CRITICAL":
                errors += 1
                all_healthy = False
            elif enc_health.overall_status == "WARNING":
                warnings += 1
        elif isinstance(health_data, Exception):
            logger.warning("SES health failed for %s: %s", enc["id"], health_data)

        enc_results.append(EnclosureWithDrives(
            id=enc["id"],
            sg_device=enc.get("sg_device"),
            vendor=enc["vendor"],
            model=enc["model"],
            revision=enc["revision"],
            total_slots=enc["total_slots"],
            populated_slots=enc["populated_slots"],
            slots=slots_out,
            health=enc_health,
        ))

    # Host drives (non-enclosure), including physical drives behind RAID
    # controllers — each carries a precomputed health_status.
    host_drives_raw = await get_host_drives()
    host_drives_out: list[HostDrive] = []
    for hd in host_drives_raw:
        total_drives += 1
        warn_d, err_d = _tally_status(hd.get("health_status", "healthy"))
        warnings += warn_d
        errors += err_d
        if err_d:
            all_healthy = False
        for pd in hd.get("physical_drives", []):
            total_drives += 1
            warn_d, err_d = _tally_status(pd.get("health_status", "healthy"))
            warnings += warn_d
            errors += err_d
            if err_d:
                all_healthy = False
        host_drives_out.append(HostDrive(**hd))

    # HIT only if at least one SMART lookup occurred and none missed the cache.
    response.headers["X-Cache"] = "HIT" if (any_lookups and all_cache_hits) else "MISS"

    return Overview(
        healthy=all_healthy and errors == 0,
        drive_count=total_drives,
        warning_count=warnings,
        error_count=errors,
        enclosures=enc_results,
        host_drives=host_drives_out,
    )
|