# jbod-monitor/routers/overview.py

import asyncio
import logging

from fastapi import APIRouter

from models.schemas import (
    DriveHealthSummary,
    EnclosureHealth,
    EnclosureWithDrives,
    HostDrive,
    Overview,
    SlotWithDrive,
)
from services.enclosure import discover_enclosures, get_enclosure_status, list_slots
from services.host import get_host_drives
from services.smart import get_smart_data
from services.zfs import get_zfs_pool_map

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Router for the overview endpoint(s); every route declared on it is served
# under the "/api/overview" prefix and tagged "overview".
router = APIRouter(prefix="/api/overview", tags=["overview"])


# SMART counters whose value being > 0 marks a drive as needing attention.
_SMART_COUNTER_KEYS = (
    "reallocated_sectors",
    "pending_sectors",
    "uncorrectable_errors",
)


def _assess_drive(sd: dict) -> tuple[str, int, int]:
    """Classify one SMART record.

    Args:
        sd: SMART data dict for a single drive (read via ``.get`` only).

    Returns:
        ``(health_status, error_increment, warning_increment)`` where
        ``health_status`` is ``"error"``, ``"warning"`` or ``"healthy"``.
    """
    healthy = sd.get("smart_healthy")
    # A missing verdict is only suspicious when the drive is expected to
    # report SMART at all (``smart_supported`` defaults to True).
    verdict_unknown = healthy is None and sd.get("smart_supported", True)
    # Missing / None counters are treated as 0.
    flagged = sum(1 for key in _SMART_COUNTER_KEYS if (sd.get(key) or 0) > 0)

    error_inc = 1 if healthy is False else 0
    # NOTE(review): each non-zero counter adds its own warning, so a single
    # enclosure drive can contribute several warnings (and warnings on top of
    # an error), whereas a host drive contributes at most one. Preserved
    # as-is; confirm whether warning_count is meant to be per-drive.
    warning_inc = flagged + (1 if verdict_unknown else 0)

    if healthy is False:
        status = "error"
    elif flagged or verdict_unknown:
        status = "warning"
    else:
        status = "healthy"
    return status, error_inc, warning_inc


@router.get("", response_model=Overview)
async def get_overview() -> Overview:
    """Aggregate view of all enclosures, slots, and drive health.

    For every discovered enclosure this collects SES health (when the
    enclosure has an ``sg_device``), SMART data for each populated slot with
    a device, and ZFS membership from the pool map. Host drives (and any
    physical drives listed behind them) are appended afterwards.

    SES and SMART queries are best-effort: a failed query is logged as a
    warning and the drive/enclosure is still reported, with the drive marked
    ``smart_supported=False``.

    Returns:
        Overview with total drive count, warning/error counts, per-enclosure
        slot details and host drives. ``healthy`` is True only when no error
        was counted.
    """
    enclosures_raw = discover_enclosures()
    pool_map = await get_zfs_pool_map()

    # Fetch SES health data for all enclosures concurrently
    async def _get_health(enc):
        if enc.get("sg_device"):
            return await get_enclosure_status(enc["sg_device"])
        return None

    health_results = await asyncio.gather(
        *(_get_health(enc) for enc in enclosures_raw),
        return_exceptions=True,
    )

    enc_results: list[EnclosureWithDrives] = []
    total_drives = 0
    warnings = 0
    errors = 0

    # gather() preserves input order, so results pair up with enclosures.
    for enc, health_data in zip(enclosures_raw, health_results):
        slots_raw = list_slots(enc["id"])

        # Gather SMART data for all populated slots concurrently
        populated_devs = [
            s["device"] for s in slots_raw if s["populated"] and s["device"]
        ]
        smart_results = await asyncio.gather(
            *(get_smart_data(dev) for dev in populated_devs),
            return_exceptions=True,
        )

        smart_map: dict[str, dict] = {}
        for dev, result in zip(populated_devs, smart_results):
            # BaseException (not Exception): with return_exceptions=True a
            # cancelled child shows up as asyncio.CancelledError in the
            # result list, and that is a BaseException since Python 3.8.
            if isinstance(result, BaseException):
                logger.warning("SMART query failed for %s: %s", dev, result)
                smart_map[dev] = {"device": dev, "smart_supported": False}
            else:
                smart_map[dev] = result

        slots_out: list[SlotWithDrive] = []
        for s in slots_raw:
            drive_summary = None
            slot_dev = s["device"]
            if slot_dev and slot_dev in smart_map:
                sd = smart_map[slot_dev]
                total_drives += 1

                health_status, error_inc, warning_inc = _assess_drive(sd)
                errors += error_inc
                warnings += warning_inc

                # One lookup instead of three; drives outside any pool get
                # None for all ZFS fields.
                zfs_info = pool_map.get(sd["device"], {})

                drive_summary = DriveHealthSummary(
                    device=sd["device"],
                    model=sd.get("model"),
                    serial=sd.get("serial"),
                    wwn=sd.get("wwn"),
                    firmware=sd.get("firmware"),
                    capacity_bytes=sd.get("capacity_bytes"),
                    smart_healthy=sd.get("smart_healthy"),
                    smart_supported=sd.get("smart_supported", True),
                    temperature_c=sd.get("temperature_c"),
                    power_on_hours=sd.get("power_on_hours"),
                    reallocated_sectors=sd.get("reallocated_sectors"),
                    pending_sectors=sd.get("pending_sectors"),
                    uncorrectable_errors=sd.get("uncorrectable_errors"),
                    zfs_pool=zfs_info.get("pool"),
                    zfs_vdev=zfs_info.get("vdev"),
                    zfs_state=zfs_info.get("state"),
                    health_status=health_status,
                )
            elif s["populated"]:
                # Populated slot without a resolved device still counts as a
                # drive, but has no SMART summary.
                total_drives += 1

            slots_out.append(SlotWithDrive(
                slot=s["slot"],
                populated=s["populated"],
                device=s["device"],
                drive=drive_summary,
            ))

        # Attach enclosure health from SES
        enc_health = None
        if isinstance(health_data, dict):
            enc_health = EnclosureHealth(**health_data)
            # Count enclosure-level issues
            if enc_health.overall_status == "CRITICAL":
                errors += 1
            elif enc_health.overall_status == "WARNING":
                warnings += 1
        elif isinstance(health_data, BaseException):
            logger.warning("SES health failed for %s: %s", enc["id"], health_data)

        enc_results.append(EnclosureWithDrives(
            id=enc["id"],
            sg_device=enc.get("sg_device"),
            vendor=enc["vendor"],
            model=enc["model"],
            revision=enc["revision"],
            total_slots=enc["total_slots"],
            populated_slots=enc["populated_slots"],
            slots=slots_out,
            health=enc_health,
        ))

    # Host drives (non-enclosure)
    host_drives_raw = await get_host_drives()
    host_drives_out: list[HostDrive] = []
    for hd in host_drives_raw:
        # Count the host drive itself plus any physical drives behind a RAID
        # controller; each contributes at most one error or one warning.
        for entry in (hd, *hd.get("physical_drives", [])):
            total_drives += 1
            entry_status = entry.get("health_status", "healthy")
            if entry_status == "error":
                errors += 1
            elif entry_status == "warning":
                warnings += 1
        host_drives_out.append(HostDrive(**hd))

    return Overview(
        # Every unhealthy condition above increments `errors`, so a zero
        # error count is the whole health verdict.
        healthy=errors == 0,
        drive_count=total_drives,
        warning_count=warnings,
        error_count=errors,
        enclosures=enc_results,
        host_drives=host_drives_out,
    )