# jbod-monitor/services/smart.py

import asyncio
import json
import logging
import os
import re
import shutil

from services.cache import cache_get, cache_set

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# SMART attribute IDs of interest.
# These are compared against the "id" field of the entries in smartctl's
# "ata_smart_attributes" table when summarising a drive.
ATTR_REALLOCATED = 5
ATTR_POWER_ON_HOURS = 9  # fallback when the report has no power_on_time.hours
ATTR_TEMPERATURE = 194  # first fallback when the report has no temperature.current
ATTR_TEMPERATURE_ALT = 190  # tried only when attribute 194 yields nothing either
ATTR_PENDING = 197
ATTR_UNCORRECTABLE = 198
ATTR_WEAR_LEVELING = 177  # SSD wear leveling

# TTL handed to cache_set() for cached SMART results. Overridable through the
# SMART_CACHE_TTL environment variable (default "120"); int() raises ValueError
# at import time if that variable is not an integer literal.
# NOTE(review): the unit is presumably seconds — confirm against services.cache.
SMART_CACHE_TTL = int(os.environ.get("SMART_CACHE_TTL", "120"))


def smartctl_available() -> bool:
    """Report whether a ``smartctl`` executable can be located on PATH."""
    located = shutil.which("smartctl")
    return located is not None


def sg_ses_available() -> bool:
    """Report whether an ``sg_ses`` executable can be located on PATH."""
    located = shutil.which("sg_ses")
    return located is not None


async def get_smart_data(device: str) -> tuple[dict, bool]:
    """Return parsed SMART data for a device, served from the cache when possible.

    On a cache miss smartctl is run against ``/dev/<device>`` and whatever it
    yields (including an in-band error dict) is stored in the cache under
    ``jbod:smart:<device>`` with ``SMART_CACHE_TTL``.

    Args:
        device: Bare device name below ``/dev`` (for example ``sda``). Only
            ASCII letters, digits and hyphens are accepted.

    Returns:
        A ``(data, cache_hit)`` tuple; ``cache_hit`` is True when ``data`` came
        from the cache and False when smartctl was just executed.

    Raises:
        ValueError: If ``device`` is empty or contains any other character.
    """
    # Sanitize device name: only allow alphanumeric and hyphens.
    # fullmatch is used instead of match + "$" because "$" also matches just
    # before a trailing newline, which would let a name like "sda\n" through
    # and into both the /dev path and the cache key.
    if not re.fullmatch(r"[a-zA-Z0-9\-]+", device):
        raise ValueError(f"Invalid device name: {device}")

    cache_key = f"jbod:smart:{device}"

    cached = await cache_get(cache_key)
    if cached is not None:
        return (cached, True)

    result = await _run_smartctl(device)
    await cache_set(cache_key, result, SMART_CACHE_TTL)
    return (result, False)


async def _run_smartctl(device: str) -> dict:
    """Run ``smartctl -a -j /dev/<device>`` and return the parsed summary.

    Failures are reported in-band instead of being raised: when smartctl
    cannot be started, prints nothing, or prints something that is not a JSON
    object, the return value is ``{"error": <message>, "smart_supported": False}``.

    Args:
        device: Bare device name. It is appended to ``/dev/`` without any
            validation here, so the caller must have validated it already.

    Returns:
        The dict produced by ``_parse_smart_json`` on success, otherwise the
        error dict described above.
    """
    dev_path = f"/dev/{device}"

    # NOTE(review): there is no timeout on the subprocess; a drive that never
    # answers would block this coroutine for as long as smartctl itself waits.
    try:
        proc = await asyncio.create_subprocess_exec(
            "smartctl", "-a", "-j", dev_path,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, _ = await proc.communicate()
    except FileNotFoundError:
        return {"error": "smartctl not found", "smart_supported": False}
    except OSError as exc:
        # Any other spawn failure (e.g. PermissionError on a non-executable
        # binary) is reported the same in-band way rather than propagating.
        return {"error": f"Failed to execute smartctl: {exc}", "smart_supported": False}

    # smartctl returns non-zero for many non-fatal reasons (bit flags).
    # The JSON output is still valid even with non-zero exit codes.
    if not stdout:
        return {
            "error": f"smartctl produced no output (exit code {proc.returncode})",
            "smart_supported": False,
        }

    try:
        data = json.loads(stdout)
    except ValueError:
        # ValueError covers json.JSONDecodeError and also the
        # UnicodeDecodeError raised when the bytes cannot be decoded at all.
        return {"error": "Failed to parse smartctl JSON output", "smart_supported": False}

    if not isinstance(data, dict):
        # Valid JSON but not an object (e.g. a bare list or number); the
        # parser below relies on dict access.
        return {"error": "Unexpected smartctl JSON output (not an object)", "smart_supported": False}

    return _parse_smart_json(device, data)


def _parse_smart_json(device: str, data: dict) -> dict:
    """Flatten a smartctl JSON report into the summary dict used by this service.

    Args:
        device: Label stored under the ``device`` key of the result.
        data: Decoded top-level JSON object printed by ``smartctl -j``.

    Returns:
        A dict with the keys ``device``, ``smart_supported``, ``model``,
        ``serial``, ``firmware``, ``capacity_bytes``, ``smart_healthy``,
        ``temperature_c``, ``power_on_hours``, ``smart_attributes``,
        ``reallocated_sectors``, ``pending_sectors``, ``uncorrectable_errors``
        and ``wear_leveling_percent``, plus ``wwn`` when one can be derived.
        Values the report does not provide are None.
    """

    def _section(key: str) -> dict:
        # A missing, null or non-object section is treated as empty so that a
        # single malformed field cannot abort parsing of the whole report.
        value = data.get(key)
        return value if isinstance(value, dict) else {}

    # Without a "smart_support" object the drive is reported as unsupported;
    # with one, "available" defaults to True when the key is absent.
    smart_support = data.get("smart_support")
    is_supported = smart_support.get("available", True) if isinstance(smart_support, dict) else False
    result: dict = {"device": device, "smart_supported": is_supported}

    # Identity — ATA vs SCSI field names
    result["model"] = data.get("model_name") or data.get("scsi_model_name")
    result["serial"] = data.get("serial_number")
    result["firmware"] = data.get("firmware_version") or data.get("scsi_revision")
    result["capacity_bytes"] = _section("user_capacity").get("bytes")

    # WWN — ATA uses structured wwn{}, SCSI uses logical_unit_id string
    wwn_data = _section("wwn")
    if wwn_data:
        naa = wwn_data.get("naa", 0)
        oui = wwn_data.get("oui", 0)
        wwn_id = wwn_data.get("id", 0)
        # Concatenate as hex: NAA, then OUI padded to 6 digits, then the
        # vendor id padded to 9 digits.
        result["wwn"] = f"{naa:x}{oui:06x}{wwn_id:09x}"
    elif data.get("logical_unit_id"):
        result["wwn"] = data["logical_unit_id"]

    # SMART health
    result["smart_healthy"] = _section("smart_status").get("passed")

    # Temperature
    result["temperature_c"] = _section("temperature").get("current")

    # Power-on hours
    result["power_on_hours"] = _section("power_on_time").get("hours")

    # SMART attributes (ATA)
    attrs = _section("ata_smart_attributes").get("table") or []
    result["smart_attributes"] = attrs

    result["reallocated_sectors"] = _get_attr_raw(attrs, ATTR_REALLOCATED)
    result["pending_sectors"] = _get_attr_raw(attrs, ATTR_PENDING)
    result["uncorrectable_errors"] = _get_attr_raw(attrs, ATTR_UNCORRECTABLE)
    result["wear_leveling_percent"] = _get_attr_value(attrs, ATTR_WEAR_LEVELING)

    # Power-on hours fallback from attributes
    if result["power_on_hours"] is None:
        result["power_on_hours"] = _get_attr_raw(attrs, ATTR_POWER_ON_HOURS)

    # Temperature fallback from attributes
    if result["temperature_c"] is None:
        result["temperature_c"] = _get_attr_raw(attrs, ATTR_TEMPERATURE)
    if result["temperature_c"] is None:
        result["temperature_c"] = _get_attr_raw(attrs, ATTR_TEMPERATURE_ALT)

    # NVMe attributes (different structure)
    nvme_health = _section("nvme_smart_health_information_log")
    if nvme_health:
        # The NVMe log replaces whatever the ATA table put in smart_attributes.
        result["smart_attributes"] = [{"nvme_health": nvme_health}]
        if result["temperature_c"] is None:
            result["temperature_c"] = nvme_health.get("temperature")
        if result["power_on_hours"] is None:
            result["power_on_hours"] = nvme_health.get("power_on_hours")
        if result["wear_leveling_percent"] is None:
            pct_used = nvme_health.get("percentage_used")
            if pct_used is not None:
                # percentage_used may legitimately exceed 100 once a drive is
                # past its rated endurance; clamp so "remaining" never goes
                # negative.
                result["wear_leveling_percent"] = max(0, 100 - pct_used)

    # SAS/SCSI drives use different error counters
    scsi_errors = data.get("scsi_error_counter_log")
    if scsi_errors and not attrs:
        result["smart_attributes"] = [{"scsi_error_log": scsi_errors}]

    return result


async def scan_megaraid_drives() -> list[dict]:
    """Discover physical drives behind MegaRAID controllers via smartctl --scan.

    Runs ``smartctl --scan -j``, keeps the entries whose ``type`` contains
    "megaraid", and then queries every such entry concurrently with
    ``smartctl -a -j -d <type> <name>``.

    Returns:
        One parsed SMART dict per drive that produced usable output, each
        extended with ``megaraid_id``, ``megaraid_type`` and
        ``megaraid_device``. The list is empty when the scan itself fails or
        reports no MegaRAID entries. Drives whose query fails are logged and
        left out of the result.
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            "smartctl", "--scan", "-j",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, _ = await proc.communicate()
        scan_data = json.loads(stdout)
    except (OSError, ValueError) as e:
        # OSError covers a missing or non-executable binary; ValueError covers
        # empty, undecodable or malformed JSON output.
        logger.warning("smartctl --scan failed: %s", e)
        return []

    devices = scan_data.get("devices") if isinstance(scan_data, dict) else None
    # Keep only well-formed entries: _query needs both a string "type" and a
    # "name"; anything else could not be queried anyway.
    megaraid_entries = [
        d for d in devices or []
        if isinstance(d, dict)
        and isinstance(d.get("type"), str)
        and "megaraid" in d["type"]
        and d.get("name")
    ]

    if not megaraid_entries:
        return []

    # Query SMART for each physical drive concurrently
    async def _query(entry: dict) -> dict | None:
        dev_path = entry["name"]
        dev_type = entry["type"]
        try:
            proc = await asyncio.create_subprocess_exec(
                "smartctl", "-a", "-j", "-d", dev_type, dev_path,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            stdout, _ = await proc.communicate()
        except OSError as exc:
            logger.warning("smartctl could not be run for %s (%s): %s", dev_path, dev_type, exc)
            return None

        if not stdout:
            logger.warning("smartctl produced no output for %s (%s)", dev_path, dev_type)
            return None

        try:
            data = json.loads(stdout)
        except ValueError as exc:
            logger.warning("Unparseable smartctl output for %s (%s): %s", dev_path, dev_type, exc)
            return None
        if not isinstance(data, dict):
            logger.warning("Unexpected smartctl output for %s (%s)", dev_path, dev_type)
            return None

        # Extract the disk number from type like "sat+megaraid,0".
        # rpartition yields the text after the last "megaraid," and the whole
        # string when that marker is absent.
        megaraid_id = dev_type.rpartition("megaraid,")[2]

        result = _parse_smart_json(f"megaraid:{megaraid_id}", data)
        result["megaraid_id"] = megaraid_id
        result["megaraid_type"] = dev_type
        result["megaraid_device"] = dev_path
        return result

    outcomes = await asyncio.gather(
        *(_query(e) for e in megaraid_entries),
        return_exceptions=True,
    )

    drives: list[dict] = []
    for entry, outcome in zip(megaraid_entries, outcomes):
        if isinstance(outcome, BaseException):
            # return_exceptions=True hands failures back as values; log them
            # instead of dropping them silently, but keep the other drives.
            logger.warning(
                "SMART query for %s (%s) raised: %r",
                entry.get("name"), entry.get("type"), outcome,
            )
        elif isinstance(outcome, dict):
            drives.append(outcome)
    return drives


def _get_attr_raw(attrs: list[dict], attr_id: int) -> int | None:
    """Get the raw_value for a SMART attribute by ID."""
    for attr in attrs:
        if attr.get("id") == attr_id:
            raw = attr.get("raw", {})
            return raw.get("value")
    return None


def _get_attr_value(attrs: list[dict], attr_id: int) -> int | None:
    """Get the normalized value for a SMART attribute by ID."""
    for attr in attrs:
        if attr.get("id") == attr_id:
            return attr.get("value")
    return None