Replace in-memory TTL cache with Redis

This commit is contained in:
2026-03-07 18:45:15 +00:00
parent 0112875894
commit b11c1bdf98
10 changed files with 238 additions and 54 deletions

25
build.sh Executable file
View File

@@ -0,0 +1,25 @@
#!/usr/bin/env bash
# Build and push the jbod-monitor container image, tagged with the short git SHA.
set -euo pipefail

IMAGE="docker.adamksmith.xyz/jbod-monitor"

# Always operate from the repository root (the directory containing this script).
cd "$(dirname "$0")"

# Stage everything; commit only if the index actually changed.
# The commit message comes from $1, falling back to a default.
git add -A
if ! git diff --cached --quiet; then
    git commit -m "${1:-Build and push image}"
else
    echo "No changes to commit, using HEAD"
fi

SHA="$(git rev-parse --short HEAD)"
TAG="${IMAGE}:${SHA}"

echo "Building ${TAG}"
docker build -t "${TAG}" -t "${IMAGE}:latest" .

echo "Pushing ${TAG}"
docker push "${TAG}"
docker push "${IMAGE}:latest"

echo "Done: ${TAG}"

View File

@@ -15,3 +15,22 @@ services:
- TZ=America/Denver - TZ=America/Denver
- UVICORN_LOG_LEVEL=info - UVICORN_LOG_LEVEL=info
- ZFS_USE_NSENTER=true - ZFS_USE_NSENTER=true
- REDIS_HOST=127.0.0.1
- REDIS_PORT=6379
- REDIS_DB=0
- SMART_CACHE_TTL=120
- SMART_POLL_INTERVAL=90
depends_on:
- redis
redis:
image: redis:7-alpine
container_name: jbod-redis
restart: unless-stopped
network_mode: host
volumes:
- redis-data:/data
command: redis-server --save 60 1 --loglevel warning
volumes:
redis-data:

111
main.py
View File

@@ -1,15 +1,20 @@
import asyncio
import json
import logging import logging
import os import os
from contextlib import asynccontextmanager
from pathlib import Path from pathlib import Path
from fastapi import FastAPI from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
from models.schemas import HealthCheck from models.schemas import HealthCheck
from routers import drives, enclosures, leds, overview from routers import drives, enclosures, leds, overview
from services.smart import sg_ses_available, smartctl_available from services.cache import cache_set, close_cache, init_cache, redis_available
from services.enclosure import discover_enclosures, list_slots
from services.smart import SMART_CACHE_TTL, _run_smartctl, sg_ses_available, smartctl_available
from services.zfs import get_zfs_pool_map
logging.basicConfig( logging.basicConfig(
level=logging.INFO, level=logging.INFO,
@@ -17,10 +22,96 @@ logging.basicConfig(
) )
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
SMART_POLL_INTERVAL = int(os.environ.get("SMART_POLL_INTERVAL", "90"))
_tool_status: dict[str, bool] = {}
_poll_task: asyncio.Task | None = None
async def smart_poll_loop():
    """Pre-warm Redis with SMART data for all drives.

    Runs forever as a background task: discovers devices (enclosure slots
    plus host disks from lsblk), runs smartctl on each, and writes results
    to Redis under "jbod:smart:<device>" so API requests hit the cache.
    Sleeps SMART_POLL_INTERVAL seconds between rounds.
    """
    await asyncio.sleep(2)  # let app finish starting
    while True:
        try:
            # Discover all enclosure devices
            enclosures_raw = discover_enclosures()
            devices: set[str] = set()
            for enc in enclosures_raw:
                for slot in list_slots(enc["id"]):
                    if slot["device"]:
                        devices.add(slot["device"])
            # Discover host block devices via lsblk (-J = JSON output,
            # -d = no partitions, only whole disks)
            try:
                proc = await asyncio.create_subprocess_exec(
                    "lsblk", "-d", "-o", "NAME,TYPE", "-J",
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE,
                )
                stdout, _ = await proc.communicate()
                if stdout:
                    for dev in json.loads(stdout).get("blockdevices", []):
                        if dev.get("type") == "disk":
                            name = dev.get("name", "")
                            if name and name not in devices:
                                devices.add(name)
            except Exception as e:
                # Best-effort: enclosure devices are still polled if lsblk fails.
                logger.warning("lsblk discovery failed in poller: %s", e)
            # Poll each drive and cache the result; a failing drive must not
            # abort the rest of the round.
            for device in sorted(devices):
                try:
                    result = await _run_smartctl(device)
                    await cache_set(f"jbod:smart:{device}", result, SMART_CACHE_TTL)
                except Exception as e:
                    logger.warning("Poll failed for %s: %s", device, e)
            # Pre-warm the ZFS pool map. NOTE(review): get_zfs_pool_map()
            # consults its own Redis cache first (services/zfs.py), so this
            # call does NOT bypass the cache — it refreshes the map only
            # once the ZFS cache TTL has expired.
            await get_zfs_pool_map()
            logger.info("SMART poll complete: %d devices", len(devices))
        except Exception as e:
            # Broad catch keeps the background task alive across any failure.
            logger.error("SMART poll loop error: %s", e)
        await asyncio.sleep(SMART_POLL_INTERVAL)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan: verify tool availability, connect Redis, run poller.

    On startup, records smartctl/sg_ses/redis availability in _tool_status
    (served by /api/health) and launches the SMART poll loop when Redis is
    reachable. On shutdown, cancels the poller and closes the Redis client.
    """
    global _poll_task
    # Startup
    _tool_status["smartctl"] = smartctl_available()
    _tool_status["sg_ses"] = sg_ses_available()
    if not _tool_status["smartctl"]:
        logger.warning("smartctl not found — install smartmontools for SMART data")
    if not _tool_status["sg_ses"]:
        logger.warning("sg_ses not found — install sg3-utils for enclosure SES data")
    if os.geteuid() != 0:
        logger.warning("Not running as root — smartctl may fail on some devices")
    await init_cache()
    # Record availability once and reuse it (was queried twice).
    _tool_status["redis"] = redis_available()
    if _tool_status["redis"]:
        # Background pre-warm only makes sense when results can be cached.
        _poll_task = asyncio.create_task(smart_poll_loop())
    yield
    # Shutdown: cancel the poller and wait for it to unwind cleanly.
    if _poll_task is not None:
        _poll_task.cancel()
        try:
            await _poll_task
        except asyncio.CancelledError:
            pass
    await close_cache()
app = FastAPI( app = FastAPI(
title="JBOD Monitor", title="JBOD Monitor",
description="Drive health monitoring for JBOD enclosures", description="Drive health monitoring for JBOD enclosures",
version="0.1.0", version="0.1.0",
lifespan=lifespan,
) )
app.add_middleware( app.add_middleware(
@@ -35,24 +126,10 @@ app.include_router(drives.router)
app.include_router(leds.router) app.include_router(leds.router)
app.include_router(overview.router) app.include_router(overview.router)
_tool_status: dict[str, bool] = {}
@app.on_event("startup")
async def check_dependencies():
_tool_status["smartctl"] = smartctl_available()
_tool_status["sg_ses"] = sg_ses_available()
if not _tool_status["smartctl"]:
logger.warning("smartctl not found — install smartmontools for SMART data")
if not _tool_status["sg_ses"]:
logger.warning("sg_ses not found — install sg3-utils for enclosure SES data")
if os.geteuid() != 0:
logger.warning("Not running as root — smartctl may fail on some devices")
@app.get("/api/health", response_model=HealthCheck, tags=["health"]) @app.get("/api/health", response_model=HealthCheck, tags=["health"])
async def health(): async def health():
_tool_status["redis"] = redis_available()
return HealthCheck(status="ok", tools=_tool_status) return HealthCheck(status="ok", tools=_tool_status)

View File

@@ -1,3 +1,4 @@
fastapi>=0.115.0 fastapi>=0.115.0
uvicorn>=0.34.0 uvicorn>=0.34.0
pydantic>=2.10.0 pydantic>=2.10.0
redis>=5.0.0

View File

@@ -1,4 +1,4 @@
from fastapi import APIRouter, HTTPException from fastapi import APIRouter, HTTPException, Response
from models.schemas import DriveDetail from models.schemas import DriveDetail
from services.smart import get_smart_data from services.smart import get_smart_data
@@ -8,10 +8,10 @@ router = APIRouter(prefix="/api/drives", tags=["drives"])
@router.get("/{device}", response_model=DriveDetail) @router.get("/{device}", response_model=DriveDetail)
async def get_drive_detail(device: str): async def get_drive_detail(device: str, response: Response):
"""Get SMART detail for a specific block device.""" """Get SMART detail for a specific block device."""
try: try:
data = await get_smart_data(device) data, cache_hit = await get_smart_data(device)
except ValueError as e: except ValueError as e:
raise HTTPException(status_code=400, detail=str(e)) raise HTTPException(status_code=400, detail=str(e))
@@ -25,4 +25,6 @@ async def get_drive_detail(device: str):
data["zfs_vdev"] = zfs_info["vdev"] data["zfs_vdev"] = zfs_info["vdev"]
data["zfs_state"] = zfs_info.get("state") data["zfs_state"] = zfs_info.get("state")
response.headers["X-Cache"] = "HIT" if cache_hit else "MISS"
return DriveDetail(**data) return DriveDetail(**data)

View File

@@ -1,7 +1,7 @@
import asyncio import asyncio
import logging import logging
from fastapi import APIRouter from fastapi import APIRouter, Response
from models.schemas import ( from models.schemas import (
DriveHealthSummary, DriveHealthSummary,
@@ -22,7 +22,7 @@ router = APIRouter(prefix="/api/overview", tags=["overview"])
@router.get("", response_model=Overview) @router.get("", response_model=Overview)
async def get_overview(): async def get_overview(response: Response):
"""Aggregate view of all enclosures, slots, and drive health.""" """Aggregate view of all enclosures, slots, and drive health."""
enclosures_raw = discover_enclosures() enclosures_raw = discover_enclosures()
pool_map = await get_zfs_pool_map() pool_map = await get_zfs_pool_map()
@@ -53,12 +53,19 @@ async def get_overview():
smart_results = await asyncio.gather(*smart_tasks, return_exceptions=True) smart_results = await asyncio.gather(*smart_tasks, return_exceptions=True)
smart_map: dict[str, dict] = {} smart_map: dict[str, dict] = {}
all_cache_hits = True
any_lookups = False
for (slot_info, dev), result in zip(populated, smart_results): for (slot_info, dev), result in zip(populated, smart_results):
if isinstance(result, Exception): if isinstance(result, Exception):
logger.warning("SMART query failed for %s: %s", dev, result) logger.warning("SMART query failed for %s: %s", dev, result)
smart_map[dev] = {"device": dev, "smart_supported": False} smart_map[dev] = {"device": dev, "smart_supported": False}
all_cache_hits = False
else: else:
smart_map[dev] = result data, hit = result
smart_map[dev] = data
any_lookups = True
if not hit:
all_cache_hits = False
slots_out: list[SlotWithDrive] = [] slots_out: list[SlotWithDrive] = []
for s in slots_raw: for s in slots_raw:
@@ -170,6 +177,8 @@ async def get_overview():
warnings += 1 warnings += 1
host_drives_out.append(HostDrive(**hd)) host_drives_out.append(HostDrive(**hd))
response.headers["X-Cache"] = "HIT" if (any_lookups and all_cache_hits) else "MISS"
return Overview( return Overview(
healthy=all_healthy and errors == 0, healthy=all_healthy and errors == 0,
drive_count=total_drives, drive_count=total_drives,

View File

@@ -1,29 +1,62 @@
import json
import logging
import os
from typing import Any

import redis.asyncio as redis

logger = logging.getLogger(__name__)

# Module-level client; None means "Redis unavailable, run without caching".
_redis: redis.Redis | None = None


async def init_cache() -> None:
    """Create Redis connection from environment variables.

    Reads REDIS_HOST / REDIS_PORT / REDIS_DB. On any connection failure the
    app keeps running with caching disabled (_redis stays None).
    """
    global _redis
    host = os.environ.get("REDIS_HOST", "localhost")
    port = int(os.environ.get("REDIS_PORT", "6379"))
    db = int(os.environ.get("REDIS_DB", "0"))
    try:
        # decode_responses=True so GET returns str, ready for json.loads.
        _redis = redis.Redis(host=host, port=port, db=db, decode_responses=True)
        await _redis.ping()
        logger.info("Redis connected at %s:%d/%d", host, port, db)
    except Exception as e:
        logger.warning("Redis connection failed: %s — running without cache", e)
        _redis = None


async def close_cache() -> None:
    """Close Redis connection."""
    global _redis
    if _redis is not None:
        await _redis.aclose()
        _redis = None


def redis_available() -> bool:
    """Return whether Redis connection is live.

    NOTE(review): only reflects whether init_cache() succeeded; a connection
    that dies later is not detected here — confirm this is acceptable for
    the /api/health report.
    """
    return _redis is not None


async def cache_get(key: str) -> Any | None:
    """GET key from Redis, return deserialized value or None on miss/error."""
    if _redis is None:
        return None
    try:
        raw = await _redis.get(key)
        if raw is None:
            return None
        return json.loads(raw)
    except Exception as e:
        # Cache errors are never fatal — fall through to a cache miss.
        logger.warning("Redis GET %s failed: %s", key, e)
        return None


async def cache_set(key: str, value: Any, ttl: int = 120) -> None:
    """SET key in Redis with expiry; errors are logged, never raised."""
    if _redis is None:
        return
    try:
        await _redis.set(key, json.dumps(value), ex=ttl)
    except Exception as e:
        logger.warning("Redis SET %s failed: %s", key, e)

View File

@@ -65,12 +65,14 @@ async def get_host_drives() -> list[dict]:
smart_results = await asyncio.gather(*smart_tasks, return_exceptions=True) smart_results = await asyncio.gather(*smart_tasks, return_exceptions=True)
results: list[dict] = [] results: list[dict] = []
for dev_info, smart in zip(host_devices, smart_results): for dev_info, smart_result in zip(host_devices, smart_results):
name = dev_info["name"] name = dev_info["name"]
if isinstance(smart, Exception): if isinstance(smart_result, Exception):
logger.warning("SMART query failed for host drive %s: %s", name, smart) logger.warning("SMART query failed for host drive %s: %s", name, smart_result)
smart = {"device": name, "smart_supported": False} smart = {"device": name, "smart_supported": False}
else:
smart, _ = smart_result
# Compute health_status (same logic as overview.py) # Compute health_status (same logic as overview.py)
healthy = smart.get("smart_healthy") healthy = smart.get("smart_healthy")

View File

@@ -1,10 +1,11 @@
import asyncio import asyncio
import json import json
import logging import logging
import os
import re import re
import shutil import shutil
from services.cache import smart_cache from services.cache import cache_get, cache_set
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -17,6 +18,8 @@ ATTR_PENDING = 197
ATTR_UNCORRECTABLE = 198 ATTR_UNCORRECTABLE = 198
ATTR_WEAR_LEVELING = 177 # SSD wear leveling ATTR_WEAR_LEVELING = 177 # SSD wear leveling
SMART_CACHE_TTL = int(os.environ.get("SMART_CACHE_TTL", "120"))
def smartctl_available() -> bool: def smartctl_available() -> bool:
return shutil.which("smartctl") is not None return shutil.which("smartctl") is not None
@@ -26,19 +29,22 @@ def sg_ses_available() -> bool:
return shutil.which("sg_ses") is not None return shutil.which("sg_ses") is not None
async def get_smart_data(device: str) -> tuple[dict, bool]:
    """Run smartctl -a -j against a device, with Redis caching.

    Returns (data, cache_hit) — cache_hit is True when the result came from
    Redis rather than a fresh smartctl invocation.

    Raises:
        ValueError: if the device name fails validation.
    """
    # Sanitize device name: only allow alphanumeric and hyphens.
    # fullmatch instead of match(...$): re's "$" also matches just before a
    # trailing newline, which would let a name like "sda\n" through.
    if not re.fullmatch(r"[a-zA-Z0-9-]+", device):
        raise ValueError(f"Invalid device name: {device}")

    cached = await cache_get(f"jbod:smart:{device}")
    if cached is not None:
        return (cached, True)

    result = await _run_smartctl(device)
    await cache_set(f"jbod:smart:{device}", result, SMART_CACHE_TTL)
    return (result, False)
async def _run_smartctl(device: str) -> dict: async def _run_smartctl(device: str) -> dict:

View File

@@ -4,11 +4,15 @@ import logging
import re import re
from pathlib import Path from pathlib import Path
from services.cache import cache_get, cache_set
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Allow overriding the zpool binary path via env (for bind-mounted host tools) # Allow overriding the zpool binary path via env (for bind-mounted host tools)
ZPOOL_BIN = os.environ.get("ZPOOL_BIN", "zpool") ZPOOL_BIN = os.environ.get("ZPOOL_BIN", "zpool")
ZFS_CACHE_TTL = 300
async def get_zfs_pool_map() -> dict[str, dict]: async def get_zfs_pool_map() -> dict[str, dict]:
"""Return a dict mapping device names to ZFS pool and vdev info. """Return a dict mapping device names to ZFS pool and vdev info.
@@ -16,6 +20,10 @@ async def get_zfs_pool_map() -> dict[str, dict]:
e.g. {"sda": {"pool": "tank", "vdev": "raidz2-0"}, e.g. {"sda": {"pool": "tank", "vdev": "raidz2-0"},
"sdb": {"pool": "fast", "vdev": "mirror-0"}} "sdb": {"pool": "fast", "vdev": "mirror-0"}}
""" """
cached = await cache_get("jbod:zfs_map")
if cached is not None:
return cached
pool_map = {} pool_map = {}
try: try:
# When running in a container with pid:host, use nsenter to run # When running in a container with pid:host, use nsenter to run
@@ -94,6 +102,8 @@ async def get_zfs_pool_map() -> dict[str, dict]:
pass pass
except FileNotFoundError: except FileNotFoundError:
logger.debug("zpool not available") logger.debug("zpool not available")
await cache_set("jbod:zfs_map", pool_map, ZFS_CACHE_TTL)
return pool_map return pool_map