tenseleyflow/sway / 8b59c25

Scaffold serve package and add [serve] extra

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA      8b59c25b86808e305269b93cb7e64b89e1365fd8
Parents  c1d5a63
Tree     6b52381

3 changed files

Status  File                             +    -
M       pyproject.toml                   12   0
A       src/dlm_sway/serve/__init__.py   28   0
A       src/dlm_sway/serve/cache.py      205  0
pyproject.toml  (modified)

@@ -91,6 +91,16 @@ api = [
 pytest = [
     "pytest>=8.0",
 ]
+# Long-running daemon mode (S36). FastAPI + uvicorn give us a warm-
+# backend HTTP API that turns iterative ``sway run`` calls from
+# 15-second cold-loads into 2-second warm dispatches. uvicorn[standard]
+# pulls httptools + uvloop for production-quality serving on
+# Linux/macOS.
+serve = [
+    "fastapi>=0.110",
+    "uvicorn[standard]>=0.30",
+    "httpx>=0.27",
+]
 # Visualization (P9 + S12 HTML report).
 viz = [
     "matplotlib>=3.8",
@@ -113,6 +123,8 @@ all = [
     "pytest>=8.0",
     "matplotlib>=3.8",
     "plotly>=5.20",
+    "fastapi>=0.110",
+    "uvicorn[standard]>=0.30",
 ]

 [project.scripts]
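
Users who want the daemon then opt in explicitly, e.g. with ``pip install -e ".[serve]"`` from a checkout (the published distribution name isn't shown in this diff, so the editable form is the safe spelling); one-shot ``sway run`` installs stay lean.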
src/dlm_sway/serve/__init__.py  (added)

@@ -0,0 +1,28 @@
+"""``sway serve`` daemon: warm-backend HTTP API for iterative workflows.
+
+Loading the HF backend takes 15s cold (model + adapter weights, KV cache
+allocation, deterministic-mode setup). For interactive flows — notebook
+exploration, the S34 ``sway watch`` loop, the S29 live HTML report —
+that 15s startup is the dominant cost on every run.
+
+This package exposes ``sway serve`` as a long-running daemon that loads
+the backend once and serves a small HTTP API. First call: ~15s cold.
+Every subsequent call against the same model: ~2s warm. That is a
+five-to-ten-x DX win for users who iterate.
+
+The package is gated behind the ``[serve]`` extra (FastAPI + uvicorn)
+so users who only run one-shot ``sway run`` invocations don't pull in
+the daemon dependencies.
+
+Public surface:
+
+- :class:`dlm_sway.serve.client.ServeClient` — Python SDK for
+  notebooks; one-liner ``ServeClient(url).run(spec)``.
+- :func:`dlm_sway.serve.app.create_app` — FastAPI app factory used by
+  the CLI's uvicorn launcher and unit tests' ``TestClient``.
+- :class:`dlm_sway.serve.cache.BackendCache` — LRU backend cache the
+  app uses to keep multiple loaded models warm; capped via the
+  ``--max-loaded-models`` CLI flag.
+"""
+
+from __future__ import annotations
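
A minimal sketch of the intended notebook flow, assuming the client and app modules land as described in the docstring above (``ServeClient`` is not part of this scaffold commit, and the URL, port, and ``ModelSpec`` constructor fields shown are illustrative):

    from dlm_sway.core.model import ModelSpec
    from dlm_sway.serve.client import ServeClient  # planned module; not in this commit

    # Field names mirror the identity tuple in cache.py's cache_key_for();
    # the values are placeholders, not a real model.
    spec = ModelSpec(kind="hf", base="my-org/my-model", dtype="bfloat16", device="cuda")

    client = ServeClient("http://127.0.0.1:8000")
    result = client.run(spec)  # first call: ~15s cold load
    result = client.run(spec)  # every call after: ~2s warm dispatch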
src/dlm_sway/serve/cache.py  (added)

@@ -0,0 +1,205 @@
+"""LRU cache for warm differential backends.
+
+The point of the daemon is keeping backends loaded across requests.
+This module owns the cache: keyed by an immutable identity tuple over
+the ModelSpec fields that determine backend identity, capped at a
+configurable size, and evicting the least-recently-used entry on
+overflow with a proper ``close()`` so weights actually get freed.
+
+The cache is **process-local** — there's no on-disk component. Restart
+the daemon and the cache resets cold. That's intentional: warm-backend
+caching is a memory tradeoff, and persisting weights to disk would
+duplicate what HuggingFace's own cache already does at the file level.
+"""
+
+from __future__ import annotations
+
+import logging
+import threading
+from collections import OrderedDict
+from collections.abc import Hashable
+from dataclasses import dataclass
+from pathlib import Path
+
+from dlm_sway.core.errors import SwayError
+from dlm_sway.core.model import ModelSpec
+from dlm_sway.core.scoring import DifferentialBackend
+
+_LOG = logging.getLogger(__name__)
+
+
+def cache_key_for(spec: ModelSpec) -> tuple[Hashable, ...]:
+    """Identity tuple for a ModelSpec.
+
+    Two ModelSpecs that differ only in fields that don't affect the
+    loaded backend (e.g. ``trust_remote_code`` on the same already-
+    cached model) hash to the same key. We're conservative — any field
+    that touches model identity (``base``, ``adapter``, ``dtype``,
+    ``device``, ``kind``) goes into the key. Path normalization
+    happens upstream in ModelSpec's validator.
+    """
+    return (
+        spec.kind,
+        spec.base,
+        str(spec.adapter) if spec.adapter is not None else None,
+        spec.dtype,
+        spec.device,
+    )
+
+
+@dataclass(frozen=True, slots=True)
+class CachedBackend:
+    """One entry in the cache. Frozen — fields don't mutate after load.
+
+    ``key`` is the identity tuple; ``backend`` is the live object;
+    ``model_spec`` is kept for introspection (``GET /health`` lists
+    the loaded models so users can see what's warm).
+    """
+
+    key: tuple[Hashable, ...]
+    backend: DifferentialBackend
+    model_spec: ModelSpec
+    load_seconds: float
+
+
+class BackendCache:
+    """LRU cache of differential backends.
+
+    Thread-safe via a single internal lock. The cache contract:
+
+    1. ``get_or_load(spec)`` returns a backend; on miss, builds it
+       (paying the load cost) and admits to the cache.
+    2. On overflow (admitting would exceed ``max_size``), evict the
+       LRU entry, calling ``backend.close()`` if the backend implements
+       it; otherwise drop the reference and let GC handle it.
+    3. ``get_or_load`` is single-flight per key — concurrent requests
+       for the same model wait on the loader thread instead of
+       building the backend twice.
+    """
+
+    def __init__(self, max_size: int = 2) -> None:
+        if max_size < 1:
+            raise ValueError(f"max_size must be >= 1; got {max_size}")
+        self._max = int(max_size)
+        self._entries: OrderedDict[tuple[Hashable, ...], CachedBackend] = OrderedDict()
+        self._lock = threading.RLock()
+        # Per-key load locks so concurrent requests for the same model
+        # serialize at the loader instead of building twice.
+        self._key_locks: dict[tuple[Hashable, ...], threading.Lock] = {}
+
+    @property
+    def max_size(self) -> int:
+        return self._max
+
+    def loaded_keys(self) -> list[tuple[Hashable, ...]]:
+        """Snapshot of currently-cached keys, MRU last. Used by /health."""
+        with self._lock:
+            return list(self._entries.keys())
+
+    def loaded_specs(self) -> list[ModelSpec]:
+        """Snapshot of currently-cached model specs, MRU last."""
+        with self._lock:
+            return [entry.model_spec for entry in self._entries.values()]
+
+    def get_or_load(self, spec: ModelSpec, *, adapter_path: Path | None = None) -> CachedBackend:
+        """Return a cached backend for ``spec`` or build + admit one.
+
+        ``adapter_path`` overrides ``spec.adapter`` for the build call —
+        mirrors the upstream :func:`dlm_sway.backends.build` contract
+        so callers handing in a separately-resolved adapter (e.g. via
+        the dlm bridge) don't have to construct a copy of the spec.
+        Cache key uses ``spec.adapter``, NOT the override; if you want
+        a different adapter to cache distinctly, pass a spec that
+        encodes it.
+        """
+        key = cache_key_for(spec)
+
+        # Fast path — spec already cached.
+        with self._lock:
+            entry = self._entries.get(key)
+            if entry is not None:
+                # Touch LRU position.
+                self._entries.move_to_end(key)
+                return entry
+            key_lock = self._key_locks.setdefault(key, threading.Lock())
+
+        # Slow path — single-flight load.
+        with key_lock:
+            with self._lock:
+                # Recheck after acquiring the load lock — another thread
+                # may have completed the load while we waited.
+                entry = self._entries.get(key)
+                if entry is not None:
+                    self._entries.move_to_end(key)
+                    return entry
+
+            entry = _build_entry(spec, key=key, adapter_path=adapter_path)
+
+            with self._lock:
+                # Evict to fit before admitting; ensures the cache never
+                # exceeds max_size, even transiently, during admission.
+                while len(self._entries) >= self._max:
+                    self._evict_lru_locked()
+                self._entries[key] = entry
+                self._entries.move_to_end(key)
+            return entry
+
+    def evict_all(self) -> None:
+        """Close every backend. Called on daemon shutdown."""
+        with self._lock:
+            keys = list(self._entries.keys())
+            for k in keys:
+                self._evict_locked(k)
+
+    # -- internals -----------------------------------------------------
+
+    def _evict_lru_locked(self) -> None:
+        # Caller holds self._lock. ``OrderedDict.__iter__`` yields
+        # insertion order; LRU is the first key.
+        if not self._entries:
+            return
+        lru_key = next(iter(self._entries))
+        self._evict_locked(lru_key)
+
+    def _evict_locked(self, key: tuple[Hashable, ...]) -> None:
+        # Caller holds self._lock.
+        entry = self._entries.pop(key, None)
+        if entry is None:
+            return
+        # Backends carry a ``close()`` when they own GPU memory or
+        # network connections (HF, MLX, API). Dummy doesn't —
+        # so don't require it. Failure during close is logged and
+        # swallowed: a daemon stays up even if one backend's close()
+        # raises.
+        close = getattr(entry.backend, "close", None)
+        if callable(close):
+            try:
+                close()
+            except Exception as exc:  # noqa: BLE001
+                _LOG.warning("backend close raised on eviction: %s", exc)
+        self._key_locks.pop(key, None)
+
+
+def _build_entry(
+    spec: ModelSpec,
+    *,
+    key: tuple[Hashable, ...],
+    adapter_path: Path | None,
+) -> CachedBackend:
+    """Materialize a backend from a spec, timing the load."""
+    import time
+
+    from dlm_sway.backends import build as build_backend
+
+    started = time.monotonic()
+    try:
+        backend = build_backend(spec, adapter_path=adapter_path)
+    except SwayError:
+        raise
+    except Exception as exc:  # noqa: BLE001 — surface load failures as SwayError
+        raise SwayError(
+            f"backend load failed for kind={spec.kind} base={spec.base!r}: "
+            f"{type(exc).__name__}: {exc}"
+        ) from exc
+    elapsed = time.monotonic() - started
+    return CachedBackend(key=key, backend=backend, model_spec=spec, load_seconds=elapsed)
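
A minimal usage sketch of the cache contract above, assuming this module as committed (the ``ModelSpec`` constructor fields are illustrative, and an ``hf`` load is the expensive path the cache exists to amortize):

    from dlm_sway.core.model import ModelSpec
    from dlm_sway.serve.cache import BackendCache, cache_key_for

    cache = BackendCache(max_size=2)

    # Placeholder specs; real fields depend on ModelSpec's definition.
    spec_a = ModelSpec(kind="hf", base="org/model-a", dtype="bfloat16", device="cuda")
    spec_b = ModelSpec(kind="hf", base="org/model-b", dtype="bfloat16", device="cuda")

    cache_key_for(spec_a)  # ("hf", "org/model-a", None, "bfloat16", "cuda")

    entry = cache.get_or_load(spec_a)  # miss: builds the backend, ~15s
    entry = cache.get_or_load(spec_a)  # hit: same CachedBackend, no rebuild
    cache.get_or_load(spec_b)          # fills the second slot

    # A third distinct spec would evict the LRU entry (spec_a here),
    # calling its backend.close() if the backend implements one.
    cache.evict_all()                  # daemon shutdown: close everything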