`905b890`

probes/null_adapter: on-disk cache keyed by backend identity + calibration params

Authored by

espadonne 3 weeks ago

SHA: 905b890fcf1791c6ffde046d951b17008e6a00b4
Parents: 8e52af6
Tree: f287be3

5 changed files

Status	File	+	-
M	`src/dlm_sway/backends/hf.py`	9	0
A	`src/dlm_sway/probes/_null_cache.py`	91	0
M	`src/dlm_sway/probes/null_adapter.py`	76	10
A	`tests/unit/test_null_cache.py`	70	0
M	`tests/unit/test_null_calibration.py`	59	0

src/dlm_sway/backends/hf.py (modified)

      _PREFLIGHT_PROMPT = "hello"
      _PREFLIGHT_TOP_K = 8
 +    def cache_identity(self) -> str:
 +        """Return the string used to key on-disk caches for this backend.
++
 +        The value is ``hf:<base>:<adapter path>``, built from
 +        ``self._spec.base`` and ``self._adapter_path``; changing either
 +        one yields a different identity and therefore a different
 +        null-calibration cache entry.
++
 +        NOTE(review): the identity tracks the adapter's *path*, not its
 +        contents, and presumably relies on ``self._adapter_path`` already
 +        being resolved to an absolute path — confirm where it is assigned.
 +        """
 +        base_id = self._spec.base
 +        adapter = self._adapter_path
 +        return f"hf:{base_id}:{adapter}"
++
      def preflight_finite_check(self) -> tuple[bool, str]:
          """One forward pass per view; assert both produce finite logits.

src/dlm_sway/probes/_null_cache.py (added)

 +"""On-disk cache for null-adapter calibration stats.
++
 +Null calibration runs a miniature version of every downstream numeric
 +probe across N seeds before the suite proper. For a 10-probe suite at
 +``runs=3`` that's ~120 forward passes; on an HF backend against a real
 +model this can dominate wall time. Results are deterministic in the
 +calibration inputs — so we cache them at
 +``~/.dlm-sway/null-stats/<key>.json`` (or under
 +``$XDG_CACHE_HOME/dlm-sway/null-stats`` when that variable is set),
 +keyed by the tuple that actually influences the output.
++
 +Scope here is intentionally minimal. Sprint 07 adds a shared
 +forward-pass cache that cuts into a lower level; this module only
 +amortizes the per-suite calibration pass.
 +"""
++
 +from __future__ import annotations
++
 +import hashlib
 +import json
 +import os
 +from pathlib import Path
 +from typing import Any
++
 +#: Environment knob — when this variable is exactly ``"1"``, both
 +#: ``load`` and ``save`` return immediately without touching disk
 +#: (development / CI tests that want to prove calibration actually
 +#: runs). Any other value, including unset, leaves the cache enabled.
 +_ENV_DISABLE = "SWAY_DISABLE_NULL_CACHE"
++
++
 +def _cache_root() -> Path:
 +    """Return the directory that holds cached null-calibration stats.
++
 +    When ``$XDG_CACHE_HOME`` is set and non-empty the result is
 +    ``$XDG_CACHE_HOME/dlm-sway/null-stats``; otherwise it is
 +    ``~/.dlm-sway/null-stats``. The directory is not created here.
 +    """
 +    xdg_home = os.environ.get("XDG_CACHE_HOME")
 +    if not xdg_home:
 +        return Path.home() / ".dlm-sway" / "null-stats"
 +    return Path(xdg_home).expanduser() / "dlm-sway" / "null-stats"
++
++
 +def compute_key(*, backend_identity: str | None, params: dict[str, Any]) -> str | None:
 +    """Derive a stable cache filename stem from identity + calibration params.
++
 +    The identity and ``params`` are serialized as key-sorted JSON (values
 +    JSON cannot encode natively are passed through ``str``), hashed with
 +    SHA-256, and truncated to the first 32 hex characters.
++
 +    Returns ``None`` when ``backend_identity`` is ``None`` or empty —
 +    backends that can't uniquely identify themselves (e.g., the dummy
 +    backend used in tests) skip caching entirely.
 +    """
 +    if not backend_identity:
 +        return None
 +    serialized = json.dumps(
 +        {"backend": backend_identity, "params": params},
 +        sort_keys=True,
 +        default=str,
 +    )
 +    digest = hashlib.sha256(serialized.encode("utf-8"))
 +    return digest.hexdigest()[:32]
++
++
 +def load(key: str | None) -> dict[str, Any] | None:
 +    """Return the cached null-stats dict for ``key``, or ``None`` on miss.
++
 +    A miss is any of: ``key`` is ``None``, the cache is disabled through
 +    ``SWAY_DISABLE_NULL_CACHE=1``, the file is missing or unreadable, its
 +    bytes are not valid UTF-8 JSON, or the decoded document is not a JSON
 +    object. We'd rather recompute than crash the suite. A stale /
 +    schema-mismatched cache can be wiped with
 +    ``rm -rf ~/.dlm-sway/null-stats``.
 +    """
 +    if key is None or os.environ.get(_ENV_DISABLE) == "1":
 +        return None
 +    path = _cache_root() / f"{key}.json"
 +    try:
 +        # EAFP: a missing file surfaces as FileNotFoundError (an OSError),
 +        # so no separate ``exists()`` probe — and no race with a
 +        # concurrent wipe between the probe and the open — is needed.
 +        with path.open("r", encoding="utf-8") as f:
 +            data = json.load(f)
 +    except (OSError, ValueError):
 +        # ValueError covers json.JSONDecodeError *and* UnicodeDecodeError
 +        # (a truncated / binary-corrupted file); both mean "malformed
 +        # cache entry", not a reason to abort the run.
 +        return None
 +    return data if isinstance(data, dict) else None
++
++
 +def save(key: str | None, stats: dict[str, Any]) -> None:
 +    """Persist ``stats`` under ``key`` as ``<cache root>/<key>.json``.
++
 +    The payload is written to a sibling ``<key>.json.tmp`` file first and
 +    then renamed over the final path, so a reader never observes a
 +    half-written entry. Does nothing when ``key`` is ``None`` or the cache
 +    is disabled through ``SWAY_DISABLE_NULL_CACHE=1``.
++
 +    Never raises for a failed write: I/O errors and payloads ``json``
 +    cannot encode are swallowed — the cache is a speed-up, not a
 +    correctness contract — and the temporary file is removed so failed
 +    attempts do not accumulate on disk.
 +    """
 +    if key is None or os.environ.get(_ENV_DISABLE) == "1":
 +        return
 +    root = _cache_root()
 +    path = root / f"{key}.json"
 +    tmp = path.with_suffix(".json.tmp")
 +    try:
 +        root.mkdir(parents=True, exist_ok=True)
 +        with tmp.open("w", encoding="utf-8") as f:
 +            json.dump(stats, f, indent=2, sort_keys=True)
 +        tmp.replace(path)
 +    except (OSError, TypeError, ValueError):
 +        # TypeError / ValueError come from json.dump (unencodable value,
 +        # unsortable keys, circular reference). Whatever failed, drop the
 +        # partial temp file; it may not exist if mkdir itself failed.
 +        try:
 +            tmp.unlink()
 +        except OSError:
 +            pass

src/dlm_sway/probes/null_adapter.py (modified)

  from dlm_sway.core.result import ProbeResult, Verdict, safe_finalize
  from dlm_sway.core.scoring import NullCalibratedBackend
 +from dlm_sway.probes._null_cache import compute_key, load, save
  from dlm_sway.probes._null_proxy import NullCalibrationBackendProxy
  from dlm_sway.probes.base import Probe, ProbeSpec, RunContext, registry
      ``ctx.downstream_kinds`` (the kinds that appear after this probe
      in the suite). Set explicitly to force calibration of specific
      kinds regardless of suite order."""
 +    cache: bool = True
 +    """Read / write the on-disk calibration cache under
 +    ``~/.dlm-sway/null-stats``. Keyed by backend identity + calibration
 +    params. Disable to force a fresh calibration (e.g. when you suspect
 +    the cached stats are stale)."""
  class NullAdapterProbe(Probe):
              filtered.append(k)
          target_kinds = filtered
 +        # Cache lookup: backends can opt in by providing a
 +        # ``cache_identity()`` method returning a stable string. The
 +        # key incorporates both that identity and the calibration
 +        # parameters that actually influence the output.
 +        cache_key: str | None = None
 +        if spec.cache:
 +            backend_identity = _backend_identity(ctx.backend)
 +            cache_key = compute_key(
 +                backend_identity=backend_identity,
 +                params={
 +                    "runs": spec.runs,
 +                    "init_scale": spec.init_scale,
 +                    "seed_base": spec.seed_base,
 +                    "top_k": ctx.top_k,
 +                    "kinds": sorted(target_kinds),
 +                },
 +            )
 +            cached = load(cache_key)
 +            if cached is not None and "null_stats" in cached:
 +                cached_evidence: dict[str, Any] = dict(cached)
 +                cached_evidence.setdefault("skipped_kinds", [])
 +                cached_evidence.setdefault("calibrated_kinds", list(cached["null_stats"].keys()))
 +                cached_evidence["weight"] = spec.weight
 +                cached_evidence["from_cache"] = True
 +                return safe_finalize(
 +                    name=spec.name,
 +                    kind=spec.kind,
 +                    verdict=Verdict.PASS,
 +                    score=1.0,
 +                    evidence=cached_evidence,
 +                    message=(
 +                        f"null calibration: {len(cached['null_stats'])} kinds (loaded from cache)"
 +                    ),
 +                )
++
          per_kind_stats: dict[str, dict[str, float]] = {}
          per_kind_samples: dict[str, list[float]] = {}
          skipped_kinds: list[dict[str, str]] = []
              try:
                  cal_spec = probe_cls.calibrate_spec(ctx)
              except Exception as exc:  # noqa: BLE001 — defensive
 -                skipped_kinds.append(
 -                    {"kind": kind, "reason": f"calibrate_spec raised: {exc}"}
 -                )
 +                skipped_kinds.append({"kind": kind, "reason": f"calibrate_spec raised: {exc}"})
                  continue
              if cal_spec is None:
                  skipped_kinds.append(
                  if raw is not None and math.isfinite(raw):
                      raws.append(float(raw))
                  elif cal_result.verdict == Verdict.ERROR:
 -                    errors.append(
 -                        f"seed={seed}: probe ERROR — {cal_result.message}"
 -                    )
 +                    errors.append(f"seed={seed}: probe ERROR — {cal_result.message}")
              if raws:
                  mean = statistics.fmean(raws)
              "init_scale": spec.init_scale,
              "seed_base": spec.seed_base,
              "weight": spec.weight,
 +            "from_cache": False,
+         }
 -        message = (
 -            f"null calibration: {len(per_kind_stats)} kinds calibrated "
 -            f"over {spec.runs} seeds"
 -        )
 +        if cache_key is not None:
 +            # Persist the stats dict only — the samples list can be
 +            # large, and downstream consumers only need the aggregates.
 +            save(
 +                cache_key,
 +                {
 +                    "null_stats": per_kind_stats,
 +                    "runs": spec.runs,
 +                    "init_scale": spec.init_scale,
 +                    "seed_base": spec.seed_base,
 +                    "calibrated_kinds": list(per_kind_stats.keys()),
 +                },
 +            )
++
 +        message = f"null calibration: {len(per_kind_stats)} kinds calibrated over {spec.runs} seeds"
          if skipped_kinds:
              message += f" ({len(skipped_kinds)} opted out)"
+         )
 +def _backend_identity(backend: Any) -> str | None:
 +    """Return the backend's cache identity string, or ``None`` if unavailable.
++
 +    Duck-typed: a backend opts in by exposing a callable
 +    ``cache_identity`` attribute. ``None`` is returned — and caching is
 +    skipped — when the attribute is missing or not callable, when calling
 +    it raises, or when it returns a falsy value. Any other return value
 +    is converted with ``str``.
 +    """
 +    provider = getattr(backend, "cache_identity", None)
 +    if callable(provider):
 +        try:
 +            identity = provider()
 +        except Exception:  # noqa: BLE001 — cache is best-effort
 +            return None
 +        if identity:
 +            return str(identity)
 +    return None
++
++
  def get_null_stats(ctx: RunContext, probe_kind: str) -> dict[str, float] | None:
      """Look up null-adapter stats for ``probe_kind`` in the run context.

tests/unit/test_null_cache.py (added)

 +"""Tests for the on-disk null-calibration cache."""
++
 +from __future__ import annotations
++
 +import pytest
++
 +from dlm_sway.probes._null_cache import compute_key, load, save
++
++
 +@pytest.fixture
 +def isolated_cache(tmp_path, monkeypatch):
 +    """Point ``XDG_CACHE_HOME`` at this test's tmp dir and return that dir."""
 +    cache_home = tmp_path
 +    monkeypatch.setenv("XDG_CACHE_HOME", str(cache_home))
 +    return cache_home
++
++
 +class TestComputeKey:
 +    """Key derivation: opt-out on missing identity, stability, sensitivity."""
++
 +    def test_none_identity_returns_none(self) -> None:
 +        key = compute_key(backend_identity=None, params={"runs": 3})
 +        assert key is None
++
 +    def test_empty_identity_returns_none(self) -> None:
 +        key = compute_key(backend_identity="", params={"runs": 3})
 +        assert key is None
++
 +    def test_stable_across_calls(self) -> None:
 +        first = compute_key(backend_identity="hf:foo:/tmp/a", params={"runs": 3})
 +        second = compute_key(backend_identity="hf:foo:/tmp/a", params={"runs": 3})
 +        assert first == second
++
 +    def test_changes_when_params_change(self) -> None:
 +        three_runs = compute_key(backend_identity="hf:foo:/tmp/a", params={"runs": 3})
 +        five_runs = compute_key(backend_identity="hf:foo:/tmp/a", params={"runs": 5})
 +        assert three_runs != five_runs
++
 +    def test_changes_when_identity_changes(self) -> None:
 +        adapter_a = compute_key(backend_identity="hf:foo:/tmp/a", params={"runs": 3})
 +        adapter_b = compute_key(backend_identity="hf:foo:/tmp/b", params={"runs": 3})
 +        assert adapter_a != adapter_b
++
++
 +class TestLoadSave:
 +    """Round-trip, miss, and bypass behavior of ``load`` / ``save``."""
++
 +    def test_save_then_load_roundtrip(self, isolated_cache) -> None:
 +        stats = {"null_stats": {"delta_kl": {"mean": 0.01, "std": 0.002, "n": 3}}}
 +        key = compute_key(backend_identity="hf:foo:/tmp/a", params={"runs": 3})
 +        assert key is not None
 +        save(key, stats)
 +        loaded = load(key)
 +        assert loaded == stats
++
 +    def test_load_miss_returns_none(self, isolated_cache) -> None:
 +        key = compute_key(backend_identity="hf:foo:/tmp/a", params={"runs": 3})
 +        assert load(key) is None
++
 +    def test_none_key_roundtrip_noop(self, isolated_cache) -> None:
 +        save(None, {"null_stats": {}})
 +        assert load(None) is None
++
 +    def test_malformed_json_is_treated_as_miss(self, isolated_cache, tmp_path) -> None:
 +        key = compute_key(backend_identity="hf:foo:/tmp/a", params={"runs": 3})
 +        assert key is not None
 +        # Manually write malformed content at the expected path.
 +        cache_root = tmp_path / "dlm-sway" / "null-stats"
 +        cache_root.mkdir(parents=True)
 +        (cache_root / f"{key}.json").write_text("{ not json")
 +        assert load(key) is None
++
 +    def test_env_disable_bypasses_both(self, isolated_cache, monkeypatch) -> None:
 +        stats = {"null_stats": {"delta_kl": {"mean": 0.01, "std": 0.002, "n": 3}}}
 +        populated = compute_key(backend_identity="hf:foo:/tmp/a", params={"runs": 3})
 +        unwritten = compute_key(backend_identity="hf:foo:/tmp/b", params={"runs": 3})
 +        # Populate one entry while the cache is still enabled.
 +        save(populated, stats)
++
 +        monkeypatch.setenv("SWAY_DISABLE_NULL_CACHE", "1")
 +        # Load bypass: an entry that exists on disk is ignored.
 +        assert load(populated) is None
 +        save(unwritten, stats)
++
 +        # Save bypass: checking with the knob cleared proves nothing was
 +        # written while it was set (a load under the knob would return
 +        # None regardless, so it cannot show this).
 +        monkeypatch.delenv("SWAY_DISABLE_NULL_CACHE")
 +        assert load(unwritten) is None
 +        assert load(populated) == stats

tests/unit/test_null_calibration.py (modified)

              f"evidence={dk_result.evidence}, message={dk_result.message}"
+         )
 +    def test_cache_hit_short_circuits_calibration(self, tmp_path, monkeypatch) -> None:
 +        """Running the probe twice with one key serves the second run from cache."""
 +        monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path))
++
 +        class _IdBackend(DummyDifferentialBackend):
 +            def cache_identity(self) -> str:
 +                return "test:id-backend"
++
 +        backend = _IdBackend(base=DummyResponses(), ft=DummyResponses())
++
 +        probe_config = {
 +            "name": "null",
 +            "kind": "null_adapter",
 +            "runs": 2,
 +            "calibrate_kinds": ["delta_kl"],
 +        }
 +        probe, spec = build_probe(probe_config)
 +        ctx = RunContext(backend=backend)
++
 +        # Cold cache: the result must be reported as freshly computed.
 +        first = probe.run(spec, ctx)
 +        assert first.evidence["from_cache"] is False
++
 +        # Same params + same identity again: reported as a cache hit.
 +        second = probe.run(spec, ctx)
 +        assert second.evidence["from_cache"] is True
 +        assert "delta_kl" in second.evidence["null_stats"]
++
 +    def test_cache_disabled_forces_recompute(self, tmp_path, monkeypatch) -> None:
 +        """A spec with ``cache=False`` is not served from cache after a prior run."""
 +        monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path))
++
 +        class _IdBackend(DummyDifferentialBackend):
 +            def cache_identity(self) -> str:
 +                return "test:id-backend-2"
++
 +        def _config(**overrides):
 +            # Fresh dict per call so the two specs never share state.
 +            return {
 +                "name": "null",
 +                "kind": "null_adapter",
 +                "runs": 2,
 +                "calibrate_kinds": ["delta_kl"],
 +                **overrides,
 +            }
++
 +        backend = _IdBackend(base=DummyResponses(), ft=DummyResponses())
++
 +        # First run uses the default (cache-enabled) spec.
 +        probe, populating_spec = build_probe(_config())
 +        probe.run(populating_spec, RunContext(backend=backend))
++
 +        # Same calibration params, but opting out of the cache.
 +        _, fresh_spec = build_probe(_config(cache=False))
 +        result = probe.run(fresh_spec, RunContext(backend=backend))
 +        assert result.evidence["from_cache"] is False
++
      def test_skip_when_backend_not_null_calibrated(self) -> None:
          class _Bare:
              def as_base(self):  # noqa: ANN202