`1e28540`

probes/null_adapter: per-kind calibration matrix (fixes P02, B2, C9)

Authored by

espadonne 3 weeks ago

SHA: 1e28540187e96e9aae50146a8d6faf2f8306d475
Parents: fc49f7e
Tree: 640aab0

3 changed files

Status	File	+	-
M	`src/dlm_sway/probes/null_adapter.py`	160	82
M	`tests/unit/test_null_calibration.py`	61	2
M	`tests/unit/test_suite_runner.py`	9	1

src/dlm_sway/probes/null_adapter.py (modified)

 -"""Null-adapter baseline probe.
+-
 -Every numeric primitive reports its raw metric *and* a z-score against a
 -null-adapter distribution. This probe is the runtime engine that
 -establishes that distribution — it builds random-init "null" adapters
 -(structurally identical to the real adapter but with weights drawn from
 -a Gaussian) and measures how much signal they produce.
+-
 -The resulting ``(mean, std, n)`` per kind is attached to this probe's
 -``evidence["null_stats"]``. The runner picks it up and threads it into
 -:attr:`RunContext.null_stats`, where every downstream probe can read it
 -and turn a raw metric into a z-score.
+-
 -Backends that don't implement :class:`~dlm_sway.core.scoring.NullCalibratedBackend`
 -cause this probe to :attr:`Verdict.SKIP` — downstream probes fall back
 -to their fixed thresholds in that case.
 +"""Null-adapter baseline probe — per-kind calibration matrix (S02).
++
 +Every numeric primitive reports its raw metric *and* a z-score against
 +a null-adapter distribution. This probe is the runtime engine that
 +establishes that distribution — for **every** numeric probe kind the
 +user has downstream in the suite, not just one.
++
 +How it works:
++
 +1. The runner populates ``ctx.downstream_kinds`` with every probe kind
 +   that appears after this one in the suite.
 +2. For each target kind, we ask its probe class for a
 +   :meth:`~dlm_sway.probes.base.Probe.calibrate_spec` — a small spec
 +   suitable for null calibration. A probe that returns ``None`` opts
 +   out (typically because its inputs can't be synthesized, e.g.
 +   ``adapter_revert`` without an embedder, or ``adapter_ablation``
 +   which needs ``as_scaled_adapter`` that the proxy doesn't expose).
 +3. For each calibrating kind × seed, we run the probe through a
 +   :class:`~dlm_sway.probes._null_proxy.NullCalibrationBackendProxy`
 +   which makes ``as_finetuned()`` yield ``as_null_adapter(seed)`` —
 +   so the probe's own math is computing "what does my metric look
 +   like when the fine-tune is structural noise?".
 +4. We harvest each run's ``raw`` value, aggregate to ``(mean, std, n)``
 +   per kind, and publish under ``evidence["null_stats"]``.
 +5. The runner threads ``null_stats`` into ``RunContext`` for every
 +   subsequent probe, which then prefers the z-score path over the
 +   fixed-threshold path (see :mod:`dlm_sway.probes._zscore`).
++
 +Backends that don't implement
 +:class:`~dlm_sway.core.scoring.NullCalibratedBackend` cause this probe
 +to ``Verdict.SKIP``; every downstream probe falls back to fixed
 +thresholds and surfaces ``(no calibration)`` in the report.
  """
  from __future__ import annotations
 +import math
  import statistics
 -from typing import Literal
 +from typing import Any, Literal
  from pydantic import Field
 -from dlm_sway.core.result import ProbeResult, Verdict
 +from dlm_sway.core.result import ProbeResult, Verdict, safe_finalize
  from dlm_sway.core.scoring import NullCalibratedBackend
 -from dlm_sway.probes._divergence import divergence
 -from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
 +from dlm_sway.probes._null_proxy import NullCalibrationBackendProxy
 +from dlm_sway.probes.base import Probe, ProbeSpec, RunContext, registry
  class NullAdapterSpec(ProbeSpec):
      """Spec for ``kind: null_adapter``.
 -    Authors place this probe **first** in the suite so its output
 -    populates :attr:`RunContext.null_stats` before subsequent probes
 -    consult it.
 +    Place this probe **first** in the suite so its output populates
 +    :attr:`RunContext.null_stats` before subsequent probes consult it.
      """
      kind: Literal["null_adapter"] = "null_adapter"
      """Number of independent null adapters to evaluate. Three is the
      smallest that yields a usable std; more is better but quickly
      dominates suite runtime."""
 -    prompts: list[str] = Field(default_factory=list)
 -    """Prompt set for null calibration. Keep small — calibration runs
 -    ``runs × len(prompts)`` forward passes. 4–8 prompts is typical.
 -    If empty, a minimal built-in prompt set is used so the probe
 -    always produces stats."""
      init_scale: float = 0.02
      """Stddev of the zero-mean Gaussian used to fill lora_A/lora_B."""
      seed_base: int = 1000
      """First seed; successive runs use ``seed_base + run_idx``."""
+-
+-
 -_DEFAULT_PROMPTS: tuple[str, ...] = (
 -    "The quick brown fox",
 -    "Once upon a time",
 -    "In this document we explain",
 -    "The key takeaway is",
 -    "An important point to remember",
 -)
 +    calibrate_kinds: list[str] = Field(default_factory=list)
 +    """Which probe kinds to calibrate. Empty = auto-populate from
 +    ``ctx.downstream_kinds`` (the kinds that appear after this probe
 +    in the suite). Set explicitly to force calibration of specific
 +    kinds regardless of suite order."""
  class NullAdapterProbe(Probe):
 -    """Populate ``ctx.null_stats``; report a :attr:`Verdict.PASS` verdict itself.
 +    """Populate ``ctx.null_stats`` with per-kind null distributions.
 -    The probe never fails on its own terms — its *job* is calibration.
 -    Downstream probes pick up :attr:`RunContext.null_stats` keyed by
 -    probe kind (``delta_kl``, ``adapter_ablation`` …) and use the
 -    populated mean/std to z-score their own raw metrics.
 +    The probe itself reports ``Verdict.PASS`` on success — its job is
 +    calibration, not judgment. If the backend can't support null-view
 +    substitution, reports ``Verdict.SKIP`` with a clear message; every
 +    downstream numeric probe then falls back to fixed thresholds.
      """
      kind = "null_adapter"
                      "numeric probes will fall back to fixed thresholds"
                  ),
+             )
 -        prompts = list(spec.prompts) or list(_DEFAULT_PROMPTS)
+-
 -        per_seed_means: list[float] = []
 -        for run_idx in range(spec.runs):
 -            seed = spec.seed_base + run_idx
 -            per_prompt: list[float] = []
 -            for prompt in prompts:
 -                with ctx.backend.as_base() as base_view:
 -                    base_dist = base_view.next_token_dist(prompt, top_k=ctx.top_k)
 -                with ctx.backend.as_null_adapter(seed, init_scale=spec.init_scale) as null_view:
 -                    null_dist = null_view.next_token_dist(prompt, top_k=ctx.top_k)
 -                per_prompt.append(divergence(base_dist, null_dist, kind="js"))
 -            per_seed_means.append(statistics.fmean(per_prompt) if per_prompt else 0.0)
+-
 -        mean = statistics.fmean(per_seed_means)
 -        std = statistics.pstdev(per_seed_means) if len(per_seed_means) > 1 else 0.0
+-
 -        # Publish per-kind stats. delta_kl is the primary kind; other
 -        # divergence-based probes (adapter_ablation) share this scale.
 -        null_stats = {
 -            "delta_kl": {"mean": mean, "std": max(std, 1e-6), "n": float(spec.runs)},
 -            "adapter_ablation": {"mean": mean, "std": max(std, 1e-6), "n": float(spec.runs)},
++
 +        registered = registry()
++
 +        # Decide which kinds to calibrate. Explicit spec field wins;
 +        # otherwise auto-populate from downstream_kinds.
 +        target_kinds: list[str] = list(spec.calibrate_kinds)
 +        if not target_kinds:
 +            target_kinds = [k for k in ctx.downstream_kinds if k and k != spec.kind]
 +        # De-dupe while preserving order; drop self and unregistered.
 +        seen: set[str] = set()
 +        filtered: list[str] = []
 +        for k in target_kinds:
 +            if k == spec.kind or k in seen or k not in registered:
 +                continue
 +            seen.add(k)
 +            filtered.append(k)
 +        target_kinds = filtered
++
 +        per_kind_stats: dict[str, dict[str, float]] = {}
 +        per_kind_samples: dict[str, list[float]] = {}
 +        skipped_kinds: list[dict[str, str]] = []
++
 +        for kind in target_kinds:
 +            probe_cls = registered[kind]
 +            try:
 +                cal_spec = probe_cls.calibrate_spec(ctx)
 +            except Exception as exc:  # noqa: BLE001 — defensive
 +                skipped_kinds.append(
 +                    {"kind": kind, "reason": f"calibrate_spec raised: {exc}"}
 +                )
 +                continue
 +            if cal_spec is None:
 +                skipped_kinds.append(
 +                    {
 +                        "kind": kind,
 +                        "reason": "probe opted out (calibrate_spec returned None)",
 +                    }
 +                )
 +                continue
++
 +            probe = probe_cls()
 +            raws: list[float] = []
 +            errors: list[str] = []
 +            for run_idx in range(spec.runs):
 +                seed = spec.seed_base + run_idx
 +                proxy = NullCalibrationBackendProxy(
 +                    ctx.backend, seed=seed, init_scale=spec.init_scale
 +                )
 +                cal_ctx = RunContext(
 +                    backend=proxy,
 +                    seed=seed,
 +                    top_k=ctx.top_k,
 +                    sections=ctx.sections,
 +                    doc_text=ctx.doc_text,
 +                    null_stats={},  # calibration uses fixed thresholds — no recursion
 +                    downstream_kinds=(),
 +                )
 +                try:
 +                    cal_result = probe.run(cal_spec, cal_ctx)
 +                except Exception as exc:  # noqa: BLE001
 +                    errors.append(f"seed={seed}: {type(exc).__name__}: {exc}")
 +                    continue
 +                raw = cal_result.raw
 +                if raw is not None and math.isfinite(raw):
 +                    raws.append(float(raw))
 +                elif cal_result.verdict == Verdict.ERROR:
 +                    errors.append(
 +                        f"seed={seed}: probe ERROR — {cal_result.message}"
 +                    )
++
 +            if raws:
 +                mean = statistics.fmean(raws)
 +                std = statistics.pstdev(raws) if len(raws) > 1 else 0.0
 +                per_kind_stats[kind] = {
 +                    "mean": mean,
 +                    # C9: clamp the std floor so the downstream z-score
 +                    # path doesn't blow up when every seed produces
 +                    # identical raws.
 +                    "std": max(std, 1e-6),
 +                    "n": float(len(raws)),
 +                }
 +                per_kind_samples[kind] = raws
 +            else:
 +                reason = "no finite raws across all seeds"
 +                if errors:
 +                    reason += f" ({errors[0]})"
 +                skipped_kinds.append({"kind": kind, "reason": reason})
++
 +        evidence: dict[str, Any] = {
 +            "null_stats": per_kind_stats,
 +            "per_kind_raw_samples": per_kind_samples,
 +            "skipped_kinds": skipped_kinds,
 +            "calibrated_kinds": list(per_kind_stats.keys()),
 +            "runs": spec.runs,
 +            "init_scale": spec.init_scale,
 +            "seed_base": spec.seed_base,
 +            "weight": spec.weight,
+         }
 -        return ProbeResult(
 +        message = (
 +            f"null calibration: {len(per_kind_stats)} kinds calibrated "
 +            f"over {spec.runs} seeds"
 +        )
 +        if skipped_kinds:
 +            message += f" ({len(skipped_kinds)} opted out)"
++
 +        return safe_finalize(
              name=spec.name,
              kind=spec.kind,
              verdict=Verdict.PASS,
              score=1.0,
 -            raw=mean,
 -            evidence={
 -                "null_stats": null_stats,
 -                "per_seed_mean_js": per_seed_means,
 -                "init_scale": spec.init_scale,
 -                "runs": spec.runs,
 -                "num_prompts": len(prompts),
 -                "weight": spec.weight,
 -            },
 -            message=(
 -                f"null JS divergence μ={mean:.4f} ± {std:.4f} "
 -                f"(over {spec.runs} seeds × {len(prompts)} prompts) — "
 -                f"downstream probes will z-score against this baseline"
 -            ),
 +            evidence=evidence,
 +            message=message,
+         )
  def get_null_stats(ctx: RunContext, probe_kind: str) -> dict[str, float] | None:
 -    """Look up null-adapter stats for ``probe_kind``.
 +    """Look up null-adapter stats for ``probe_kind`` in the run context.
      Returns ``{"mean": …, "std": …, "n": …}`` when calibration ran for
 -    this kind, else ``None``. Probes treat ``None`` as "fall back to the
 -    fixed threshold from your spec."
 +    this kind, else ``None``. Probes treat ``None`` as "fall back to
 +    the fixed threshold from your spec" and surface ``(no calibration)``
 +    in the report.
      """
      return ctx.null_stats.get(probe_kind)

tests/unit/test_null_calibration.py (modified)

  class TestProbe:
      def test_populates_null_stats(self) -> None:
 +        """Explicit `calibrate_kinds` calibrates regardless of suite order."""
          backend = _diverging_backend()
          probe, spec = build_probe(
+             {
                  "name": "null",
                  "kind": "null_adapter",
                  "runs": 3,
 -                "prompts": ["q1", "q2"],
 +                "calibrate_kinds": ["delta_kl"],
+             }
+         )
          ctx = RunContext(backend=backend)
          assert stats["delta_kl"]["n"] == 3.0
          assert stats["delta_kl"]["std"] > 0.0  # seeded perturbations produce variance
 +    def test_auto_populates_from_downstream_kinds(self) -> None:
 +        """When `calibrate_kinds` is empty, falls back to `ctx.downstream_kinds`."""
 +        backend = _diverging_backend()
 +        probe, spec = build_probe({"name": "null", "kind": "null_adapter", "runs": 2})
 +        ctx = RunContext(
 +            backend=backend,
 +            downstream_kinds=("delta_kl", "prompt_collapse"),
 +        )
 +        result = probe.run(spec, ctx)
 +        assert result.verdict == Verdict.PASS
 +        stats = result.evidence["null_stats"]
 +        # Every downstream numeric kind that opts in gets stats.
 +        assert "delta_kl" in stats
 +        assert "prompt_collapse" in stats
++
 +    def test_empty_calibrate_kinds_with_no_downstream_is_noop(self) -> None:
 +        """No kinds, no calibration — probe still PASSes with empty stats."""
 +        backend = _diverging_backend()
 +        probe, spec = build_probe({"name": "null", "kind": "null_adapter", "runs": 2})
 +        ctx = RunContext(backend=backend)  # no downstream_kinds
 +        result = probe.run(spec, ctx)
 +        assert result.verdict == Verdict.PASS
 +        assert result.evidence["null_stats"] == {}
 +        assert result.evidence["calibrated_kinds"] == []
++
 +    def test_unregistered_kind_is_silently_skipped(self) -> None:
 +        backend = _diverging_backend()
 +        probe, spec = build_probe(
 +            {
 +                "name": "null",
 +                "kind": "null_adapter",
 +                "runs": 2,
 +                "calibrate_kinds": ["delta_kl", "nonexistent_kind"],
 +            }
 +        )
 +        ctx = RunContext(backend=backend)
 +        result = probe.run(spec, ctx)
 +        assert "delta_kl" in result.evidence["null_stats"]
 +        assert "nonexistent_kind" not in result.evidence["null_stats"]
++
 +    def test_opt_out_probe_is_reported_as_skipped(self) -> None:
 +        """A kind whose calibrate_spec returns None surfaces in skipped_kinds."""
 +        backend = _diverging_backend()
 +        probe, spec = build_probe(
 +            {
 +                "name": "null",
 +                "kind": "null_adapter",
 +                "runs": 2,
 +                # adapter_revert.calibrate_spec returns None by default
 +                # (inherits from base), so we expect it to opt out.
 +                "calibrate_kinds": ["adapter_revert", "delta_kl"],
 +            }
 +        )
 +        ctx = RunContext(backend=backend)
 +        result = probe.run(spec, ctx)
 +        assert "delta_kl" in result.evidence["null_stats"]
 +        skipped = [s["kind"] for s in result.evidence["skipped_kinds"]]
 +        assert "adapter_revert" in skipped
++
      def test_runner_threads_null_stats_to_subsequent_probes(self) -> None:
          """End-to-end: null_adapter first → delta_kl picks up z-score path."""
          backend = _diverging_backend()
                          "name": "null",
                          "kind": "null_adapter",
                          "runs": 3,
 -                        "prompts": ["p1", "p2"],
                      },
+                     {
                          "name": "dk",

tests/unit/test_suite_runner.py (modified)

          self, backend: DummyDifferentialBackend
      ) -> None:
          # Dummy backend implements NullCalibratedBackend, so calibration runs.
 -        spec = _spec({"name": "null", "kind": "null_adapter", "runs": 2, "prompts": ["q1"]})
 +        # Explicit calibrate_kinds so it runs even without downstream probes.
 +        spec = _spec(
 +            {
 +                "name": "null",
 +                "kind": "null_adapter",
 +                "runs": 2,
 +                "calibrate_kinds": ["delta_kl"],
 +            }
 +        )
          result = run(spec, backend)
          assert result.probes[0].kind == "null_adapter"
          assert result.probes[0].verdict == Verdict.PASS