tenseleyflow/sway / cb92c9f

Browse files

sway(probes): C2 calibration_drift + 30-item built-in general-knowledge pack

Authored by espadonne
SHA
cb92c9f865859e1a2dbb6a0686ad8db75aa52e41
Parents
f0b5d50
Tree
be99664

3 changed files

Status | File | Added | Removed
A src/dlm_sway/probes/_calibration_pack.py 63 0
A src/dlm_sway/probes/calibration_drift.py 135 0
A tests/unit/test_probe_calibration_drift.py 57 0
src/dlm_sway/probes/_calibration_pack.pyadded
@@ -0,0 +1,63 @@
1
+"""A small, built-in general-knowledge probe pack for C2.
2
+
3
+Each item is a ``(prompt, gold)`` pair where ``gold`` is the next few
4
+tokens a competent base model should assign high probability to. The
5
+items are deliberately *factually trivial* — the point isn't "does the
6
+model know this?" but "did the fine-tune forget this?" — so the pack
7
+skews toward grade-school geography, chemistry, arithmetic, and
8
+high-frequency idiom.
9
+
10
+A real v1.0 will ship a 200-item pack sliced from TriviaQA + SQuAD +
11
+OpenBookQA. This 30-item seed lets the probe ship today and catches the
12
+most egregious over-fit cases.
13
+"""
14
+
15
+from __future__ import annotations
16
+
17
+from typing import Final
18
+
19
# One probe item: (prompt, gold continuation) pair.
CalibrationItem = tuple[str, str]

# NOTE: every gold string carries a leading space because it continues the
# prompt as next-token text; dropping the space would change tokenization.
BUILT_IN_PACK: Final[tuple[CalibrationItem, ...]] = (
    # Geography
    ("The capital of France is", " Paris"),
    ("The capital of Japan is", " Tokyo"),
    ("The largest ocean on Earth is the", " Pacific"),
    ("Mount Everest is located on the border of Nepal and", " China"),
    ("The longest river in South America is the", " Amazon"),
    # Natural sciences
    ("Water freezes at zero degrees", " Celsius"),
    ("The chemical symbol for gold is", " Au"),
    ("Light travels faster than", " sound"),
    ("Plants convert sunlight into energy through", " photosynthesis"),
    ("The Earth orbits around the", " Sun"),
    # Arithmetic
    ("Two plus two equals", " four"),
    ("Ten times ten equals", " one hundred"),
    ("Half of one hundred is", " fifty"),
    ("A dozen means", " twelve"),
    # Language and idiom
    ("A rose by any other name would smell as", " sweet"),
    ("To be or not to be, that is the", " question"),
    ("The early bird catches the", " worm"),
    ("Actions speak louder than", " words"),
    ("A picture is worth a thousand", " words"),
    # History
    ("World War II ended in the year", " 1945"),
    ("The first president of the United States was", " George Washington"),
    ("The Berlin Wall fell in", " 1989"),
    # Biology
    ("Humans have twenty", " fingers and toes"),
    ("The human body has two", " lungs"),
    ("Blood is pumped through the body by the", " heart"),
    # Technology
    ("HTML stands for HyperText", " Markup Language"),
    ("The World Wide Web was invented by Tim", " Berners-Lee"),
    # Miscellaneous
    ("One year has", " 365 days"),
    ("A week has seven", " days"),
    ("There are seven colors in a", " rainbow"),
)
"""30 items covering geography, science, arithmetic, language, history,
biology, and technology. Pulled from public-domain grade-school facts so
there's no licensing concern about shipping with the wheel."""
src/dlm_sway/probes/calibration_drift.pyadded
@@ -0,0 +1,135 @@
1
+"""C2 CalibrationDrift — did we break general knowledge while fitting the doc?
2
+
3
+The classic small-doc fine-tune failure mode: the adapter learned the
4
+document so well that it forgot the world. C2 catches this by scoring
5
+base and ft on a packaged set of general-knowledge completions (the
6
+``BUILT_IN_PACK`` — a 30-item seed of public-domain grade-school facts)
7
+and flagging items whose per-token logprob regressed significantly.
8
+
9
+A healthy fine-tune: some items drift slightly (mild confidence shift,
10
+normal), but essentially none regress below a nat of slack. An over-fit
11
+fine-tune: 20%+ of items regress, the adapter has torched its ability
12
+to answer anything outside the document.
13
+
14
+Pass when ``fraction_regressed < assert_fraction_regressed_lt`` AND
15
+``mean_delta_nats >= assert_mean_delta_gte``. Both thresholds default
16
+to values that trigger on genuine damage but tolerate normal drift.
17
+"""
18
+
19
+from __future__ import annotations
20
+
21
+import statistics
22
+from typing import Literal
23
+
24
+from pydantic import Field
25
+
26
+from dlm_sway.core.result import ProbeResult, Verdict
27
+from dlm_sway.probes._calibration_pack import BUILT_IN_PACK
28
+from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
29
+
30
+
31
class CalibrationItemSpec(ProbeSpec):
    """Not used directly — documents the shape of an item override.

    Mirrors one ``(prompt, gold)`` entry of the calibration pack so that
    tooling can introspect the expected item schema.
    """

    kind: Literal["__calibration_item"] = "__calibration_item"
    # Prompt text the model is asked to continue.
    prompt: str = ""
    # Expected continuation the model should assign high probability to.
    gold: str = ""
37
+
38
+
39
class CalibrationDriftSpec(ProbeSpec):
    """Configuration for the C2 calibration-drift probe.

    The probe passes only when the fraction of regressed items stays
    strictly below ``assert_fraction_regressed_lt`` AND the mean
    per-token delta stays at or above ``assert_mean_delta_gte``.
    """

    kind: Literal["calibration_drift"] = "calibration_drift"
    pack: Literal["builtin"] = "builtin"
    """Source of items. ``"builtin"`` uses :data:`BUILT_IN_PACK`. Custom
    packs will ship via a file reference in a later milestone."""
    items_limit: int | None = None
    """If set, truncate the pack to this many items (for fast runs)."""
    # Fail threshold on the regressed fraction; the comparison is strict (<).
    assert_fraction_regressed_lt: float = 0.15
    assert_mean_delta_gte: float = -0.5
    """Mean per-token logprob delta (ft − base) across the pack. Slightly
    negative is tolerable; deeply negative is not."""
    regression_nats: float = 1.0
    """How many nats worse an item must get to count as regressed."""
    items: list[tuple[str, str]] = Field(default_factory=list)
    """Optional inline override of the packaged items."""
54
+
55
+
56
class CalibrationDriftProbe(Probe):
    """C2 — flag general-knowledge completions the fine-tune regressed.

    Scores every pack item under the base and fine-tuned models, computes
    the per-token logprob delta (ft − base), and fails when too many items
    regressed or the mean delta is too negative (thresholds on the spec).
    """

    kind = "calibration_drift"
    spec_cls = CalibrationDriftSpec
    category = "calibration"

    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
        """Run the calibration-drift check over the configured item pack.

        Args:
            spec: Must be a :class:`CalibrationDriftSpec`.
            ctx: Run context providing the differential backend.

        Returns:
            A :class:`ProbeResult` whose ``raw`` is the fraction of
            regressed items and whose ``ft_value`` is the mean per-token
            delta in nats.

        Raises:
            TypeError: If *spec* is not a ``CalibrationDriftSpec``.
        """
        # Explicit check rather than `assert`: asserts are stripped under
        # `python -O`, which would let a mismatched spec slip through.
        if not isinstance(spec, CalibrationDriftSpec):
            raise TypeError(
                f"expected CalibrationDriftSpec, got {type(spec).__name__}"
            )
        items = list(spec.items) if spec.items else list(BUILT_IN_PACK)
        if spec.items_limit is not None:
            items = items[: spec.items_limit]
        if not items:
            # An empty pack is a configuration error, not a pass or a fail.
            return ProbeResult(
                name=spec.name,
                kind=spec.kind,
                verdict=Verdict.ERROR,
                score=None,
                message="no calibration items",
            )

        deltas: list[float] = []
        regressed = 0
        worst: list[dict[str, float | str]] = []

        for prompt, gold in items:
            # Normalize per estimated token so long gold strings don't
            # dominate. _token_estimate() already floors at 1, so the
            # division is always safe — no extra max() guard needed.
            tokens = _token_estimate(gold)
            with ctx.backend.as_base() as b:
                lp_base = b.logprob_of(prompt, gold) / tokens
            with ctx.backend.as_finetuned() as f:
                lp_ft = f.logprob_of(prompt, gold) / tokens
            delta = lp_ft - lp_base  # negative => fine-tune got worse here
            deltas.append(delta)
            if delta < -spec.regression_nats:
                regressed += 1
                worst.append({"prompt": prompt, "gold": gold, "delta": delta})

        # Surface the worst offenders — up to 5, most-regressed first.
        worst.sort(key=lambda d: float(d["delta"]))
        worst = worst[:5]

        frac_regressed = regressed / len(items)
        mean_delta = statistics.fmean(deltas)

        passed = (
            frac_regressed < spec.assert_fraction_regressed_lt
            and mean_delta >= spec.assert_mean_delta_gte
        )
        verdict = Verdict.PASS if passed else Verdict.FAIL
        # Score: 1.0 at zero regression + zero drift, declining with either.
        regress_component = max(
            0.0, 1.0 - frac_regressed / max(spec.assert_fraction_regressed_lt, 1e-6)
        )
        drift_component = max(0.0, min(1.0, (mean_delta + 1.0) / 1.5))
        score = 0.6 * regress_component + 0.4 * drift_component

        return ProbeResult(
            name=spec.name,
            kind=spec.kind,
            verdict=verdict,
            score=score,
            raw=frac_regressed,
            base_value=None,
            ft_value=mean_delta,
            evidence={
                "fraction_regressed": frac_regressed,
                "mean_delta_nats": mean_delta,
                "regressed_count": regressed,
                "total_items": len(items),
                "worst_offenders": worst,
                "regression_nats_threshold": spec.regression_nats,
                "weight": spec.weight,
            },
            message=(
                f"{regressed}/{len(items)} items regressed >{spec.regression_nats:.1f} nats "
                f"(frac={frac_regressed:.1%}), mean_delta={mean_delta:+.3f} nats/tok"
            ),
        )
132
+
133
+
134
+def _token_estimate(s: str) -> int:
135
+    return max(1, len(s) // 4)
tests/unit/test_probe_calibration_drift.pyadded
@@ -0,0 +1,57 @@
1
+"""Tests for :mod:`dlm_sway.probes.calibration_drift`."""
2
+
3
+from __future__ import annotations
4
+
5
+from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
6
+from dlm_sway.core.result import Verdict
7
+from dlm_sway.probes._calibration_pack import BUILT_IN_PACK
8
+from dlm_sway.probes.base import RunContext, build_probe
9
+
10
+
11
def _backend(delta_per_token: float) -> DummyDifferentialBackend:
    """Apply a uniform per-token logprob delta across every item."""
    # Estimated token count per item (same ~4 chars/token heuristic as C2).
    n_tokens = {(p, g): max(len(g) // 4, 1) for p, g in BUILT_IN_PACK}
    base_lp = {key: -5.0 * n for key, n in n_tokens.items()}
    ft_lp = {key: base_lp[key] + delta_per_token * n for key, n in n_tokens.items()}
    return DummyDifferentialBackend(
        base=DummyResponses(logprobs=base_lp),
        ft=DummyResponses(logprobs=ft_lp),
    )
22
+
23
+
24
class TestCalibrationDrift:
    def _run(self, backend, **spec_overrides):
        """Build a C2 probe with *spec_overrides* and run it on *backend*."""
        probe, spec = build_probe(
            {"name": "c2", "kind": "calibration_drift", **spec_overrides}
        )
        return probe.run(spec, RunContext(backend=backend))

    def test_healthy_when_no_regression(self) -> None:
        # Zero drift on every item: nothing regresses, probe passes.
        result = self._run(_backend(delta_per_token=0.0))
        assert result.verdict == Verdict.PASS
        assert result.raw == 0.0  # zero fraction regressed

    def test_fail_on_uniform_large_regression(self) -> None:
        # Every item loses 2 nats/token: all regress, probe fails.
        result = self._run(_backend(delta_per_token=-2.0))
        assert result.verdict == Verdict.FAIL
        assert result.raw == 1.0

    def test_respects_items_limit(self) -> None:
        result = self._run(_backend(delta_per_token=0.0), items_limit=5)
        assert result.evidence["total_items"] == 5

    def test_worst_offenders_reported(self) -> None:
        result = self._run(_backend(delta_per_token=-2.0))
        worst = result.evidence["worst_offenders"]
        assert len(worst) <= 5
        # Each worst-offender record carries prompt/gold/delta fields.
        if worst:
            assert {"prompt", "gold", "delta"} <= set(worst[0].keys())