Python · 5177 bytes Raw Blame History
1 """S09 prove-the-value: ``external_perplexity`` catches diffuse forgetting
2 that ``calibration_drift`` misses.
3
4 Motivation (from the sprint file / Audit §F3): ``calibration_drift``
5 flags items that regress past a per-item threshold (default 1.0 nats).
6 A fine-tune that nudges *every* item by a small amount (say 0.3 nats)
7 slides under that threshold on every item — mean_delta passes
8 ``assert_mean_delta_gte=-0.5`` comfortably too — so ``calibration_drift``
9 reports PASS. That same 0.3-nat-per-token drop on held-out English prose
is exactly what ``external_perplexity`` measures, and -0.3 < -0.1 (the
``assert_mean_delta_gte=-0.1`` default) → FAIL.
12
13 This test constructs a dummy backend that exhibits exactly that
14 signature across both probes, runs both in one suite, and asserts the
15 verdict split. That split is the F3 differentiator; without it, the
16 probe would be a second ``calibration_drift`` with slightly different
17 inputs.
18 """
19
20 from __future__ import annotations
21
22 import numpy as np
23
24 from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
25 from dlm_sway.core.result import Verdict
26 from dlm_sway.core.scoring import RollingLogprob
27 from dlm_sway.probes._calibration_pack import BUILT_IN_PACK
28 from dlm_sway.probes._external_corpus import chunk_corpus, load_corpus
29 from dlm_sway.suite.runner import run as run_suite
30 from dlm_sway.suite.spec import SwaySpec
31
# Every pack item and every corpus chunk loses 0.3 nats per token on ft.
# This per-token shift sits:
# - Below calibration_drift's `regression_nats` threshold (1.0 nats),
#   so no pack item counts as regressed → frac_regressed=0 → PASS.
# - Above calibration_drift's `assert_mean_delta_gte` (-0.5), so the
#   mean-delta gate also passes.
# - Below external_perplexity's `assert_mean_delta_gte` (-0.1), so
#   external_perplexity fails.
_DIFFUSE_DELTA = -0.3
41
42
43 def _token_estimate(s: str) -> int:
44 # Mirrors ``calibration_drift._token_estimate``: tokens ≈ len // 4.
45 return max(1, len(s) // 4)
46
47
def _rolling(text: str, per_tok: float) -> RollingLogprob:
    """Build a constant-rate ``RollingLogprob`` over *text*.

    Whitespace-tokenizes *text* and assigns ``per_tok`` nats to each scored
    position; there is one fewer scored position than tokens (the rolling
    score appears to start at the second token — mirrors the original).
    """
    n = len(text.split()) or 1
    scored = max(n - 1, 0)
    return RollingLogprob(
        token_ids=np.arange(n, dtype=np.int64),
        logprobs=np.full(scored, per_tok, dtype=np.float32),
        num_tokens=n,
        total_logprob=float(per_tok * scored),
    )
58
59
def _diffuse_forgetting_backend() -> DummyDifferentialBackend:
    """Backend whose ft model scores everything 0.3 nats/token worse.

    The uniform shift covers both probe input surfaces:
    - every (prompt, gold) item in BUILT_IN_PACK (for calibration_drift), and
    - every chunk of the public-domain corpus (for external_perplexity).
    """
    # calibration_drift divides logprob_of(prompt, gold) by the token
    # estimate, so the per-item delta is scaled up by the token count to
    # land at exactly -0.3 nats per token.
    base_lp: dict[tuple[str, str], float] = {}
    ft_lp: dict[tuple[str, str], float] = {}
    for prompt, gold in BUILT_IN_PACK:
        key = (prompt, gold)
        n_tok = _token_estimate(gold)
        base_lp[key] = -5.0 * n_tok
        ft_lp[key] = base_lp[key] + _DIFFUSE_DELTA * n_tok

    # external_perplexity consumes rolling_logprob(chunk) directly.
    chunks = chunk_corpus(
        load_corpus("public_domain_en"), chunk_chars=2048, max_chunks=16
    )
    base_rolling = {chunk: _rolling(chunk, -2.0) for chunk in chunks}
    ft_rolling = {chunk: _rolling(chunk, -2.0 + _DIFFUSE_DELTA) for chunk in chunks}

    return DummyDifferentialBackend(
        base=DummyResponses(logprobs=base_lp, rolling=base_rolling),
        ft=DummyResponses(logprobs=ft_lp, rolling=ft_rolling),
    )
84
85
def test_diffuse_forgetting_splits_verdicts() -> None:
    """Diffuse 0.3-nat/token drift: calibration_drift PASS, external FAIL."""
    backend = _diffuse_forgetting_backend()
    spec = SwaySpec.model_validate(
        {
            "version": 1,
            "models": {
                "base": {"base": "b"},
                "ft": {"base": "b", "adapter": "/tmp/a"},
            },
            "suite": [
                # Fixed-threshold paths on both probes — skip null to
                # isolate the claim to the primary metric gates.
                {"name": "cal", "kind": "calibration_drift", "items_limit": 30},
                {"name": "ext", "kind": "external_perplexity", "max_chunks": 4},
            ],
        }
    )
    result = run_suite(spec, backend)
    assert len(result.probes) == 2
    cal, ext = result.probes

    # No single item shifts by the 1.0-nat per-item threshold, and the
    # mean delta (-0.3) clears the -0.5 gate, so calibration_drift PASSes.
    assert cal.verdict == Verdict.PASS, (
        f"calibration_drift should have passed on diffuse drift; "
        f"message={cal.message}, evidence={cal.evidence}"
    )
    assert cal.evidence["fraction_regressed"] == 0.0
    assert -0.35 < cal.evidence["mean_delta_nats"] < -0.25

    # The same -0.3 nats/token sits below external_perplexity's -0.1
    # fixed-threshold gate, so that probe FAILs — the F3 differentiator.
    assert ext.verdict == Verdict.FAIL, (
        f"external_perplexity should have failed on diffuse drift; "
        f"message={ext.message}, evidence={ext.evidence}"
    )
    assert ext.raw is not None
    assert -0.35 < ext.raw < -0.25