tenseleyflow/sway / c5ff2d0

Browse files

tests/unit: prove-the-value — diffuse forgetting splits external_perplexity vs calibration_drift verdicts

Authored by espadonne
SHA
c5ff2d0cee6800962ae412c76626f670b47138ef
Parents
00b870b
Tree
ddf66a1

1 changed file

StatusFile+-
A tests/unit/test_ext_ppl_vs_calibration_drift.py 124 0
tests/unit/test_ext_ppl_vs_calibration_drift.pyadded
@@ -0,0 +1,124 @@
1
+"""S09 prove-the-value: ``external_perplexity`` catches diffuse forgetting
2
+that ``calibration_drift`` misses.
3
+
4
+Motivation (from the sprint file / Audit §F3): ``calibration_drift``
5
+flags items that regress past a per-item threshold (default 1.0 nats).
6
+A fine-tune that nudges *every* item by a small amount (say 0.3 nats)
7
+slides under that threshold on every item — mean_delta passes
8
+``assert_mean_delta_gte=-0.5`` comfortably too — so ``calibration_drift``
9
+reports PASS. That same 0.3-nat-per-token drop on held-out English prose
10
+is exactly what ``external_perplexity`` measures, and -0.3 < -0.1 (the
11
+``assert_mean_delta_gte=-0.1`` default) → FAIL.
12
+
13
+This test constructs a dummy backend that exhibits exactly that
14
+signature across both probes, runs both in one suite, and asserts the
15
+verdict split. That split is the F3 differentiator; without it, the
16
+probe would be a second ``calibration_drift`` with slightly different
17
+inputs.
18
+"""
19
+
20
+from __future__ import annotations
21
+
22
+import numpy as np
23
+
24
+from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
25
+from dlm_sway.core.result import Verdict
26
+from dlm_sway.core.scoring import RollingLogprob
27
+from dlm_sway.probes._calibration_pack import BUILT_IN_PACK
28
+from dlm_sway.probes._external_corpus import chunk_corpus, load_corpus
29
+from dlm_sway.suite.runner import run as run_suite
30
+from dlm_sway.suite.spec import SwaySpec
31
+
32
# Every pack item and every corpus chunk loses 0.3 nats per token on ft.
# This per-token delta of -0.3 sits:
#   - Below calibration_drift's `regression_nats` threshold (1.0 nats),
#     so no pack item counts as regressed → frac_regressed=0 → PASS.
#   - Above calibration_drift's `assert_mean_delta_gte` (-0.5), so the
#     mean-delta gate also passes.
#   - Below external_perplexity's `assert_mean_delta_gte` (-0.1), so
#     external_perplexity fails.
_DIFFUSE_DELTA = -0.3
41
+
42
+
43
+def _token_estimate(s: str) -> int:
44
+    # Mirrors ``calibration_drift._token_estimate``: tokens ≈ len // 4.
45
+    return max(1, len(s) // 4)
46
+
47
+
48
def _rolling(text: str, per_tok: float) -> RollingLogprob:
    """Build a ``RollingLogprob`` giving every scored token *per_tok* nats.

    Tokenization is whitespace-split; the first token carries no logprob
    (standard rolling convention), so only ``n - 1`` scores are emitted.
    """
    n_tokens = len(text.split()) or 1  # empty text still counts one token
    n_scored = n_tokens - 1  # n_tokens >= 1, so this is never negative
    scores = np.full(n_scored, per_tok, dtype=np.float32)
    return RollingLogprob(
        token_ids=np.arange(n_tokens, dtype=np.int64),
        logprobs=scores,
        num_tokens=n_tokens,
        total_logprob=float(per_tok * n_scored),
    )
58
+
59
+
60
def _diffuse_forgetting_backend() -> DummyDifferentialBackend:
    """Build a dummy backend exhibiting uniform diffuse forgetting.

    The ft model scores everything ``_DIFFUSE_DELTA`` nats per token below
    base, on both surfaces the two probes measure:
    - every item of ``BUILT_IN_PACK`` (read by calibration_drift), and
    - every chunk of the public-domain corpus (read by external_perplexity).
    """
    # calibration_drift divides logprob_of(prompt, gold) by a token
    # estimate, so each per-item delta is scaled by that same estimate to
    # land at exactly _DIFFUSE_DELTA nats per token.
    base_lp: dict[tuple[str, str], float] = {
        (prompt, gold): -5.0 * _token_estimate(gold)
        for prompt, gold in BUILT_IN_PACK
    }
    ft_lp: dict[tuple[str, str], float] = {
        (prompt, gold): lp + _DIFFUSE_DELTA * _token_estimate(gold)
        for (prompt, gold), lp in base_lp.items()
    }

    # external_perplexity consumes rolling_logprob(chunk) directly.
    chunks = chunk_corpus(
        load_corpus("public_domain_en"), chunk_chars=2048, max_chunks=16
    )
    return DummyDifferentialBackend(
        base=DummyResponses(
            logprobs=base_lp,
            rolling={c: _rolling(c, -2.0) for c in chunks},
        ),
        ft=DummyResponses(
            logprobs=ft_lp,
            rolling={c: _rolling(c, -2.0 + _DIFFUSE_DELTA) for c in chunks},
        ),
    )
84
+
85
+
86
def test_diffuse_forgetting_splits_verdicts() -> None:
    """One suite run, two probes, opposite verdicts on the same drift."""
    spec = SwaySpec.model_validate(
        {
            "version": 1,
            "models": {
                "base": {"base": "b"},
                "ft": {"base": "b", "adapter": "/tmp/a"},
            },
            "suite": [
                # Both probes on their fixed-threshold paths; null is
                # skipped so the verdict split comes purely from the
                # primary metric gates.
                {"name": "cal", "kind": "calibration_drift", "items_limit": 30},
                {"name": "ext", "kind": "external_perplexity", "max_chunks": 4},
            ],
        }
    )
    result = run_suite(spec, _diffuse_forgetting_backend())

    assert len(result.probes) == 2
    cal_result, ext_result = result.probes

    # calibration_drift PASSes: -0.3 nats/item-token never crosses the
    # 1.0-nat per-item regression threshold, and the mean delta clears
    # the -0.5 gate.
    assert cal_result.verdict == Verdict.PASS, (
        f"calibration_drift should have passed on diffuse drift; "
        f"message={cal_result.message}, evidence={cal_result.evidence}"
    )
    assert cal_result.evidence["fraction_regressed"] == 0.0
    assert -0.35 < cal_result.evidence["mean_delta_nats"] < -0.25

    # external_perplexity FAILs: the same -0.3 per-token mean delta sits
    # under its -0.1 fixed-threshold gate.
    assert ext_result.verdict == Verdict.FAIL, (
        f"external_perplexity should have failed on diffuse drift; "
        f"message={ext_result.message}, evidence={ext_result.evidence}"
    )
    assert ext_result.raw is not None
    assert -0.35 < ext_result.raw < -0.25