@@ -0,0 +1,124 @@ |
| 1 | +"""S09 prove-the-value: ``external_perplexity`` catches diffuse forgetting |
| 2 | +that ``calibration_drift`` misses. |
| 3 | + |
| 4 | +Motivation (from the sprint file / Audit §F3): ``calibration_drift`` |
| 5 | +flags items that regress past a per-item threshold (default 1.0 nats). |
| 6 | +A fine-tune that nudges *every* item by a small amount (say 0.3 nats) |
| 7 | +slides under that threshold on every item — mean_delta passes |
| 8 | +``assert_mean_delta_gte=-0.5`` comfortably too — so ``calibration_drift`` |
| 9 | +reports PASS. That same 0.3-nat-per-token drop on held-out English prose |
| 10 | +is exactly what ``external_perplexity`` measures, and -0.3 < -0.1 (the
| 11 | +``assert_mean_delta_gte=-0.1`` default) → FAIL. |
| 12 | + |
| 13 | +This test constructs a dummy backend that exhibits exactly that |
| 14 | +signature across both probes, runs both in one suite, and asserts the |
| 15 | +verdict split. That split is the F3 differentiator; without it, the |
| 16 | +probe would be a second ``calibration_drift`` with slightly different |
| 17 | +inputs. |
| 18 | +""" |
| 19 | + |
| 20 | +from __future__ import annotations |
| 21 | + |
| 22 | +import numpy as np |
| 23 | + |
| 24 | +from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses |
| 25 | +from dlm_sway.core.result import Verdict |
| 26 | +from dlm_sway.core.scoring import RollingLogprob |
| 27 | +from dlm_sway.probes._calibration_pack import BUILT_IN_PACK |
| 28 | +from dlm_sway.probes._external_corpus import chunk_corpus, load_corpus |
| 29 | +from dlm_sway.suite.runner import run as run_suite |
| 30 | +from dlm_sway.suite.spec import SwaySpec |
| 31 | + |
| 32 | +# Every pack item and every corpus chunk loses 0.3 nats per token on ft. |
| 33 | +# This sits: |
| 34 | +# - Above the -1.0-nat delta implied by calibration_drift's `regression_nats`
| 35 | +# threshold (1.0), so no pack item counts as regressed → frac_regressed=0 → PASS.
| 36 | +# - Above calibration_drift's `assert_mean_delta_gte` (-0.5), so the |
| 37 | +# mean-delta gate also passes. |
| 38 | +# - Below external_perplexity's `assert_mean_delta_gte` (-0.1), so |
| 39 | +# external_perplexity fails. |
| 40 | +_DIFFUSE_DELTA = -0.3 |
| 41 | + |
| 42 | + |
| 43 | +def _token_estimate(s: str) -> int: |
| 44 | + # Mirrors ``calibration_drift._token_estimate``: tokens ≈ len // 4. |
| 45 | + return max(1, len(s) // 4) |
| 46 | + |
| 47 | + |
def _rolling(text: str, per_tok: float) -> RollingLogprob:
    """Build a synthetic ``RollingLogprob`` with a constant per-token logprob.

    ``text`` is split on whitespace to get a token count (at least 1).
    The logprob array has one entry fewer than the token count, each equal
    to ``per_tok``, and ``total_logprob`` is the sum of those entries.
    """
    n_tokens = max(len(text.split()), 1)
    # One fewer scored position than tokens; clamp kept for parity with
    # the token-count floor above.
    n_scored = max(n_tokens - 1, 0)
    return RollingLogprob(
        token_ids=np.arange(n_tokens, dtype=np.int64),
        logprobs=np.full(n_scored, per_tok, dtype=np.float32),
        num_tokens=n_tokens,
        total_logprob=float(per_tok * n_scored),
    )
| 58 | + |
| 59 | + |
def _diffuse_forgetting_backend() -> DummyDifferentialBackend:
    """Return a dummy backend whose ft side is uniformly worse than base.

    The same per-token shift (``_DIFFUSE_DELTA``) is applied to:
      - every ``(prompt, gold)`` item in ``BUILT_IN_PACK`` (the
        calibration_drift inputs), and
      - every chunk of the ``public_domain_en`` corpus (the
        external_perplexity inputs).
    """
    # calibration_drift divides logprob_of(prompt, gold) by a token
    # estimate, so totals are stored pre-multiplied by that estimate to
    # make the per-token delta come out at _DIFFUSE_DELTA.
    token_counts = {
        (prompt, gold): _token_estimate(gold) for prompt, gold in BUILT_IN_PACK
    }
    base_lp: dict[tuple[str, str], float] = {
        item: -5.0 * n_tok for item, n_tok in token_counts.items()
    }
    ft_lp: dict[tuple[str, str], float] = {
        item: base_lp[item] + _DIFFUSE_DELTA * n_tok
        for item, n_tok in token_counts.items()
    }

    # external_perplexity goes through rolling_logprob(chunk), so each
    # corpus chunk gets a base entry and a shifted ft entry.
    chunks = chunk_corpus(
        load_corpus("public_domain_en"), chunk_chars=2048, max_chunks=16
    )
    base_rolling = {chunk: _rolling(chunk, -2.0) for chunk in chunks}
    ft_rolling = {chunk: _rolling(chunk, -2.0 + _DIFFUSE_DELTA) for chunk in chunks}

    base_responses = DummyResponses(logprobs=base_lp, rolling=base_rolling)
    ft_responses = DummyResponses(logprobs=ft_lp, rolling=ft_rolling)
    return DummyDifferentialBackend(base=base_responses, ft=ft_responses)
| 84 | + |
| 85 | + |
def test_diffuse_forgetting_splits_verdicts() -> None:
    """Run both probes on the diffuse-drift backend and check the verdicts split.

    calibration_drift is expected to PASS and external_perplexity to FAIL
    on the same uniform -0.3 nats-per-token shift.
    """
    backend = _diffuse_forgetting_backend()

    models = {
        "base": {"base": "b"},
        "ft": {"base": "b", "adapter": "/tmp/a"},
    }
    # Both probes run on their fixed-threshold paths (no null
    # calibration), so the verdicts depend only on the primary gates.
    suite = [
        {"name": "cal", "kind": "calibration_drift", "items_limit": 30},
        {"name": "ext", "kind": "external_perplexity", "max_chunks": 4},
    ]
    raw_spec = SwaySpec.model_validate({"version": 1, "models": models, "suite": suite})

    result = run_suite(raw_spec, backend)
    assert len(result.probes) == 2
    cal_result, ext_result = result.probes[0], result.probes[1]

    # calibration_drift: no item crosses the 1.0-nat regression
    # threshold and the mean delta (-0.3) stays above -0.5, so PASS.
    assert cal_result.verdict == Verdict.PASS, (
        f"calibration_drift should have passed on diffuse drift; "
        f"message={cal_result.message}, evidence={cal_result.evidence}"
    )
    assert cal_result.evidence["fraction_regressed"] == 0.0
    cal_mean_delta = cal_result.evidence["mean_delta_nats"]
    assert -0.35 < cal_mean_delta < -0.25

    # external_perplexity: the per-token mean delta (-0.3) is below the
    # -0.1 fixed-threshold gate, so FAIL.
    assert ext_result.verdict == Verdict.FAIL, (
        f"external_perplexity should have failed on diffuse drift; "
        f"message={ext_result.message}, evidence={ext_result.evidence}"
    )
    ext_raw = ext_result.raw
    assert ext_raw is not None
    assert -0.35 < ext_raw < -0.25