| 1 | """S09 prove-the-value: ``external_perplexity`` catches diffuse forgetting |
| 2 | that ``calibration_drift`` misses. |
| 3 | |
| 4 | Motivation (from the sprint file / Audit §F3): ``calibration_drift`` |
| 5 | flags items that regress past a per-item threshold (default 1.0 nats). |
| 6 | A fine-tune that nudges *every* item by a small amount (say 0.3 nats) |
| 7 | slides under that threshold on every item — mean_delta passes |
| 8 | ``assert_mean_delta_gte=-0.5`` comfortably too — so ``calibration_drift`` |
| 9 | reports PASS. That same 0.3-nat-per-token drop on held-out English prose |
is exactly what ``external_perplexity`` measures, and -0.3 < -0.1 (the
``assert_mean_delta_gte=-0.1`` default) → FAIL.
| 12 | |
| 13 | This test constructs a dummy backend that exhibits exactly that |
| 14 | signature across both probes, runs both in one suite, and asserts the |
| 15 | verdict split. That split is the F3 differentiator; without it, the |
| 16 | probe would be a second ``calibration_drift`` with slightly different |
| 17 | inputs. |
| 18 | """ |
| 19 | |
| 20 | from __future__ import annotations |
| 21 | |
| 22 | import numpy as np |
| 23 | |
| 24 | from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses |
| 25 | from dlm_sway.core.result import Verdict |
| 26 | from dlm_sway.core.scoring import RollingLogprob |
| 27 | from dlm_sway.probes._calibration_pack import BUILT_IN_PACK |
| 28 | from dlm_sway.probes._external_corpus import chunk_corpus, load_corpus |
| 29 | from dlm_sway.suite.runner import run as run_suite |
| 30 | from dlm_sway.suite.spec import SwaySpec |
| 31 | |
# Every pack item and every corpus chunk loses 0.3 nats per token on ft.
# This sits:
# - Within calibration_drift's per-item `regression_nats` threshold
#   (1.0 nats) — the delta (-0.3) stays above -1.0 — so no pack item
#   counts as regressed → frac_regressed=0 → PASS.
# - Above calibration_drift's `assert_mean_delta_gte` (-0.5), so the
#   mean-delta gate also passes.
# - Below external_perplexity's `assert_mean_delta_gte` (-0.1), so
#   external_perplexity fails.
_DIFFUSE_DELTA = -0.3
| 41 | |
| 42 | |
| 43 | def _token_estimate(s: str) -> int: |
| 44 | # Mirrors ``calibration_drift._token_estimate``: tokens ≈ len // 4. |
| 45 | return max(1, len(s) // 4) |
| 46 | |
| 47 | |
def _rolling(text: str, per_tok: float) -> RollingLogprob:
    """Build a ``RollingLogprob`` assigning *per_tok* nats to every scored token.

    Token count is the whitespace-split word count (minimum 1). As in real
    rolling scoring, the first token carries no logprob, so only ``n - 1``
    values are recorded.
    """
    n = len(text.split()) or 1
    scored = max(n - 1, 0)
    return RollingLogprob(
        token_ids=np.arange(n, dtype=np.int64),
        logprobs=np.full(scored, per_tok, dtype=np.float32),
        num_tokens=n,
        total_logprob=float(per_tok * scored),
    )
| 58 | |
| 59 | |
def _diffuse_forgetting_backend() -> DummyDifferentialBackend:
    """Backend where ft assigns uniformly lower logprob across:
    - every item in BUILT_IN_PACK (for calibration_drift), and
    - every chunk of the public-domain corpus (for external_perplexity).
    """
    # calibration_drift scores logprob_of(prompt, gold) / tokens, so the
    # per-item delta is scaled by token count to land at -0.3 per token.
    base_lp: dict[tuple[str, str], float] = {
        (prompt, gold): -5.0 * _token_estimate(gold)
        for prompt, gold in BUILT_IN_PACK
    }
    ft_lp: dict[tuple[str, str], float] = {
        key: lp + _DIFFUSE_DELTA * _token_estimate(key[1])
        for key, lp in base_lp.items()
    }

    # external_perplexity scores rolling_logprob(chunk) on corpus chunks.
    chunks = chunk_corpus(
        load_corpus("public_domain_en"), chunk_chars=2048, max_chunks=16
    )
    base_rolling = {chunk: _rolling(chunk, -2.0) for chunk in chunks}
    ft_rolling = {chunk: _rolling(chunk, -2.0 + _DIFFUSE_DELTA) for chunk in chunks}

    return DummyDifferentialBackend(
        base=DummyResponses(logprobs=base_lp, rolling=base_rolling),
        ft=DummyResponses(logprobs=ft_lp, rolling=ft_rolling),
    )
| 84 | |
| 85 | |
def test_diffuse_forgetting_splits_verdicts() -> None:
    """Diffuse -0.3 nat/token drift: calibration_drift PASSes, external FAILs."""
    spec = SwaySpec.model_validate(
        {
            "version": 1,
            "models": {
                "base": {"base": "b"},
                "ft": {"base": "b", "adapter": "/tmp/a"},
            },
            "suite": [
                # Fixed-threshold paths on both probes — skip null to
                # isolate the claim to the primary metric gates.
                {"name": "cal", "kind": "calibration_drift", "items_limit": 30},
                {"name": "ext", "kind": "external_perplexity", "max_chunks": 4},
            ],
        }
    )
    outcome = run_suite(spec, _diffuse_forgetting_backend())
    assert len(outcome.probes) == 2
    cal, ext = outcome.probes

    # calibration_drift PASSes: no individual item crossed the 1.0-nat
    # regression threshold, and mean_delta (-0.3) is above -0.5.
    assert cal.verdict == Verdict.PASS, (
        f"calibration_drift should have passed on diffuse drift; "
        f"message={cal.message}, evidence={cal.evidence}"
    )
    assert cal.evidence["fraction_regressed"] == 0.0
    assert -0.35 < cal.evidence["mean_delta_nats"] < -0.25

    # external_perplexity FAILs: the per-token mean-delta (-0.3) is
    # below the -0.1 fixed-threshold gate.
    assert ext.verdict == Verdict.FAIL, (
        f"external_perplexity should have failed on diffuse drift; "
        f"message={ext.message}, evidence={ext.evidence}"
    )
    assert ext.raw is not None
    assert -0.35 < ext.raw < -0.25