| 1 | """S09 prove-the-value: ``external_perplexity`` catches diffuse forgetting |
| 2 | that ``calibration_drift`` misses. |
| 3 | |
| 4 | Motivation (from the sprint file / Audit §F3): ``calibration_drift`` |
| 5 | flags items that regress past a per-item threshold (default 1.0 nats). |
| 6 | A fine-tune that nudges *every* item by a small amount (say 0.3 nats) |
| 7 | slides under that threshold on every item — mean_delta passes |
| 8 | ``assert_mean_delta_gte=-0.5`` comfortably too — so ``calibration_drift`` |
| 9 | reports PASS. That same 0.3-nat-per-token drop on held-out English prose |
is exactly what ``external_perplexity`` measures, and -0.3 < -0.1 (the
``assert_mean_delta_gte=-0.1`` default) → FAIL.
| 12 | |
| 13 | This test constructs a dummy backend that exhibits exactly that |
| 14 | signature across both probes, runs both in one suite, and asserts the |
| 15 | verdict split. That split is the F3 differentiator; without it, the |
| 16 | probe would be a second ``calibration_drift`` with slightly different |
| 17 | inputs. |
| 18 | """ |
| 19 | |
| 20 | from __future__ import annotations |
| 21 | |
| 22 | import numpy as np |
| 23 | |
| 24 | from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses |
| 25 | from dlm_sway.core.result import Verdict |
| 26 | from dlm_sway.core.scoring import RollingLogprob |
| 27 | from dlm_sway.probes._calibration_pack import BUILT_IN_PACK |
| 28 | from dlm_sway.probes._external_corpus import chunk_corpus, load_corpus |
| 29 | from dlm_sway.suite.runner import run as run_suite |
| 30 | from dlm_sway.suite.spec import SwaySpec |
| 31 | |
# Every pack item and every corpus chunk loses 0.3 nats per token on ft.
# This sits:
# - Within calibration_drift's per-item `regression_nats` threshold
#   (1.0 nats) — the delta (-0.3) stays above -1.0 — so no pack item
#   counts as regressed → frac_regressed=0 → PASS.
# - Above calibration_drift's `assert_mean_delta_gte` (-0.5), so the
#   mean-delta gate also passes.
# - Below external_perplexity's `assert_mean_delta_gte` (-0.1), so
#   external_perplexity fails.
_DIFFUSE_DELTA = -0.3
| 41 | |
| 42 | |
| 43 | def _token_estimate(s: str) -> int: |
| 44 | # Mirrors ``calibration_drift._token_estimate``: tokens ≈ len // 4. |
| 45 | return max(1, len(s) // 4) |
| 46 | |
| 47 | |
def _rolling(text: str, per_tok: float) -> RollingLogprob:
    """Build a ``RollingLogprob`` assigning *per_tok* nats to every scored token.

    Token count is the whitespace-split word count (minimum 1). As in real
    rolling scoring, the first token carries no logprob, so only ``n - 1``
    values are recorded.
    """
    n = len(text.split()) or 1
    scored = max(n - 1, 0)
    return RollingLogprob(
        token_ids=np.arange(n, dtype=np.int64),
        logprobs=np.full(scored, per_tok, dtype=np.float32),
        num_tokens=n,
        total_logprob=float(per_tok * scored),
    )
| 58 | |
| 59 | |
def _diffuse_forgetting_backend() -> DummyDifferentialBackend:
    """Backend where ft assigns uniformly lower logprob across:
    - every item in BUILT_IN_PACK (for calibration_drift), and
    - every chunk of the public-domain corpus (for external_perplexity).
    """
    # calibration_drift scores logprob_of(prompt, gold) / tokens, so the
    # per-item delta is scaled by token count to land at -0.3 per token.
    base_lp: dict[tuple[str, str], float] = {
        (prompt, gold): -5.0 * _token_estimate(gold)
        for prompt, gold in BUILT_IN_PACK
    }
    ft_lp: dict[tuple[str, str], float] = {
        key: lp + _DIFFUSE_DELTA * _token_estimate(key[1])
        for key, lp in base_lp.items()
    }

    # external_perplexity scores rolling_logprob(chunk) on corpus chunks.
    chunks = chunk_corpus(
        load_corpus("public_domain_en"), chunk_chars=2048, max_chunks=16
    )
    base_rolling = {chunk: _rolling(chunk, -2.0) for chunk in chunks}
    ft_rolling = {chunk: _rolling(chunk, -2.0 + _DIFFUSE_DELTA) for chunk in chunks}

    return DummyDifferentialBackend(
        base=DummyResponses(logprobs=base_lp, rolling=base_rolling),
        ft=DummyResponses(logprobs=ft_lp, rolling=ft_rolling),
    )
| 84 | |
| 85 | |
def test_diffuse_forgetting_splits_verdicts() -> None:
    """Diffuse -0.3 nat/token drift: calibration_drift PASSes, external FAILs."""
    spec = SwaySpec.model_validate(
        {
            "version": 1,
            "models": {
                "base": {"base": "b"},
                "ft": {"base": "b", "adapter": "/tmp/a"},
            },
            "suite": [
                # Fixed-threshold paths on both probes — skip null to
                # isolate the claim to the primary metric gates.
                {"name": "cal", "kind": "calibration_drift", "items_limit": 30},
                {"name": "ext", "kind": "external_perplexity", "max_chunks": 4},
            ],
        }
    )
    outcome = run_suite(spec, _diffuse_forgetting_backend())
    assert len(outcome.probes) == 2
    cal, ext = outcome.probes

    # calibration_drift PASSes: no individual item crossed the 1.0-nat
    # regression threshold, and mean_delta (-0.3) is above -0.5.
    assert cal.verdict == Verdict.PASS, (
        f"calibration_drift should have passed on diffuse drift; "
        f"message={cal.message}, evidence={cal.evidence}"
    )
    assert cal.evidence["fraction_regressed"] == 0.0
    assert -0.35 < cal.evidence["mean_delta_nats"] < -0.25

    # external_perplexity FAILs: the per-token mean-delta (-0.3) is
    # below the -0.1 fixed-threshold gate.
    assert ext.verdict == Verdict.FAIL, (
        f"external_perplexity should have failed on diffuse drift; "
        f"message={ext.message}, evidence={ext.evidence}"
    )
    assert ext.raw is not None
    assert -0.35 < ext.raw < -0.25