Python · 11016 bytes Raw Blame History
1 """Tests for the multi-rank null-adapter calibration path (S10 / F4)."""
2
3 from __future__ import annotations
4
5 import math
6
7 import numpy as np
8 import pytest
9
10 from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
11 from dlm_sway.core.result import Verdict
12 from dlm_sway.probes._zscore import format_z_profile, z_scores_by_rank
13 from dlm_sway.probes.base import RunContext, build_probe
14 from dlm_sway.suite.runner import run as run_suite
15 from dlm_sway.suite.spec import SwaySpec
16
17
def _backend() -> DummyDifferentialBackend:
    """Return a fresh dummy base/ft differential backend for one test."""
    return DummyDifferentialBackend(
        base=DummyResponses(),
        ft=DummyResponses(),
    )
20
21
class TestDummyBackendRankScale:
    """Behavior of the ``rank_scale`` kwarg on the dummy backend's null view."""

    def test_rank_scale_scales_noise_std(self) -> None:
        """sqrt(rank_scale) scales the null-view perturbation std."""
        backend = _backend()
        prompt = "hello"

        def _std_at(rank_scale: float) -> float:
            # 40 seeded draws keep the empirical std stable enough to
            # compare across rank scales.
            samples = []
            for seed in range(40):
                with backend.as_null_adapter(seed=seed, rank_scale=rank_scale) as view:
                    samples.append(view.next_token_dist(prompt, top_k=8).logprobs)
            stacked = np.asarray(samples)
            # Std across seeds at each position, averaged over positions.
            return float(np.mean(np.std(stacked, axis=0)))

        # Same sampling order as before: 1.0, then 0.5, then 2.0.
        std_1, std_half, std_2 = _std_at(1.0), _std_at(0.5), _std_at(2.0)
        # std ∝ sqrt(rank_scale); tolerances stay loose because the top-k
        # renorm and discrete seeding add noise of their own.
        assert std_half < std_1 < std_2
        ratio_up = std_2 / std_1
        ratio_down = std_1 / std_half
        # Both ratios should land near sqrt(2) ≈ 1.41, within 20%.
        assert 1.15 < ratio_up < 1.75, f"2x ratio={ratio_up}"
        assert 1.15 < ratio_down < 1.75, f"0.5x ratio={ratio_down}"

    @pytest.mark.parametrize("bad", [0.0, -1.0, float("nan"), float("inf")])
    def test_rejects_non_positive_rank_scale(self, bad: float) -> None:
        """Zero, negative, and non-finite rank scales raise ValueError."""
        with pytest.raises(ValueError, match="rank_scale"):
            with _backend().as_null_adapter(seed=0, rank_scale=bad):
                pass

    def test_rank_scale_1_preserves_pre_s10_behavior(self) -> None:
        """rank_scale=1.0 → identical output to calling without the kwarg."""
        backend = _backend()
        with backend.as_null_adapter(seed=7) as implicit_view:
            without_kwarg = implicit_view.next_token_dist("hello", top_k=8)
        with backend.as_null_adapter(seed=7, rank_scale=1.0) as explicit_view:
            with_kwarg = explicit_view.next_token_dist("hello", top_k=8)
        np.testing.assert_array_equal(without_kwarg.logprobs, with_kwarg.logprobs)
66
67
class TestNullProbeMultiRank:
    """The null_adapter probe groups calibration stats per rank multiplier."""

    @staticmethod
    def _run_probe(cfg):
        """Build the probe described by *cfg* and run it on a fresh backend."""
        probe, spec = build_probe(cfg)
        return probe.run(spec, RunContext(backend=_backend()))

    def test_single_rank_default_matches_pre_s10(self) -> None:
        """Default rank_multipliers=[1.0] produces the same shape of
        evidence as pre-S10 + a null_stats_by_rank with one entry."""
        result = self._run_probe(
            {
                "name": "null",
                "kind": "null_adapter",
                "runs": 3,
                "calibrate_kinds": ["delta_kl"],
            }
        )
        assert result.verdict == Verdict.PASS
        stats = result.evidence["null_stats"]
        by_rank = result.evidence["null_stats_by_rank"]
        assert "delta_kl" in stats
        # Exactly one rank group, and it mirrors the flat stats.
        assert set(by_rank) == {"rank_1.00"}
        assert by_rank["rank_1.00"] == stats

    def test_three_ranks_produce_three_groups(self) -> None:
        """Each configured multiplier yields its own calibration group."""
        result = self._run_probe(
            {
                "name": "null",
                "kind": "null_adapter",
                "runs": 3,
                "rank_multipliers": [0.5, 1.0, 2.0],
                "calibrate_kinds": ["delta_kl"],
            }
        )
        assert result.verdict == Verdict.PASS
        by_rank = result.evidence["null_stats_by_rank"]
        assert set(by_rank) == {"rank_0.50", "rank_1.00", "rank_2.00"}
        for rkey, kind_stats in by_rank.items():
            assert "delta_kl" in kind_stats, f"{rkey} missing delta_kl"
            assert kind_stats["delta_kl"]["std"] > 0.0

    def test_rank_0_and_negative_rejected(self) -> None:
        """A non-positive multiplier turns the probe result into ERROR."""
        result = self._run_probe(
            {
                "name": "null",
                "kind": "null_adapter",
                "runs": 2,
                "rank_multipliers": [1.0, -0.5],
                "calibrate_kinds": ["delta_kl"],
            }
        )
        assert result.verdict == Verdict.ERROR
        assert "rank_multipliers" in (result.message or "")

    def test_higher_rank_has_larger_null_std(self) -> None:
        """A 2x rank null should show more delta_kl variance than a 0.5x one."""
        result = self._run_probe(
            {
                "name": "null",
                "kind": "null_adapter",
                "runs": 5,
                "rank_multipliers": [0.5, 2.0],
                "calibrate_kinds": ["delta_kl"],
                "cache": False,
            }
        )
        by_rank = result.evidence["null_stats_by_rank"]
        std_half = by_rank["rank_0.50"]["delta_kl"]["std"]
        std_2 = by_rank["rank_2.00"]["delta_kl"]["std"]
        assert std_2 > std_half, f"2x std={std_2} not > 0.5x std={std_half}"
145
146
class TestRunnerThreadsNullStatsByRank:
    """The suite runner threads per-rank null stats into downstream probes."""

    def test_delta_kl_emits_z_by_rank(self) -> None:
        """null_adapter → delta_kl: evidence carries z_by_rank with three entries."""
        spec_payload = {
            "version": 1,
            "models": {
                "base": {"base": "b"},
                "ft": {"base": "b", "adapter": "/tmp/a"},
            },
            "suite": [
                {
                    "name": "null",
                    "kind": "null_adapter",
                    "runs": 3,
                    "rank_multipliers": [0.5, 1.0, 2.0],
                    "cache": False,
                },
                {
                    "name": "dk",
                    "kind": "delta_kl",
                    "prompts": ["p1", "p2"],
                    "assert_z_gte": -100.0,  # permissive
                },
            ],
        }
        result = run_suite(SwaySpec.model_validate(spec_payload), _backend())
        assert len(result.probes) == 2
        z_by_rank = result.probes[1].evidence.get("z_by_rank")
        assert z_by_rank is not None
        assert set(z_by_rank) == {"rank_0.50", "rank_1.00", "rank_2.00"}
        # No rank may carry a nan/inf z.
        assert all(math.isfinite(z) for z in z_by_rank.values())
184
185
class TestZScoreHelpers:
    """Unit tests for the ``probes._zscore`` helper functions."""

    def test_z_scores_by_rank_positive_sign(self) -> None:
        """Higher-is-better: z = (raw - mean) / std per rank."""
        stats_by_rank = {
            "rank_1.00": {"mean": 0.5, "std": 0.1, "n": 3.0},
            "rank_0.50": {"mean": 0.3, "std": 0.1, "n": 3.0},
        }
        z = z_scores_by_rank(1.0, stats_by_rank, sign=+1)
        assert z is not None
        # (1.0 - 0.5)/0.1 == 5 and (1.0 - 0.3)/0.1 == 7.
        for rkey, expected in (("rank_1.00", 5.0), ("rank_0.50", 7.0)):
            assert abs(z[rkey] - expected) < 1e-9

    def test_z_scores_by_rank_negative_sign(self) -> None:
        """Lower-is-better probes invert the sign."""
        stats_by_rank = {"rank_1.00": {"mean": 0.5, "std": 0.1, "n": 3.0}}
        z = z_scores_by_rank(0.1, stats_by_rank, sign=-1)
        assert z is not None
        assert abs(z["rank_1.00"] - 4.0) < 1e-9  # -((0.1 - 0.5)/0.1) = 4

    def test_z_scores_by_rank_none_on_empty(self) -> None:
        """Both None and an empty mapping short-circuit to None."""
        for empty in (None, {}):
            assert z_scores_by_rank(0.0, empty) is None

    def test_z_scores_by_rank_drops_degenerate_ranks(self) -> None:
        """Ranks with std < MIN_STD silently drop out."""
        stats_by_rank = {
            "rank_1.00": {"mean": 0.0, "std": 0.1, "n": 3.0},
            "rank_0.50": {"mean": 0.0, "std": 1e-9, "n": 3.0},  # degenerate
        }
        z = z_scores_by_rank(1.0, stats_by_rank, sign=+1)
        assert z is not None
        assert set(z) == {"rank_1.00"}

    def test_format_z_profile_readable_labels(self) -> None:
        """Each rank renders as a signed sigma value with an Nx label."""
        rendered = format_z_profile(
            {"rank_1.00": 4.2, "rank_0.50": 6.8, "rank_2.00": 2.1},
        )
        for fragment in ("+4.20σ @ 1x", "+6.80σ @ 0.5x", "+2.10σ @ 2x", " / "):
            assert fragment in rendered

    def test_format_z_profile_empty(self) -> None:
        """None and {} both format to the empty string."""
        assert format_z_profile(None) == ""
        assert format_z_profile({}) == ""
232
233
class TestProveTheValueRankSaturation:
    """S10 prove-the-value (§F4): the rank profile exposes adapter saturation.

    The dummy backend's null view perturbs ``next_token_dist`` with noise
    scaled by ``sqrt(rank_scale)``. That widens or narrows the null
    distribution of ``delta_kl``'s raw metric (mean JS divergence across
    prompts): a smaller ``rank_scale`` yields a tighter null and therefore
    a larger z at the same adapter divergence.

    Test: hold the adapter fixed (ft responses with a known divergence
    from base), sweep rank_scale over {0.5, 1.0, 2.0}, and require
    z_0.5 > z_1 > z_2 — the signature of a rank-sized adapter: strong
    signal against a smaller-rank null, weak against a larger-rank null.
    """

    def test_rank_profile_monotone_in_inverse_rank(self) -> None:
        spec_payload = {
            "version": 1,
            "models": {
                "base": {"base": "b"},
                "ft": {"base": "b", "adapter": "/tmp/a"},
            },
            "suite": [
                {
                    "name": "null",
                    "kind": "null_adapter",
                    "runs": 5,
                    "rank_multipliers": [0.5, 1.0, 2.0],
                    "cache": False,
                },
                {
                    "name": "dk",
                    "kind": "delta_kl",
                    "prompts": ["p1", "p2", "p3", "p4"],
                    "assert_z_gte": -100.0,  # permissive
                },
            ],
        }
        result = run_suite(SwaySpec.model_validate(spec_payload), _backend())
        z_by_rank = result.probes[1].evidence["z_by_rank"]
        z_half, z_1, z_2 = (
            z_by_rank["rank_0.50"],
            z_by_rank["rank_1.00"],
            z_by_rank["rank_2.00"],
        )
        # Smaller rank → tighter null → larger (more positive) z.
        assert z_half > z_1 > z_2, (
            f"expected z monotone-decreasing in rank; got "
            f"0.5x={z_half:.2f}, 1x={z_1:.2f}, 2x={z_2:.2f}"
        )