| 1 | """S14 F9 prove-the-value: bootstrap CI width shrinks as N grows. |
| 2 | |
| 3 | The audit's F9 pitch is "every numeric probe should publish a CI so |
| 4 | downstream claims are honest about sampling noise." The narrow-vs- |
| 5 | wide behavior is the concrete evidence that the CI is informative |
| 6 | rather than a fixed-width decoration. |
| 7 | |
| 8 | Test construction: build a dummy backend that produces *per-prompt |
| 9 | varying* divergences by seeding each prompt's ft token distribution |
| 10 | with its hash. Run ``delta_kl`` at N=4 and N=32 on that backend, |
| 11 | assert the N=32 CI is strictly narrower than the N=4 CI — the F9 |
| 12 | claim in concrete form. |
| 13 | |
| 14 | The stock `DummyDifferentialBackend.as_finetuned()` returns the same |
| 15 | synthesized distribution for every prompt, which produces identical |
| 16 | per-prompt divergences and a zero-width CI at any N. This test's |
| 17 | fixture subclasses the dummy to inject per-prompt variation so the |
| 18 | bootstrap has actual dispersion to measure. |
| 19 | """ |
| 20 | |
| 21 | from __future__ import annotations |
| 22 | |
| 23 | import math |
| 24 | from collections.abc import Iterator |
| 25 | from contextlib import contextmanager |
| 26 | |
| 27 | import numpy as np |
| 28 | |
| 29 | from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses, _DummyView |
| 30 | from dlm_sway.core.scoring import TokenDist |
| 31 | from dlm_sway.probes.base import RunContext, build_probe |
| 32 | |
| 33 | |
class _VariableFtView(_DummyView):
    """A dummy view whose ``next_token_dist`` varies by prompt.

    Each prompt gets a deterministically-seeded small perturbation of
    the default ft distribution — enough to produce per-prompt JS
    differences in the 0.001–0.05 range, which is where the bootstrap
    CI narrowing is visible.
    """

    def next_token_dist(self, prompt: str, *, top_k: int = 256) -> TokenDist:
        # A stable digest (hashlib) rather than the built-in ``hash()``:
        # the latter is salted per-process via PYTHONHASHSEED, which
        # would make the per-prompt dispersion drift between pytest runs.
        import hashlib

        dist = super().next_token_dist(prompt, top_k=top_k)
        digest = hashlib.md5(prompt.encode("utf-8")).hexdigest()
        rng = np.random.default_rng(int(digest[:8], 16))
        jitter = rng.normal(0.0, 0.5, size=dist.logprobs.shape).astype(np.float32)
        shifted = dist.logprobs + jitter
        # Softmax-style renormalization over the top-k slice only.
        weights = np.exp(shifted - shifted.max())
        weights /= weights.sum()
        return TokenDist(
            token_ids=dist.token_ids,
            logprobs=np.log(weights).astype(np.float32),
            vocab_size=dist.vocab_size,
            tail_logprob=dist.tail_logprob,
        )
| 64 | |
| 65 | |
class _VariableFtBackend(DummyDifferentialBackend):
    """Dummy backend whose ft view perturbs per-prompt."""

    @contextmanager
    def as_finetuned(self) -> Iterator[_DummyView]:
        # Same enter/exit protocol as the stock backend; only the view
        # class differs, injecting the per-prompt variation.
        self._enter("ft")
        try:
            yield _VariableFtView("ft", self._ft_r, inst=self._inst)
        finally:
            self._exit()
| 77 | |
| 78 | |
def _run_delta_kl(n_prompts: int) -> tuple[float, float, float]:
    """Run delta_kl with ``n_prompts`` synthesized prompts. Returns
    ``(raw, ci_lo, ci_hi)``.
    """
    prompts = [f"prompt-{i:03d}" for i in range(n_prompts)]
    probe, spec = build_probe(
        {"name": f"dk_{n_prompts}", "kind": "delta_kl", "prompts": prompts}
    )
    backend = _VariableFtBackend(base=DummyResponses(), ft=DummyResponses())
    result = probe.run(spec, RunContext(backend=backend))
    assert result.raw is not None, "dummy backend delta_kl should produce a raw value"
    assert result.ci_95 is not None, "bootstrap_ci should land on delta_kl output"
    lo, hi = result.ci_95
    return result.raw, lo, hi
| 92 | |
| 93 | |
def test_ci_width_shrinks_with_more_prompts() -> None:
    """The F9 claim: `delta_kl = 0.05 [0.01, 0.11]` at N=4 narrows to
    something tighter at N=32."""
    raw_n4, lo_n4, hi_n4 = _run_delta_kl(n_prompts=4)
    raw_n32, lo_n32, hi_n32 = _run_delta_kl(n_prompts=32)

    width_n4 = hi_n4 - lo_n4
    width_n32 = hi_n32 - lo_n32
    # Guard the denominator so a (theoretically) zero-width CI cannot
    # divide by zero in the ratio check below.
    ratio = width_n4 / max(width_n32, 1e-9)

    # Both raws are positive divergences, live in the same order of
    # magnitude, and bracket their own raw value.
    assert raw_n4 > 0
    assert raw_n32 > 0
    assert lo_n4 <= raw_n4 <= hi_n4
    assert lo_n32 <= raw_n32 <= hi_n32

    # The N=32 CI is strictly tighter than N=4. Theory predicts the
    # CI half-width scales as 1/sqrt(N), so N=32 should be roughly
    # sqrt(8) ≈ 2.8× narrower than N=4.
    assert width_n32 < width_n4, (
        f"expected width_32 < width_4; got {width_n32:.4f} >= {width_n4:.4f} "
        f"(CIs: N=4 {[lo_n4, hi_n4]}, N=32 {[lo_n32, hi_n32]})"
    )
    # Loose additional check: the narrowing factor is meaningfully
    # bigger than 1.0 — the CI isn't just slightly tighter from
    # RNG noise.
    assert ratio > 1.5, (
        f"expected N=4 width at least 1.5× N=32 width; got ratio "
        f"{ratio:.2f}"
    )
    # Sanity on magnitudes — widths are positive and finite.
    for width in (width_n4, width_n32):
        assert width > 0
        assert math.isfinite(width)