sway Public

Watch 0 Fork 0 Star 0

Python · 5549 bytes Raw Blame History

  
        1
        """Tests for :mod:`dlm_sway.core.stats` (S14 / F9)."""
      
        2
        
        3
        from __future__ import annotations
      
        4
        
        5
        import math
      
        6
        
        7
        import numpy as np
      
        8
        
        9
        from dlm_sway.core.stats import bootstrap_ci
      
        10
        
        11
        
        12
        class TestBootstrapCi:
            """Checks on ``bootstrap_ci``: interval sanity on seeded data,
            ``None`` for rejected input, and seed handling."""

            def test_brackets_the_mean_on_gaussian(self) -> None:
                """For 100 seeded draws from N(0, 1) the interval must contain
                the sample mean and stay narrow.

                The draw and the bootstrap both use seed 0, so the outcome is
                repeatable — enough for a regression lock.
                """
                draws = np.random.default_rng(0).normal(0.0, 1.0, size=100)
                interval = bootstrap_ci(draws, seed=0)
                assert interval is not None
                lower, upper = interval
                assert lower < draws.mean() < upper
                # With unit variance and n=100 the standard error of the mean is
                # about 0.1, so the whole interval should span well under 0.6.
                width = upper - lower
                assert width < 0.6

            def test_degenerate_constant_samples_zero_width(self) -> None:
                """Four identical samples must give a zero-width interval at that
                value; the call passes no seed and still expects an exact result.
                """
                constant = [0.5, 0.5, 0.5, 0.5]
                assert bootstrap_ci(constant) == (0.5, 0.5)

            def test_nonfinite_samples_return_none(self) -> None:
                """A NaN or an infinity among the samples yields ``None``."""
                for bad_value in (float("nan"), float("inf")):
                    assert bootstrap_ci([1.0, bad_value, 3.0]) is None

            def test_empty_returns_none(self) -> None:
                """An empty list and an empty float64 array both yield ``None``."""
                for nothing in ([], np.array([], dtype=np.float64)):
                    assert bootstrap_ci(nothing) is None

            def test_confidence_outside_0_1_returns_none(self) -> None:
                """Confidence levels of 0.0, 1.0 and -0.5 are all rejected."""
                for level in (0.0, 1.0, -0.5):
                    assert bootstrap_ci([1.0, 2.0, 3.0], confidence=level) is None

            def test_seed_reproducibility(self) -> None:
                """Two calls with the same data and seed return equal results."""
                data = [1.2, 3.4, 5.6, 2.1, 4.5, 3.3, 2.8, 4.1]
                assert bootstrap_ci(data, seed=42) == bootstrap_ci(data, seed=42)

            def test_seed_differs_produces_different_bounds(self) -> None:
                """Seeds 1 and 2 must give different bounds for the same eight
                samples — a smoke check that the ``seed`` argument has an
                effect, not a statistical correctness test."""
                data = [1.2, 3.4, 5.6, 2.1, 4.5, 3.3, 2.8, 4.1]
                with_seed_one = bootstrap_ci(data, seed=1)
                with_seed_two = bootstrap_ci(data, seed=2)
                # Any difference at all is enough; the bounds need not be far apart.
                assert with_seed_one != with_seed_two

            def test_wider_n_bootstrap_converges(self) -> None:
                """Going from 1 000 to 10 000 resamples must keep the interval
                width within 0.5 of the coarser run.

                This only guards against the width blowing up as
                ``n_bootstrap`` grows; it does not test the interval itself.
                """
                data = [1.0, 2.0, 3.0, 4.0, 5.0]
                coarse = bootstrap_ci(data, n_bootstrap=1_000, seed=0)
                fine = bootstrap_ci(data, n_bootstrap=10_000, seed=0)
                assert coarse is not None
                assert fine is not None
                # Compare widths, not individual bounds.
                coarse_width = coarse[1] - coarse[0]
                fine_width = fine[1] - fine[0]
                assert abs(coarse_width - fine_width) < 0.5

            def test_returns_bounds_are_finite(self) -> None:
                """Both bounds are finite and ordered for a small finite sample."""
                interval = bootstrap_ci([0.1, 0.2, 0.3, 0.25, 0.15])
                assert interval is not None
                lower, upper = interval
                for bound in (lower, upper):
                    assert math.isfinite(bound)
                assert lower <= upper
      
        85
        
        86
        
        87
        class TestSafeFinalizeCi:
            """How ``safe_finalize`` handles ``ci_95``: kept when ``raw`` is
            finite, ``None`` when ``raw`` is NaN, ``None`` when not supplied."""

            def test_ci_preserved_when_raw_finite(self) -> None:
                """A finite ``raw`` leaves the supplied interval unchanged."""
                from dlm_sway.core.result import Verdict, safe_finalize

                finalized = safe_finalize(
                    name="demo",
                    kind="delta_kl",
                    verdict=Verdict.PASS,
                    raw=0.5,
                    ci_95=(0.4, 0.6),
                )
                assert finalized.ci_95 == (0.4, 0.6)

            def test_ci_nulled_when_raw_is_non_finite(self) -> None:
                """A NaN ``raw`` drops the interval and turns the verdict into
                ``Verdict.ERROR`` even though ``Verdict.PASS`` was passed in."""
                from dlm_sway.core.result import Verdict, safe_finalize

                finalized = safe_finalize(
                    name="demo",
                    kind="delta_kl",
                    verdict=Verdict.PASS,
                    raw=float("nan"),  # the non-finite input under test
                    ci_95=(0.4, 0.6),
                )
                assert finalized.ci_95 is None
                assert finalized.verdict == Verdict.ERROR

            def test_ci_none_default(self) -> None:
                """Omitting ``ci_95`` leaves it as ``None`` on the result."""
                from dlm_sway.core.result import Verdict, safe_finalize

                finalized = safe_finalize(
                    name="demo",
                    kind="delta_kl",
                    verdict=Verdict.PASS,
                    raw=0.5,
                )
                assert finalized.ci_95 is None
      
        125
        
        126
        
        127
        class TestProbeEmitsCi95:
            """Smoke test: a ``delta_kl`` probe run on the dummy backend
            reports a ``ci_95`` that contains ``raw``."""

            def test_delta_kl_ci_brackets_raw(self) -> None:
                """Run the probe over eight prompts and check the interval
                brackets ``raw`` and is mirrored in the evidence payload."""
                from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
                from dlm_sway.probes.base import RunContext, build_probe

                dummy_backend = DummyDifferentialBackend(
                    base=DummyResponses(),
                    ft=DummyResponses(),
                )
                probe_config = {
                    "name": "dk",
                    "kind": "delta_kl",
                    "prompts": ["p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8"],
                }
                probe, spec = build_probe(probe_config)
                outcome = probe.run(spec, RunContext(backend=dummy_backend))
                assert outcome.ci_95 is not None
                assert outcome.raw is not None
                lower, upper = outcome.ci_95
                assert lower <= outcome.raw <= upper
                # The evidence dict stores the same bounds, as a list rather than a tuple.
                assert outcome.evidence["raw_ci_95"] == [lower, upper]

1	"""Tests for :mod:`dlm_sway.core.stats` (S14 / F9)."""
2
3	from __future__ import annotations
4
5	import math
6
7	import numpy as np
8
9	from dlm_sway.core.stats import bootstrap_ci
10
11
class TestBootstrapCi:
    """Checks on ``bootstrap_ci``: interval sanity on seeded data,
    ``None`` for rejected input, and seed handling."""

    def test_brackets_the_mean_on_gaussian(self) -> None:
        """For 100 seeded draws from N(0, 1) the interval must contain
        the sample mean and stay narrow.

        The draw and the bootstrap both use seed 0, so the outcome is
        repeatable — enough for a regression lock.
        """
        draws = np.random.default_rng(0).normal(0.0, 1.0, size=100)
        interval = bootstrap_ci(draws, seed=0)
        assert interval is not None
        lower, upper = interval
        assert lower < draws.mean() < upper
        # With unit variance and n=100 the standard error of the mean is
        # about 0.1, so the whole interval should span well under 0.6.
        width = upper - lower
        assert width < 0.6

    def test_degenerate_constant_samples_zero_width(self) -> None:
        """Four identical samples must give a zero-width interval at that
        value; the call passes no seed and still expects an exact result.
        """
        constant = [0.5, 0.5, 0.5, 0.5]
        assert bootstrap_ci(constant) == (0.5, 0.5)

    def test_nonfinite_samples_return_none(self) -> None:
        """A NaN or an infinity among the samples yields ``None``."""
        for bad_value in (float("nan"), float("inf")):
            assert bootstrap_ci([1.0, bad_value, 3.0]) is None

    def test_empty_returns_none(self) -> None:
        """An empty list and an empty float64 array both yield ``None``."""
        for nothing in ([], np.array([], dtype=np.float64)):
            assert bootstrap_ci(nothing) is None

    def test_confidence_outside_0_1_returns_none(self) -> None:
        """Confidence levels of 0.0, 1.0 and -0.5 are all rejected."""
        for level in (0.0, 1.0, -0.5):
            assert bootstrap_ci([1.0, 2.0, 3.0], confidence=level) is None

    def test_seed_reproducibility(self) -> None:
        """Two calls with the same data and seed return equal results."""
        data = [1.2, 3.4, 5.6, 2.1, 4.5, 3.3, 2.8, 4.1]
        assert bootstrap_ci(data, seed=42) == bootstrap_ci(data, seed=42)

    def test_seed_differs_produces_different_bounds(self) -> None:
        """Seeds 1 and 2 must give different bounds for the same eight
        samples — a smoke check that the ``seed`` argument has an
        effect, not a statistical correctness test."""
        data = [1.2, 3.4, 5.6, 2.1, 4.5, 3.3, 2.8, 4.1]
        with_seed_one = bootstrap_ci(data, seed=1)
        with_seed_two = bootstrap_ci(data, seed=2)
        # Any difference at all is enough; the bounds need not be far apart.
        assert with_seed_one != with_seed_two

    def test_wider_n_bootstrap_converges(self) -> None:
        """Going from 1 000 to 10 000 resamples must keep the interval
        width within 0.5 of the coarser run.

        This only guards against the width blowing up as
        ``n_bootstrap`` grows; it does not test the interval itself.
        """
        data = [1.0, 2.0, 3.0, 4.0, 5.0]
        coarse = bootstrap_ci(data, n_bootstrap=1_000, seed=0)
        fine = bootstrap_ci(data, n_bootstrap=10_000, seed=0)
        assert coarse is not None
        assert fine is not None
        # Compare widths, not individual bounds.
        coarse_width = coarse[1] - coarse[0]
        fine_width = fine[1] - fine[0]
        assert abs(coarse_width - fine_width) < 0.5

    def test_returns_bounds_are_finite(self) -> None:
        """Both bounds are finite and ordered for a small finite sample."""
        interval = bootstrap_ci([0.1, 0.2, 0.3, 0.25, 0.15])
        assert interval is not None
        lower, upper = interval
        for bound in (lower, upper):
            assert math.isfinite(bound)
        assert lower <= upper
85
86
class TestSafeFinalizeCi:
    """How ``safe_finalize`` handles ``ci_95``: kept when ``raw`` is
    finite, ``None`` when ``raw`` is NaN, ``None`` when not supplied."""

    def test_ci_preserved_when_raw_finite(self) -> None:
        """A finite ``raw`` leaves the supplied interval unchanged."""
        from dlm_sway.core.result import Verdict, safe_finalize

        finalized = safe_finalize(
            name="demo",
            kind="delta_kl",
            verdict=Verdict.PASS,
            raw=0.5,
            ci_95=(0.4, 0.6),
        )
        assert finalized.ci_95 == (0.4, 0.6)

    def test_ci_nulled_when_raw_is_non_finite(self) -> None:
        """A NaN ``raw`` drops the interval and turns the verdict into
        ``Verdict.ERROR`` even though ``Verdict.PASS`` was passed in."""
        from dlm_sway.core.result import Verdict, safe_finalize

        finalized = safe_finalize(
            name="demo",
            kind="delta_kl",
            verdict=Verdict.PASS,
            raw=float("nan"),  # the non-finite input under test
            ci_95=(0.4, 0.6),
        )
        assert finalized.ci_95 is None
        assert finalized.verdict == Verdict.ERROR

    def test_ci_none_default(self) -> None:
        """Omitting ``ci_95`` leaves it as ``None`` on the result."""
        from dlm_sway.core.result import Verdict, safe_finalize

        finalized = safe_finalize(
            name="demo",
            kind="delta_kl",
            verdict=Verdict.PASS,
            raw=0.5,
        )
        assert finalized.ci_95 is None
125
126
class TestProbeEmitsCi95:
    """Smoke test: a ``delta_kl`` probe run on the dummy backend
    reports a ``ci_95`` that contains ``raw``."""

    def test_delta_kl_ci_brackets_raw(self) -> None:
        """Run the probe over eight prompts and check the interval
        brackets ``raw`` and is mirrored in the evidence payload."""
        from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
        from dlm_sway.probes.base import RunContext, build_probe

        dummy_backend = DummyDifferentialBackend(
            base=DummyResponses(),
            ft=DummyResponses(),
        )
        probe_config = {
            "name": "dk",
            "kind": "delta_kl",
            "prompts": ["p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8"],
        }
        probe, spec = build_probe(probe_config)
        outcome = probe.run(spec, RunContext(backend=dummy_backend))
        assert outcome.ci_95 is not None
        assert outcome.raw is not None
        lower, upper = outcome.ci_95
        assert lower <= outcome.raw <= upper
        # The evidence dict stores the same bounds, as a list rather than a tuple.
        assert outcome.evidence["raw_ci_95"] == [lower, upper]