Python · 5549 bytes Raw Blame History
1 """Tests for :mod:`dlm_sway.core.stats` (S14 / F9)."""
2
3 from __future__ import annotations
4
5 import math
6
7 import numpy as np
8
9 from dlm_sway.core.stats import bootstrap_ci
10
11
class TestBootstrapCi:
    """Behavioural checks for :func:`bootstrap_ci`."""

    def test_brackets_the_mean_on_gaussian(self) -> None:
        """A seeded 100-point draw from N(0, 1) yields a CI around its mean.

        Both the data RNG and the bootstrap are seeded, so this acts as
        a deterministic regression lock rather than a statistical test.
        The interval must strictly contain the *sample* mean.
        """
        draws = np.random.default_rng(0).normal(0.0, 1.0, size=100)
        interval = bootstrap_ci(draws, seed=0)
        assert interval is not None
        lower, upper = interval
        sample_mean = draws.mean()
        assert lower < sample_mean
        assert sample_mean < upper
        # Standard error of the mean is ~0.1 for n=100 at unit variance,
        # so the interval should be comfortably narrower than 0.6.
        width = upper - lower
        assert width < 0.6

    def test_degenerate_constant_samples_zero_width(self) -> None:
        """Identical samples collapse the CI onto the common value.

        Exact equality is asserted, so the result must carry no
        resampling noise at all.
        """
        assert bootstrap_ci([0.5] * 4) == (0.5, 0.5)

    def test_nonfinite_samples_return_none(self) -> None:
        """A NaN or infinite entry makes the helper return ``None``."""
        for poison in (float("nan"), float("inf")):
            assert bootstrap_ci([1.0, poison, 3.0]) is None

    def test_empty_returns_none(self) -> None:
        """Empty input yields ``None`` for both a list and an ndarray."""
        empty_list: list[float] = []
        empty_array = np.array([], dtype=np.float64)
        assert bootstrap_ci(empty_list) is None
        assert bootstrap_ci(empty_array) is None

    def test_confidence_outside_0_1_returns_none(self) -> None:
        """Confidence levels of 0, 1 and a negative value are rejected."""
        for level in (0.0, 1.0, -0.5):
            assert bootstrap_ci([1.0, 2.0, 3.0], confidence=level) is None

    def test_seed_reproducibility(self) -> None:
        """Two calls with the same seed return equal results."""
        data = [1.2, 3.4, 5.6, 2.1, 4.5, 3.3, 2.8, 4.1]
        first = bootstrap_ci(data, seed=42)
        second = bootstrap_ci(data, seed=42)
        assert first == second

    def test_seed_differs_produces_different_bounds(self) -> None:
        """Two different seeds return unequal results on a small sample.

        This is a smoke check that the seed actually reaches the RNG,
        not a statement about statistical correctness.
        """
        data = [1.2, 3.4, 5.6, 2.1, 4.5, 3.3, 2.8, 4.1]
        with_seed_one = bootstrap_ci(data, seed=1)
        with_seed_two = bootstrap_ci(data, seed=2)
        # The bounds are expected to be close, just not identical.
        assert with_seed_one != with_seed_two

    def test_wider_n_bootstrap_converges(self) -> None:
        """CI width stays comparable between 1k and 10k resamples.

        More resamples reduce noise in the percentile estimates; they
        should not change the interval width by 0.5 or more.
        """
        data = [1.0, 2.0, 3.0, 4.0, 5.0]
        coarse = bootstrap_ci(data, n_bootstrap=1_000, seed=0)
        fine = bootstrap_ci(data, n_bootstrap=10_000, seed=0)
        assert coarse is not None
        assert fine is not None
        coarse_width = coarse[1] - coarse[0]
        fine_width = fine[1] - fine[0]
        # Widths should be of the same order of magnitude.
        assert abs(coarse_width - fine_width) < 0.5

    def test_returns_bounds_are_finite(self) -> None:
        """Default arguments give finite bounds in non-decreasing order."""
        interval = bootstrap_ci([0.1, 0.2, 0.3, 0.25, 0.15])
        assert interval is not None
        lower, upper = interval
        for bound in (lower, upper):
            assert math.isfinite(bound)
        assert lower <= upper
85
86
class TestSafeFinalizeCi:
    """Checks how ``safe_finalize`` handles the ``ci_95`` argument."""

    def test_ci_preserved_when_raw_finite(self) -> None:
        """A finite ``raw`` leaves the supplied interval on the result."""
        from dlm_sway.core.result import Verdict, safe_finalize

        interval = (0.4, 0.6)
        outcome = safe_finalize(
            name="demo",
            kind="delta_kl",
            verdict=Verdict.PASS,
            raw=0.5,
            ci_95=interval,
        )
        assert outcome.ci_95 == (0.4, 0.6)

    def test_ci_nulled_when_raw_is_non_finite(self) -> None:
        """A NaN ``raw`` drops the interval and turns the verdict to ERROR."""
        from dlm_sway.core.result import Verdict, safe_finalize

        not_a_number = float("nan")  # the critical field is non-finite
        outcome = safe_finalize(
            name="demo",
            kind="delta_kl",
            verdict=Verdict.PASS,
            raw=not_a_number,
            ci_95=(0.4, 0.6),
        )
        assert outcome.ci_95 is None
        # The requested PASS must have been overridden.
        assert outcome.verdict == Verdict.ERROR

    def test_ci_none_default(self) -> None:
        """Omitting ``ci_95`` leaves it as ``None`` on the result."""
        from dlm_sway.core.result import Verdict, safe_finalize

        outcome = safe_finalize(
            name="demo",
            kind="delta_kl",
            verdict=Verdict.PASS,
            raw=0.5,
        )
        assert outcome.ci_95 is None
125
126
class TestProbeEmitsCi95:
    """Smoke test: a ``delta_kl`` probe run on the dummy backend reports a CI."""

    def test_delta_kl_ci_brackets_raw(self) -> None:
        """The probe result's ``ci_95`` contains ``raw`` and matches evidence."""
        from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
        from dlm_sway.probes.base import RunContext, build_probe

        probe_config = {
            "name": "dk",
            "kind": "delta_kl",
            "prompts": ["p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8"],
        }
        probe, spec = build_probe(probe_config)

        dummy_backend = DummyDifferentialBackend(
            base=DummyResponses(),
            ft=DummyResponses(),
        )
        outcome = probe.run(spec, RunContext(backend=dummy_backend))

        assert outcome.ci_95 is not None
        assert outcome.raw is not None
        lower, upper = outcome.ci_95
        assert lower <= outcome.raw
        assert outcome.raw <= upper
        # The evidence payload carries the same interval, as a list.
        assert outcome.evidence["raw_ci_95"] == [lower, upper]