| 1 | """Tests for :mod:`dlm_sway.core.stats` (S14 / F9).""" |
| 2 | |
| 3 | from __future__ import annotations |
| 4 | |
| 5 | import math |
| 6 | |
| 7 | import numpy as np |
| 8 | |
| 9 | from dlm_sway.core.stats import bootstrap_ci |
| 10 | |
| 11 | |
class TestBootstrapCi:
    """Behavioural checks for ``bootstrap_ci``."""

    def test_brackets_the_mean_on_gaussian(self) -> None:
        """A seeded 95% CI over 100 draws from N(0, 1) contains the sample mean.

        Both the draw and the bootstrap are seeded, so this is a
        deterministic regression lock rather than a statistical test.
        """
        draws = np.random.default_rng(0).normal(0.0, 1.0, size=100)
        interval = bootstrap_ci(draws, seed=0)
        assert interval is not None
        lower, upper = interval
        assert lower < draws.mean() < upper
        # At n=100 with unit variance the standard error of the mean is
        # ~0.1, so the interval has to be comfortably narrow.
        assert upper - lower < 0.6

    def test_degenerate_constant_samples_zero_width(self) -> None:
        """Identical samples collapse the CI onto the shared value.

        No seed is passed: the helper is documented here as
        short-circuiting the bootstrap for this case, so the result is
        expected to be exact.
        """
        constant = [0.5] * 4
        assert bootstrap_ci(constant) == (0.5, 0.5)

    def test_nonfinite_samples_return_none(self) -> None:
        """A NaN or infinity anywhere in the samples yields ``None``."""
        for bad in (float("nan"), float("inf")):
            assert bootstrap_ci([1.0, bad, 3.0]) is None

    def test_empty_returns_none(self) -> None:
        """Empty input yields ``None`` for both a list and an ndarray."""
        for empty in ([], np.array([], dtype=np.float64)):
            assert bootstrap_ci(empty) is None

    def test_confidence_outside_0_1_returns_none(self) -> None:
        """Confidence levels of 0, 1 and a negative value yield ``None``."""
        for level in (0.0, 1.0, -0.5):
            assert bootstrap_ci([1.0, 2.0, 3.0], confidence=level) is None

    def test_seed_reproducibility(self) -> None:
        """Two calls with the same seed return equal intervals."""
        data = [1.2, 3.4, 5.6, 2.1, 4.5, 3.3, 2.8, 4.1]
        assert bootstrap_ci(data, seed=42) == bootstrap_ci(data, seed=42)

    def test_seed_differs_produces_different_bounds(self) -> None:
        """Distinct seeds give distinct bounds on a small sample.

        This is a smoke check that the seed actually reaches the RNG,
        not a statement about correctness of the interval.
        """
        data = [1.2, 3.4, 5.6, 2.1, 4.5, 3.3, 2.8, 4.1]
        by_seed = {seed: bootstrap_ci(data, seed=seed) for seed in (1, 2)}
        # The bounds should be close, but each seed resamples different
        # indices so they must not coincide exactly.
        assert by_seed[1] != by_seed[2]

    def test_wider_n_bootstrap_converges(self) -> None:
        """Raising ``n_bootstrap`` leaves the interval width in the same ballpark.

        More resamples only reduce the noise of the percentile
        estimates; the width itself is governed by the sample size.
        Here we only confirm that it does not blow up.
        """
        data = [1.0, 2.0, 3.0, 4.0, 5.0]
        coarse = bootstrap_ci(data, n_bootstrap=1_000, seed=0)
        fine = bootstrap_ci(data, n_bootstrap=10_000, seed=0)
        assert coarse is not None
        assert fine is not None
        coarse_width = coarse[1] - coarse[0]
        fine_width = fine[1] - fine[0]
        # Widths should agree to within the same order of magnitude.
        assert abs(coarse_width - fine_width) < 0.5

    def test_returns_bounds_are_finite(self) -> None:
        """Ordinary finite input gives finite, correctly ordered bounds."""
        interval = bootstrap_ci([0.1, 0.2, 0.3, 0.25, 0.15])
        assert interval is not None
        lower, upper = interval
        for bound in (lower, upper):
            assert math.isfinite(bound)
        assert lower <= upper
| 85 | |
| 86 | |
class TestSafeFinalizeCi:
    """`safe_finalize` carries ci_95 through, but drops it when raw is nulled."""

    def test_ci_preserved_when_raw_finite(self) -> None:
        """A finite ``raw`` leaves the supplied interval untouched."""
        from dlm_sway.core.result import Verdict, safe_finalize

        interval = (0.4, 0.6)
        finalized = safe_finalize(
            name="demo",
            kind="delta_kl",
            verdict=Verdict.PASS,
            raw=0.5,
            ci_95=interval,
        )
        assert finalized.ci_95 == interval

    def test_ci_nulled_when_raw_is_non_finite(self) -> None:
        """A NaN ``raw`` clears the interval and flips the verdict to ERROR."""
        from dlm_sway.core.result import Verdict, safe_finalize

        not_a_number = float("nan")  # non-finite value in the critical field
        finalized = safe_finalize(
            name="demo",
            kind="delta_kl",
            verdict=Verdict.PASS,
            raw=not_a_number,
            ci_95=(0.4, 0.6),
        )
        assert finalized.ci_95 is None
        # The critical-field guard is what turns PASS into ERROR here.
        assert finalized.verdict == Verdict.ERROR

    def test_ci_none_default(self) -> None:
        """Omitting ``ci_95`` leaves it as ``None`` on the result."""
        from dlm_sway.core.result import Verdict, safe_finalize

        finalized = safe_finalize(
            name="demo",
            kind="delta_kl",
            verdict=Verdict.PASS,
            raw=0.5,
        )
        assert finalized.ci_95 is None
| 125 | |
| 126 | |
class TestProbeEmitsCi95:
    """Smoke: a delta_kl probe on the dummy backend reports a ci_95 around raw."""

    def test_delta_kl_ci_brackets_raw(self) -> None:
        """Run delta_kl over eight prompts and check the interval contains ``raw``."""
        from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
        from dlm_sway.probes.base import RunContext, build_probe

        dummy_backend = DummyDifferentialBackend(
            base=DummyResponses(),
            ft=DummyResponses(),
        )
        probe_config = {
            "name": "dk",
            "kind": "delta_kl",
            "prompts": ["p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8"],
        }
        probe, spec = build_probe(probe_config)
        outcome = probe.run(spec, RunContext(backend=dummy_backend))

        assert outcome.ci_95 is not None
        assert outcome.raw is not None
        lower, upper = outcome.ci_95
        assert lower <= outcome.raw <= upper
        # The evidence payload mirrors the interval, serialised as a list.
        assert outcome.evidence["raw_ci_95"] == [lower, upper]