Python · 3795 bytes Raw Blame History
1 """Tiny statistics helpers for numeric probes (S14 / F9).
2
3 Bootstrap resampling is the shipping pattern. Every numeric probe
4 that aggregates over a handful of prompts (4–32 is the common range)
5 has real sampling noise on its point estimate — a single `mean =
6 0.083` number hides the fact that the underlying prompt set could
7 have given anywhere from 0.06 to 0.10 with a different prompt
8 selection.
9
10 :func:`bootstrap_ci` samples-with-replacement from the given array
11 ``n_bootstrap`` times, takes the mean of each resample, and returns
12 the ``(lower, upper)`` percentile bounds for the requested
13 confidence level. Deterministic via the ``seed`` arg so two runs of
14 the same suite produce the same CI.
15
16 Intentionally *not* a full statistics module — just the one helper
17 the aggregating probes need. Bayesian CIs, studentized bootstraps,
18 BCa corrections are out of scope; bootstrap-percentile is the MVP
19 the audit's F9 entry called out.
20 """
21
22 from __future__ import annotations
23
24 import math
25 from collections.abc import Sequence
26
27 import numpy as np
28 from numpy.typing import NDArray
29
30
def bootstrap_ci(
    samples: Sequence[float] | NDArray[np.float64],
    *,
    n_bootstrap: int = 1000,
    confidence: float = 0.95,
    seed: int = 0,
) -> tuple[float, float] | None:
    """Percentile bootstrap confidence interval for the mean.

    Parameters
    ----------
    samples:
        Per-sample measurements (e.g. the list of per-prompt
        divergences inside ``delta_kl``). All values must be finite
        — non-finite inputs short-circuit to ``None`` so probes can
        fold the CI path without pre-validating. Scalar or
        multi-dimensional input is flattened; every element counts
        as one sample.
    n_bootstrap:
        Resample count; must be at least 1. 1000 is the sweet spot
        for percentile bootstraps: more resamples (10k) reduce the
        Monte-Carlo noise in the bounds slightly at 10× the cost
        (they do not make the interval itself narrower); fewer (100)
        start to show percentile discretization on small ``n``.
    confidence:
        Two-sided coverage. ``0.95`` → returns the 2.5th and 97.5th
        percentiles of the bootstrap distribution.
    seed:
        Governs the resample RNG, so the same inputs and seed always
        produce the same interval.

    Returns
    -------
    ``(lower, upper)`` floats. ``None`` when:

    - ``samples`` is empty
    - any sample is non-finite
    - ``confidence`` is outside ``(0, 1)``
    - ``n_bootstrap`` is less than 1 (and the samples are not all
      identical — constant input needs no resampling)
    - the computed bounds are non-finite (e.g. the mean of a
      resample overflowed)

    Notes
    -----
    Percentile bootstrap — doesn't correct for skew (BCa would) but
    has no knobs users could get wrong.
    """
    # Flatten so 0-d scalars and N-d arrays behave like a plain list
    # of samples; without this, ``arr[0]`` fails on a scalar and the
    # index matrix below would address the wrong axis on 2-D input.
    arr = np.asarray(samples, dtype=np.float64).ravel()
    if arr.size == 0:
        return None
    if not np.all(np.isfinite(arr)):
        return None
    if not (0.0 < confidence < 1.0):
        return None

    # Degenerate (all-identical) input → zero-width CI at the common
    # value. Skipping the resample work avoids any RNG-dependent
    # rounding noise on the bounds.
    if float(arr.min()) == float(arr.max()):
        v = float(arr[0])
        return (v, v)

    # A non-positive resample count cannot produce a bootstrap
    # distribution: a negative size makes ``rng.integers`` raise and a
    # zero size leaves ``np.percentile`` with an empty array. Treat it
    # like every other unusable input.
    if n_bootstrap < 1:
        return None

    rng = np.random.default_rng(seed)
    n = arr.size
    # Vectorized resample: one (n_bootstrap, n) index matrix; take
    # means along axis 1. The index matrix costs 8 * n_bootstrap * n
    # bytes (~128 KB at default settings with n=16), and the gathered
    # values cost the same again.
    idx = rng.integers(0, n, size=(n_bootstrap, n))
    resampled_means = arr[idx].mean(axis=1)

    alpha = 1.0 - confidence
    lo_p = (alpha / 2.0) * 100.0
    hi_p = (1.0 - alpha / 2.0) * 100.0
    lo, hi = np.percentile(resampled_means, [lo_p, hi_p])
    lo_f, hi_f = float(lo), float(hi)
    # Finite inputs can still overflow while summing (values near the
    # float64 limit), so re-check the bounds before returning them.
    if not (math.isfinite(lo_f) and math.isfinite(hi_f)):
        return None
    return (lo_f, hi_f)
106
107
# Public API: the only name exported by a star-import of this module.
__all__ = ["bootstrap_ci"]