| 1 | """Tiny statistics helpers for numeric probes (S14 / F9). |
| 2 | |
| 3 | Bootstrap resampling is the shipping pattern. Every numeric probe |
| 4 | that aggregates over a handful of prompts (4–32 is the common range) |
| 5 | has real sampling noise on its point estimate — a single `mean = |
| 6 | 0.083` number hides the fact that the underlying prompt set could |
| 7 | have given anywhere from 0.06 to 0.10 with a different prompt |
| 8 | selection. |
| 9 | |
| 10 | :func:`bootstrap_ci` samples-with-replacement from the given array |
| 11 | ``n_bootstrap`` times, takes the mean of each resample, and returns |
| 12 | the ``(lower, upper)`` percentile bounds for the requested |
| 13 | confidence level. Deterministic via the ``seed`` arg so two runs of |
| 14 | the same suite produce the same CI. |
| 15 | |
| 16 | Intentionally *not* a full statistics module — just the one helper |
| 17 | the aggregating probes need. Bayesian CIs, studentized bootstraps, |
| 18 | BCa corrections are out of scope; bootstrap-percentile is the MVP |
| 19 | the audit's F9 entry called out. |
| 20 | """ |
| 21 | |
| 22 | from __future__ import annotations |
| 23 | |
| 24 | import math |
| 25 | from collections.abc import Sequence |
| 26 | |
| 27 | import numpy as np |
| 28 | from numpy.typing import NDArray |
| 29 | |
| 30 | |
| 31 | def bootstrap_ci( |
| 32 | samples: Sequence[float] | NDArray[np.float64], |
| 33 | *, |
| 34 | n_bootstrap: int = 1000, |
| 35 | confidence: float = 0.95, |
| 36 | seed: int = 0, |
| 37 | ) -> tuple[float, float] | None: |
| 38 | """Percentile bootstrap confidence interval for the mean. |
| 39 | |
| 40 | Parameters |
| 41 | ---------- |
| 42 | samples: |
| 43 | Per-sample measurements (e.g. the list of per-prompt |
| 44 | divergences inside ``delta_kl``). All values must be finite |
| 45 | — non-finite inputs short-circuit to ``None`` so probes can |
| 46 | fold the CI path without pre-validating. |
| 47 | n_bootstrap: |
| 48 | Resample count. 1000 is the sweet spot for percentile |
| 49 | bootstraps: wider resamples (10k) give slightly tighter |
| 50 | intervals at 10× the cost; fewer (100) start to show |
| 51 | percentile discretization on small ``n``. |
| 52 | confidence: |
| 53 | Two-sided coverage. ``0.95`` → returns the 2.5th and 97.5th |
| 54 | percentiles of the bootstrap distribution. |
| 55 | seed: |
| 56 | Governs the resample RNG. Threaded from ``ctx.seed`` so the |
| 57 | CI is reproducible across runs of the same suite. |
| 58 | |
| 59 | Returns |
| 60 | ------- |
| 61 | ``(lower, upper)`` floats. ``None`` when: |
| 62 | |
| 63 | - ``samples`` is empty |
| 64 | - any sample is non-finite |
| 65 | - ``confidence`` is outside ``(0, 1)`` |
| 66 | |
| 67 | Notes |
| 68 | ----- |
| 69 | Percentile bootstrap — doesn't correct for skew (BCa would) but |
| 70 | is accurate enough for the 4–32-sample range probes operate in |
| 71 | and has no knobs users could get wrong. |
| 72 | """ |
| 73 | arr = np.asarray(samples, dtype=np.float64) |
| 74 | if arr.size == 0: |
| 75 | return None |
| 76 | if not np.all(np.isfinite(arr)): |
| 77 | return None |
| 78 | if not (0.0 < confidence < 1.0): |
| 79 | return None |
| 80 | |
| 81 | # Degenerate (all-identical) input → zero-width CI at the common |
| 82 | # value. Skip the resample work entirely — saves a few ms per |
| 83 | # probe when the adapter happens to produce constant per-prompt |
| 84 | # divergence, and avoids any RNG-dependent rounding noise on the |
| 85 | # bounds. |
| 86 | if float(arr.min()) == float(arr.max()): |
| 87 | v = float(arr[0]) |
| 88 | return (v, v) |
| 89 | |
| 90 | rng = np.random.default_rng(seed) |
| 91 | n = arr.size |
| 92 | # Vectorized resample: one (n_bootstrap, n) index matrix; take |
| 93 | # means along axis 1. Memory = 8 * n_bootstrap * n bytes — |
| 94 | # ~128 KB at default settings with n=16. Fits in L2. |
| 95 | idx = rng.integers(0, n, size=(n_bootstrap, n)) |
| 96 | resampled_means = arr[idx].mean(axis=1) |
| 97 | |
| 98 | alpha = 1.0 - confidence |
| 99 | lo_p = (alpha / 2.0) * 100.0 |
| 100 | hi_p = (1.0 - alpha / 2.0) * 100.0 |
| 101 | lo, hi = np.percentile(resampled_means, [lo_p, hi_p]) |
| 102 | lo_f, hi_f = float(lo), float(hi) |
| 103 | if not (math.isfinite(lo_f) and math.isfinite(hi_f)): |
| 104 | return None |
| 105 | return (lo_f, hi_f) |
| 106 | |
| 107 | |
# Explicit public API: ``bootstrap_ci`` is the only name a star-import
# of this module exposes.
__all__ = ["bootstrap_ci"]