sway Public

Watch 0 Fork 0 Star 0

Python · 3795 bytes Raw Blame History

  
        1
        """Tiny statistics helpers for numeric probes (S14 / F9).
      
        2
        
        3
        Bootstrap resampling is the shipping pattern. Every numeric probe
      
        4
        that aggregates over a handful of prompts (4–32 is the common range)
      
        5
        has real sampling noise on its point estimate — a single `mean =
      
        6
        0.083` number hides the fact that the underlying prompt set could
      
        7
        have given anywhere from 0.06 to 0.10 with a different prompt
      
        8
        selection.
      
        9
        
        10
        :func:`bootstrap_ci` samples-with-replacement from the given array
      
        11
        ``n_bootstrap`` times, takes the mean of each resample, and returns
      
        12
        the ``(lower, upper)`` percentile bounds for the requested
      
        13
        confidence level. Deterministic via the ``seed`` arg so two runs of
      
        14
        the same suite produce the same CI.
      
        15
        
        16
        Intentionally *not* a full statistics module — just the one helper
      
        17
        the aggregating probes need. Bayesian CIs, studentized bootstraps,
      
        18
        BCa corrections are out of scope; bootstrap-percentile is the MVP
      
        19
        the audit's F9 entry called out.
      
        20
        """
      
        21
        
        22
        from __future__ import annotations
      
        23
        
        24
        import math
      
        25
        from collections.abc import Sequence
      
        26
        
        27
        import numpy as np
      
        28
        from numpy.typing import NDArray
      
        29
        
        30
        
        31
        def bootstrap_ci(
            samples: Sequence[float] | NDArray[np.float64],
            *,
            n_bootstrap: int = 1000,
            confidence: float = 0.95,
            seed: int = 0,
        ) -> tuple[float, float] | None:
            """Percentile bootstrap confidence interval for the mean.

            Parameters
            ----------
            samples:
                Per-sample measurements (e.g. the list of per-prompt
                divergences inside ``delta_kl``). All values must be finite
                — non-finite inputs short-circuit to ``None`` so probes can
                fold the CI path without pre-validating.
            n_bootstrap:
                Resample count; must be at least 1 whenever resampling is
                actually needed. 1000 is the sweet spot for percentile
                bootstraps: more resamples (10k) mainly steady the bounds
                against resampling noise at 10× the cost; fewer (100) start
                to show percentile discretization on small ``n``.
            confidence:
                Two-sided coverage. ``0.95`` → returns the 2.5th and 97.5th
                percentiles of the bootstrap distribution.
            seed:
                Governs the resample RNG. Threaded from ``ctx.seed`` so the
                CI is reproducible across runs of the same suite.

            Returns
            -------
            ``(lower, upper)`` floats. ``None`` when:

            - ``samples`` is empty
            - any sample is non-finite
            - ``confidence`` is outside ``(0, 1)``
            - ``n_bootstrap`` is less than 1 and the samples are not all
              identical (the constant-input shortcut never resamples)

            Notes
            -----
            Percentile bootstrap — doesn't correct for skew (BCa would) but
            is accurate enough for the 4–32-sample range probes operate in
            and has no knobs users could get wrong.
            """
            arr = np.asarray(samples, dtype=np.float64)
            if arr.size == 0:
                return None
            if not np.all(np.isfinite(arr)):
                return None
            # Chained comparison is also False for a NaN ``confidence``.
            if not (0.0 < confidence < 1.0):
                return None

            # Degenerate (all-identical) input → zero-width CI at the common
            # value. Skip the resample work entirely; this also keeps the
            # bounds free of any RNG-dependent rounding noise.
            if float(arr.min()) == float(arr.max()):
                v = float(arr[0])
                return (v, v)

            # Without at least one resample there is no bootstrap distribution
            # to take percentiles of. Zero would hand an empty array to
            # ``np.percentile`` and a negative count makes ``rng.integers``
            # raise, so report "no CI" the same way as the other bad inputs.
            if n_bootstrap < 1:
                return None

            rng = np.random.default_rng(seed)
            n = arr.size
            # Vectorized resample: one (n_bootstrap, n) index matrix; take
            # means along axis 1. Memory = 8 * n_bootstrap * n bytes for the
            # index matrix (~128 KB at default settings with n=16).
            idx = rng.integers(0, n, size=(n_bootstrap, n))
            resampled_means = arr[idx].mean(axis=1)

            alpha = 1.0 - confidence
            lo_p = (alpha / 2.0) * 100.0
            hi_p = (1.0 - alpha / 2.0) * 100.0
            lo, hi = np.percentile(resampled_means, [lo_p, hi_p])
            lo_f, hi_f = float(lo), float(hi)
            # Finite inputs can still overflow to inf/NaN when averaged; never
            # hand such bounds back to the caller.
            if not (math.isfinite(lo_f) and math.isfinite(hi_f)):
                return None
            return (lo_f, hi_f)
      
        106
        
        107
        
        108
        __all__ = ["bootstrap_ci"]

1	"""Tiny statistics helpers for numeric probes (S14 / F9).
2
3	Bootstrap resampling is the shipping pattern. Every numeric probe
4	that aggregates over a handful of prompts (4–32 is the common range)
5	has real sampling noise on its point estimate — a single `mean =
6	0.083` number hides the fact that the underlying prompt set could
7	have given anywhere from 0.06 to 0.10 with a different prompt
8	selection.
9
10	:func:`bootstrap_ci` samples-with-replacement from the given array
11	``n_bootstrap`` times, takes the mean of each resample, and returns
12	the ``(lower, upper)`` percentile bounds for the requested
13	confidence level. Deterministic via the ``seed`` arg so two runs of
14	the same suite produce the same CI.
15
16	Intentionally not a full statistics module — just the one helper
17	the aggregating probes need. Bayesian CIs, studentized bootstraps,
18	BCa corrections are out of scope; bootstrap-percentile is the MVP
19	the audit's F9 entry called out.
20	"""
21
22	from __future__ import annotations
23
24	import math
25	from collections.abc import Sequence
26
27	import numpy as np
28	from numpy.typing import NDArray
29
30
def bootstrap_ci(
    samples: Sequence[float] | NDArray[np.float64],
    *,
    n_bootstrap: int = 1000,
    confidence: float = 0.95,
    seed: int = 0,
) -> tuple[float, float] | None:
    """Percentile bootstrap confidence interval for the mean.

    Parameters
    ----------
    samples:
        Per-sample measurements (e.g. the list of per-prompt
        divergences inside ``delta_kl``). All values must be finite
        — non-finite inputs short-circuit to ``None`` so probes can
        fold the CI path without pre-validating.
    n_bootstrap:
        Resample count; must be at least 1 whenever resampling is
        actually needed. 1000 is the sweet spot for percentile
        bootstraps: more resamples (10k) mainly steady the bounds
        against resampling noise at 10× the cost; fewer (100) start
        to show percentile discretization on small ``n``.
    confidence:
        Two-sided coverage. ``0.95`` → returns the 2.5th and 97.5th
        percentiles of the bootstrap distribution.
    seed:
        Governs the resample RNG. Threaded from ``ctx.seed`` so the
        CI is reproducible across runs of the same suite.

    Returns
    -------
    ``(lower, upper)`` floats. ``None`` when:

    - ``samples`` is empty
    - any sample is non-finite
    - ``confidence`` is outside ``(0, 1)``
    - ``n_bootstrap`` is less than 1 and the samples are not all
      identical (the constant-input shortcut never resamples)

    Notes
    -----
    Percentile bootstrap — doesn't correct for skew (BCa would) but
    is accurate enough for the 4–32-sample range probes operate in
    and has no knobs users could get wrong.
    """
    arr = np.asarray(samples, dtype=np.float64)
    if arr.size == 0:
        return None
    if not np.all(np.isfinite(arr)):
        return None
    # Chained comparison is also False for a NaN ``confidence``.
    if not (0.0 < confidence < 1.0):
        return None

    # Degenerate (all-identical) input → zero-width CI at the common
    # value. Skip the resample work entirely; this also keeps the
    # bounds free of any RNG-dependent rounding noise.
    if float(arr.min()) == float(arr.max()):
        v = float(arr[0])
        return (v, v)

    # Without at least one resample there is no bootstrap distribution
    # to take percentiles of. Zero would hand an empty array to
    # ``np.percentile`` and a negative count makes ``rng.integers``
    # raise, so report "no CI" the same way as the other bad inputs.
    if n_bootstrap < 1:
        return None

    rng = np.random.default_rng(seed)
    n = arr.size
    # Vectorized resample: one (n_bootstrap, n) index matrix; take
    # means along axis 1. Memory = 8 * n_bootstrap * n bytes for the
    # index matrix (~128 KB at default settings with n=16).
    idx = rng.integers(0, n, size=(n_bootstrap, n))
    resampled_means = arr[idx].mean(axis=1)

    alpha = 1.0 - confidence
    lo_p = (alpha / 2.0) * 100.0
    hi_p = (1.0 - alpha / 2.0) * 100.0
    lo, hi = np.percentile(resampled_means, [lo_p, hi_p])
    lo_f, hi_f = float(lo), float(hi)
    # Finite inputs can still overflow to inf/NaN when averaged; never
    # hand such bounds back to the caller.
    if not (math.isfinite(lo_f) and math.isfinite(hi_f)):
        return None
    return (lo_f, hi_f)
106
107
108	__all__ = ["bootstrap_ci"]