@@ -276,19 +276,22 @@ class TestProbe: |
| 276 | 276 | r = probe.run(fresh_spec, RunContext(backend=backend)) |
| 277 | 277 | assert r.evidence["from_cache"] is False |
| 278 | 278 | |
| 279 | | - def test_std_floor_prevents_runaway_zscore(self) -> None: |
| 280 | | - """C9: identical raws across seeds → std=0 → clamped to 1e-6. |
| 281 | | - |
| 282 | | - Use a single-run calibration (no variance by construction) to |
| 283 | | - force the degenerate case; the runner must still publish the |
| 284 | | - kind with a non-zero std so downstream z-scores stay finite. |
| 279 | + def test_degenerate_calibration_flagged_and_refused(self) -> None: |
| 280 | + """F02 (Audit 03): identical raws or runs≤1 → ``degenerate: 1.0`` |
| 281 | + in the stats dict, and the downstream z-score computation |
| 282 | + refuses instead of firing on a 1e-6 floor. |
| 283 | + |
| 284 | + Pre-F02 this test asserted ``std ≥ 1e-6`` + ``z is not None``, |
| 285 | + which is exactly the contract that produced the audit's |
| 286 | + +290,766σ observation on a leakage probe under ``runs: 1``. |
| 287 | + The fix flips both assertions. |
| 285 | 288 | """ |
| 286 | 289 | backend = _diverging_backend() |
| 287 | 290 | probe, spec = build_probe( |
| 288 | 291 | { |
| 289 | 292 | "name": "null", |
| 290 | 293 | "kind": "null_adapter", |
| 291 | | - "runs": 1, # single seed → std=0 |
| 294 | + "runs": 1, # single seed → degenerate by construction |
| 292 | 295 | "calibrate_kinds": ["delta_kl"], |
| 293 | 296 | } |
| 294 | 297 | ) |
@@ -296,13 +299,15 @@ class TestProbe: |
| 296 | 299 | result = probe.run(spec, ctx) |
| 297 | 300 | assert result.verdict == Verdict.PASS |
| 298 | 301 | stats = result.evidence["null_stats"]["delta_kl"] |
| 299 | | - assert stats["std"] >= 1e-6 |
| 300 | | - # And the downstream z-score computation is finite, not inf. |
| 302 | + # Std floor is still 1e-6 (preserved for valid-but-tight |
| 303 | + # multi-seed nulls). What changed is the explicit |
| 304 | + # ``degenerate`` flag on the stats dict — ``runs: 1`` → True. |
| 305 | + assert stats["std"] == 1e-6 |
| 306 | + assert stats["degenerate"] >= 0.5 |
| 307 | + # Downstream z_score now refuses rather than emit runaway values. |
| 301 | 308 | from dlm_sway.probes._zscore import z_score |
| 302 | 309 | |
| 303 | | - z = z_score(0.5, stats) |
| 304 | | - assert z is not None |
| 305 | | - assert np.isfinite(z) |
| 310 | + assert z_score(0.5, stats) is None |
| 306 | 311 | |
| 307 | 312 | def test_per_kind_stats_published(self) -> None: |
| 308 | 313 | """Every calibrating kind gets its own (mean, std, n) triple.""" |