tenseleyflow/sway / b0469ba

Browse files

tests/null_calibration: assert runs=1 is flagged degenerate + refused by z_score (F02)

Authored by espadonne
SHA
b0469baa0c6de7d0d6245bb88578209aba121811
Parents
365378a
Tree
55b3b50

1 changed file

StatusFile+-
M tests/unit/test_null_calibration.py 17 12
tests/unit/test_null_calibration.pymodified
@@ -276,19 +276,22 @@ class TestProbe:
276276
         r = probe.run(fresh_spec, RunContext(backend=backend))
277277
         assert r.evidence["from_cache"] is False
278278
 
279
-    def test_std_floor_prevents_runaway_zscore(self) -> None:
280
-        """C9: identical raws across seeds → std=0 → clamped to 1e-6.
281
-
282
-        Use a single-run calibration (no variance by construction) to
283
-        force the degenerate case; the runner must still publish the
284
-        kind with a non-zero std so downstream z-scores stay finite.
279
+    def test_degenerate_calibration_flagged_and_refused(self) -> None:
280
+        """F02 (Audit 03): identical raws or runs≤1 → ``degenerate: 1.0``
281
+        in the stats dict, and the downstream z-score computation
282
+        refuses instead of firing on a 1e-6 floor.
283
+
284
+        Pre-F02 this test asserted ``std ≥ 1e-6`` + ``z is not None``,
285
+        which is exactly the contract that produced the audit's
286
+        +290,766σ observation on a leakage probe under ``runs: 1``.
287
+        The fix keeps the std floor, adds the ``degenerate`` check, and
+        flips the z-score assertion.
285288
         """
286289
         backend = _diverging_backend()
287290
         probe, spec = build_probe(
288291
             {
289292
                 "name": "null",
290293
                 "kind": "null_adapter",
291
-                "runs": 1,  # single seed → std=0
294
+                "runs": 1,  # single seed → degenerate by construction
292295
                 "calibrate_kinds": ["delta_kl"],
293296
             }
294297
         )
@@ -296,13 +299,15 @@ class TestProbe:
296299
         result = probe.run(spec, ctx)
297300
         assert result.verdict == Verdict.PASS
298301
         stats = result.evidence["null_stats"]["delta_kl"]
299
-        assert stats["std"] >= 1e-6
300
-        # And the downstream z-score computation is finite, not inf.
302
+        # Std floor is still 1e-6 (preserved for valid-but-tight
303
+        # multi-seed nulls). What changed is the explicit
304
+        # ``degenerate`` flag on the stats dict — ``runs: 1`` → ``1.0``.
305
+        assert stats["std"] == 1e-6
306
+        assert stats["degenerate"] >= 0.5
307
+        # Downstream z_score now refuses rather than emit runaway values.
301308
         from dlm_sway.probes._zscore import z_score
302309
 
303
-        z = z_score(0.5, stats)
304
-        assert z is not None
305
-        assert np.isfinite(z)
310
+        assert z_score(0.5, stats) is None
306311
 
307312
     def test_per_kind_stats_published(self) -> None:
308313
         """Every calibrating kind gets its own (mean, std, n) triple."""