tenseleyflow/sway / 365378a

Browse files

suite/report: degenerate-null-calibration footer rollup (F02)

Authored by espadonne
SHA
365378abb06f6a882e85d654fb75da7bc9672d07
Parents
15d4314
Tree
cd40b1e

1 changed file

StatusFile+-
M src/dlm_sway/suite/report.py 62 0
src/dlm_sway/suite/report.pymodified
@@ -137,6 +137,37 @@ def collect_missing_extras(suite: SuiteResult) -> list[str]:
137137
     return sorted(found)
138138
 
139139
 
140
def collect_degenerate_null_kinds(suite: SuiteResult) -> list[str]:
    """Return the probe kinds whose null-calibration stats were flagged degenerate.

    ``null_adapter`` marks a kind's stats with ``degenerate: 1.0`` when
    the calibration ran but the baseline was too narrow for the z-score
    path to fire (``runs: 1``, or a multi-seed run whose raws collapsed
    to an effectively-zero variance — F02 from Audit 03). Unlike
    :func:`collect_null_opt_outs` (which surfaces probes that opted
    out at spec-build time), this surface catches the case where the
    null *did* run but wasn't useful. Both cases fall back to fixed
    thresholds; the report distinguishes them so users can act:
    ``opt_out`` → expected for probes like ``adapter_revert``;
    ``degenerate`` → bump ``runs:`` in the spec.

    Args:
        suite: The suite result to inspect. Only ``suite.probes`` (each
            entry's ``kind``) and ``suite.null_stats`` are read.

    Returns:
        The affected kind names, de-duplicated and sorted. Empty when the
        suite contains no ``null_adapter`` probe, when ``null_stats`` is
        missing or empty, or when no kind is flagged.
    """
    # Only report when a ``null_adapter`` probe is part of the suite. The
    # stats scan below does not depend on the individual probe, so it is
    # gated once here instead of being repeated per matching probe.
    if not any(probe.kind == "null_adapter" for probe in suite.probes):
        return []

    # ``null_adapter`` writes per-kind stats into
    # ``SuiteResult.null_stats``, not the probe's evidence — the
    # suite-level field is the canonical place the runner threads
    # calibration across probes.
    stats_by_kind = suite.null_stats or {}

    found: set[str] = set()
    for kind, kind_stats in stats_by_kind.items():
        if not isinstance(kind_stats, dict):
            continue
        flag = kind_stats.get("degenerate", 0.0)
        # The flag is stored as a float (1.0 == degenerate). A non-numeric
        # value (e.g. ``None``) is treated as "not flagged" rather than
        # letting the comparison raise TypeError while rendering a report.
        if isinstance(flag, (int, float)) and flag >= 0.5:
            found.add(kind)
    return sorted(found)
169
+
170
+
140171
 def collect_null_opt_outs(suite: SuiteResult) -> list[str]:
141172
     """Probe kinds that opted out of null calibration.
142173
 
@@ -266,6 +297,22 @@ def to_terminal(suite: SuiteResult, score: SwayScore, *, console: Console | None
266297
             )
267298
         )
268299
 
300
+    # F02 (Audit 03): null-calibration-degenerate rollup. Distinct from
301
+    # opt-outs — the null *did* run, but its baseline was too narrow
302
+    # (``runs: 1`` or coincidentally-identical seeds). Users see this
303
+    # and bump ``runs:`` in the spec; the fix is actionable.
304
+    degenerate = collect_degenerate_null_kinds(suite)
305
+    if degenerate:
306
+        c.print()
307
+        c.print(
308
+            Text(
309
+                f"{len(degenerate)} probe kind(s) had a degenerate null "
310
+                f"baseline (std ≈ 0, insufficient for z-scoring): "
311
+                f"{', '.join(degenerate)} — bump ``runs:`` in null_adapter spec.",
312
+                style="dim",
313
+            )
314
+        )
315
+
269316
     c.print()
270317
     footer_parts = [f"wall: {format_duration_s(suite.wall_seconds)}", f"sway {suite.sway_version}"]
271318
     if suite.determinism is not None:
@@ -534,6 +581,20 @@ def to_markdown(suite: SuiteResult, score: SwayScore) -> str:
534581
         for kind in opt_outs:
535582
             buf.write(f"- `{kind}`\n")
536583
 
584
+    # F02 (Audit 03) — degenerate null-calibration rollup.
585
+    degenerate = collect_degenerate_null_kinds(suite)
586
+    if degenerate:
587
+        buf.write("\n## Degenerate null calibration\n\n")
588
+        buf.write(
589
+            f"{len(degenerate)} probe kind(s) ran null_adapter but the "
590
+            f"resulting baseline was too narrow for z-scoring "
591
+            f"(std ≈ 0, typically `runs: 1` or coincidentally-matched "
592
+            f"seeds). Fix: bump `runs:` in the `null_adapter` spec "
593
+            f"entry. Affected kinds:\n\n"
594
+        )
595
+        for kind in degenerate:
596
+            buf.write(f"- `{kind}`\n")
597
+
537598
     # F07 — cluster_kl sub-line: expand the per-cluster breakdown so
538599
     # the reader can answer "which topic moved?" without cracking open
539600
     # the JSON. The row itself already carries ``k=N, spec=X.XX`` in
@@ -638,6 +699,7 @@ def _bar(v: float, *, width: int = 10) -> str:
638699
 
639700
 
640701
 __all__ = [
702
+    "collect_degenerate_null_kinds",
641703
     "collect_missing_extras",
642704
     "collect_null_opt_outs",
643705
     "format_duration_s",