@@ -137,6 +137,37 @@ def collect_missing_extras(suite: SuiteResult) -> list[str]: |
| 137 | 137 | return sorted(found) |
| 138 | 138 | |
| 139 | 139 | |
def collect_degenerate_null_kinds(suite: SuiteResult) -> list[str]:
    """Probe kinds whose null-calibration stats were flagged degenerate.

    ``null_adapter`` marks a kind's stats with ``degenerate: 1.0`` when
    the calibration ran but the baseline was too narrow for the z-score
    path to fire (``runs: 1``, or a multi-seed run whose raws collapsed
    to an effectively-zero variance — F02 from Audit 03). Unlike
    :func:`collect_null_opt_outs` (which surfaces probes that opted
    out at spec-build time), this surface catches the case where the
    null *did* run but wasn't useful. Both cases fall back to fixed
    thresholds; the report distinguishes them so users can act:
    ``opt_out`` → expected for probes like ``adapter_revert``;
    ``degenerate`` → bump ``runs:`` in the spec.

    Args:
        suite: Suite result to inspect. Only ``suite.probes`` (each
            probe's ``kind``) and ``suite.null_stats`` are read.

    Returns:
        Sorted, de-duplicated list of the ``null_stats`` keys whose
        stats dict carries a numeric ``degenerate`` flag ``>= 0.5``.
        Empty when the suite has no ``null_adapter`` probe or when
        ``null_stats`` is empty / ``None``.
    """
    # Only report when a ``null_adapter`` probe is part of the suite.
    # The probe itself is never inspected further, so a single
    # existence check replaces re-scanning the stats once per probe.
    if not any(probe.kind == "null_adapter" for probe in suite.probes):
        return []

    # ``null_adapter`` writes per-kind stats into
    # ``SuiteResult.null_stats``, not the probe's evidence — the
    # suite-level field is the canonical place the runner threads
    # calibration across probes.
    stats_by_kind = suite.null_stats or {}

    found: set[str] = set()
    for kind, kind_stats in stats_by_kind.items():
        if not isinstance(kind_stats, dict):
            continue
        flag = kind_stats.get("degenerate", 0.0)
        # A non-numeric flag (``None``, a string, ...) is treated as
        # "not degenerate" instead of raising ``TypeError`` from the
        # comparison and aborting report rendering.
        if isinstance(flag, (int, float)) and flag >= 0.5:
            found.add(kind)
    return sorted(found)
| 169 | + |
| 170 | + |
| 140 | 171 | def collect_null_opt_outs(suite: SuiteResult) -> list[str]: |
| 141 | 172 | """Probe kinds that opted out of null calibration. |
| 142 | 173 | |
@@ -266,6 +297,22 @@ def to_terminal(suite: SuiteResult, score: SwayScore, *, console: Console | None |
| 266 | 297 | ) |
| 267 | 298 | ) |
| 268 | 299 | |
| 300 | + # F02 (Audit 03): null-calibration-degenerate rollup. Distinct from |
| 301 | + # opt-outs — the null *did* run, but its baseline was too narrow |
| 302 | + # (``runs: 1`` or coincidentally-identical seeds). Users see this |
| 303 | + # and bump ``runs:`` in the spec; the fix is actionable. |
| 304 | + degenerate = collect_degenerate_null_kinds(suite) |
| 305 | + if degenerate: |
| 306 | + c.print() |
| 307 | + c.print( |
| 308 | + Text( |
| 309 | + f"{len(degenerate)} probe kind(s) had a degenerate null " |
| 310 | + f"baseline (std ≈ 0, insufficient for z-scoring): " |
| 311 | + f"{', '.join(degenerate)} — bump ``runs:`` in null_adapter spec.", |
| 312 | + style="dim", |
| 313 | + ) |
| 314 | + ) |
| 315 | + |
| 269 | 316 | c.print() |
| 270 | 317 | footer_parts = [f"wall: {format_duration_s(suite.wall_seconds)}", f"sway {suite.sway_version}"] |
| 271 | 318 | if suite.determinism is not None: |
@@ -534,6 +581,20 @@ def to_markdown(suite: SuiteResult, score: SwayScore) -> str: |
| 534 | 581 | for kind in opt_outs: |
| 535 | 582 | buf.write(f"- `{kind}`\n") |
| 536 | 583 | |
| 584 | + # F02 (Audit 03) — degenerate null-calibration rollup. |
| 585 | + degenerate = collect_degenerate_null_kinds(suite) |
| 586 | + if degenerate: |
| 587 | + buf.write("\n## Degenerate null calibration\n\n") |
| 588 | + buf.write( |
| 589 | + f"{len(degenerate)} probe kind(s) ran null_adapter but the " |
| 590 | + f"resulting baseline was too narrow for z-scoring " |
| 591 | + f"(std ≈ 0, typically `runs: 1` or coincidentally-matched " |
| 592 | + f"seeds). Fix: bump `runs:` in the `null_adapter` spec " |
| 593 | + f"entry. Affected kinds:\n\n" |
| 594 | + ) |
| 595 | + for kind in degenerate: |
| 596 | + buf.write(f"- `{kind}`\n") |
| 597 | + |
| 537 | 598 | # F07 — cluster_kl sub-line: expand the per-cluster breakdown so |
| 538 | 599 | # the reader can answer "which topic moved?" without cracking open |
| 539 | 600 | # the JSON. The row itself already carries ``k=N, spec=X.XX`` in |
@@ -638,6 +699,7 @@ def _bar(v: float, *, width: int = 10) -> str: |
| 638 | 699 | |
| 639 | 700 | |
| 640 | 701 | __all__ = [ |
| 702 | + "collect_degenerate_null_kinds", |
| 641 | 703 | "collect_missing_extras", |
| 642 | 704 | "collect_null_opt_outs", |
| 643 | 705 | "format_duration_s", |