@@ -191,7 +191,24 @@ class TestCalibrateSpec: |
| 191 | 191 | |
| 192 | 192 | class TestNullCalibrationEndToEnd: |
| 193 | 193 | def test_runner_threads_null_stats_to_external_perplexity(self) -> None: |
| 194 | | - """null_adapter → external_perplexity gets a z_score in the suite.""" |
| 194 | + """null_adapter → external_perplexity — the runner threads the |
| 195 | + per-kind null stats into the downstream probe's context. |
| 196 | + |
| 197 | + Post-F02 (Audit 03), the dummy backend's ``rolling_logprob`` |
| 198 | + isn't seed-sensitive, so null-calibration runs of |
| 199 | + ``external_perplexity`` produce identical raws across seeds. |
| 200 | + That's a legitimately degenerate null (``std == 0``), and the |
| 201 | + F02 fix now surfaces that as ``degenerate: 1.0`` in the stats |
| 202 | + dict. Downstream ``z_score`` correctly refuses to divide by a |
| 203 | + lifted std, and the probe takes the fixed-threshold fallback. |
| 204 | + |
| 205 | + What THIS test pins is the runner threading contract: regardless |
| 206 | + of whether the z-score fires, ``null_stats`` must reach the |
| 207 | + suite result, and the probe's message must surface the |
| 208 | + ``(no calibration for ...)`` tag when the null is degenerate. |
| 209 | + A regression in threading would drop ``null_stats`` from the |
| 210 | + suite result entirely. |
| 211 | + """ |
| 195 | 212 | from dlm_sway.suite.runner import run as run_suite |
| 196 | 213 | from dlm_sway.suite.spec import SwaySpec |
| 197 | 214 | |
@@ -209,7 +226,7 @@ class TestNullCalibrationEndToEnd: |
| 209 | 226 | "name": "ext", |
| 210 | 227 | "kind": "external_perplexity", |
| 211 | 228 | "max_chunks": 3, |
| 212 | | - "assert_z_gte": -100.0, # permissive |
| 229 | + "assert_mean_delta_gte": -100.0, # permissive fixed threshold |
| 213 | 230 | }, |
| 214 | 231 | ], |
| 215 | 232 | } |
@@ -219,11 +236,16 @@ class TestNullCalibrationEndToEnd: |
| 219 | 236 | null_result = result.probes[0] |
| 220 | 237 | ext_result = result.probes[1] |
| 221 | 238 | assert null_result.verdict == Verdict.PASS |
| 222 | | - # External-perplexity should have taken the z-score path because |
| 223 | | - # null_adapter populated per-kind stats for it. |
| 224 | | - assert ext_result.z_score is not None, ( |
| 225 | | - "external_perplexity should have z-scored against null baseline, " |
| 226 | | - f"got evidence={ext_result.evidence}, message={ext_result.message}" |
| 227 | | - ) |
| 228 | | - # Raw z-score path puts "higher-is-better" wording in the message. |
| 229 | | - assert "higher-is-better" in (ext_result.message or "") |
| 239 | + |
| 240 | + # F02 — the runner threads null_stats into suite.null_stats |
| 241 | + # even when the null is degenerate. The probe sees the stats |
| 242 | + # dict (with ``degenerate: 1.0``) and chooses the fallback |
| 243 | + # path itself. |
| 244 | + assert "external_perplexity" in result.null_stats |
| 245 | + ext_null = result.null_stats["external_perplexity"] |
| 246 | + assert ext_null.get("degenerate", 0.0) >= 0.5 |
| 247 | + # Probe fell back to fixed thresholds; z_score is None, and |
| 248 | + # the message carries the (no calibration) tag the S02 report |
| 249 | + # layer uses to annotate the row. |
| 250 | + assert ext_result.z_score is None |
| 251 | + assert "no calibration for external_perplexity" in (ext_result.message or "") |