tenseleyflow/sway / 3134089

Browse files

tests/ext_ppl: assert runner threads null_stats even when degenerate (F02)

Authored by espadonne
SHA
313408996026652909933e890bc233f7fbd7021a
Parents
8f65acd
Tree
ac72be6

1 changed file

Status | File | + | -
M tests/unit/test_probe_external_perplexity.py 32 10
tests/unit/test_probe_external_perplexity.py (modified)
@@ -191,7 +191,24 @@ class TestCalibrateSpec:
191191
 
192192
 class TestNullCalibrationEndToEnd:
193193
     def test_runner_threads_null_stats_to_external_perplexity(self) -> None:
194
-        """null_adapter → external_perplexity gets a z_score in the suite."""
194
+        """null_adapter → external_perplexity — the runner threads the
195
+        per-kind null stats into the downstream probe's context.
196
+
197
+        Post-F02 (Audit 03), the dummy backend's ``rolling_logprob``
198
+        isn't seed-sensitive, so null-calibration runs of
199
+        ``external_perplexity`` produce identical raws across seeds.
200
+        That's a legitimately-degenerate null (``std == 0``), and the
201
+        F02 fix now surfaces that as ``degenerate: 1.0`` in the stats
202
+        dict. Downstream ``z_score`` correctly refuses to divide by a
203
+        lifted std, and the probe takes the fixed-threshold fallback.
204
+
205
+        What THIS test pins is the runner threading contract: regardless
206
+        of whether the z-score fires, ``null_stats`` must reach the
207
+        suite result + the probe's message must surface the
208
+        ``(no calibration for ...)`` tag when the null is degenerate.
209
+        A regression in threading would drop ``null_stats`` from the
210
+        suite result entirely.
211
+        """
195212
         from dlm_sway.suite.runner import run as run_suite
196213
         from dlm_sway.suite.spec import SwaySpec
197214
 
@@ -209,7 +226,7 @@ class TestNullCalibrationEndToEnd:
209226
                         "name": "ext",
210227
                         "kind": "external_perplexity",
211228
                         "max_chunks": 3,
212
-                        "assert_z_gte": -100.0,  # permissive
229
+                        "assert_mean_delta_gte": -100.0,  # permissive fixed threshold
213230
                     },
214231
                 ],
215232
             }
@@ -219,11 +236,16 @@ class TestNullCalibrationEndToEnd:
219236
         null_result = result.probes[0]
220237
         ext_result = result.probes[1]
221238
         assert null_result.verdict == Verdict.PASS
222
-        # External-perplexity should have taken the z-score path because
223
-        # null_adapter populated per-kind stats for it.
224
-        assert ext_result.z_score is not None, (
225
-            "external_perplexity should have z-scored against null baseline, "
226
-            f"got evidence={ext_result.evidence}, message={ext_result.message}"
227
-        )
228
-        # Raw z-score path puts "higher-is-better" wording in the message.
229
-        assert "higher-is-better" in (ext_result.message or "")
239
+
240
+        # F02 — the runner threads null_stats into suite.null_stats
241
+        # even when the null is degenerate. The probe sees the stats
242
+        # dict (with ``degenerate: 1.0``) and chooses the fallback
243
+        # path itself.
244
+        assert "external_perplexity" in result.null_stats
245
+        ext_null = result.null_stats["external_perplexity"]
246
+        assert ext_null.get("degenerate", 0.0) >= 0.5
247
+        # Probe fell back to fixed thresholds; z_score is None, and
248
+        # the message carries the (no calibration) tag the S02 report
249
+        # layer uses to annotate the row.
250
+        assert ext_result.z_score is None
251
+        assert "no calibration for external_perplexity" in (ext_result.message or "")