| 1 | """End-to-end tests that every numeric probe threads null_stats correctly. |
| 2 | |
| 3 | Covers: with stats → ``z_score`` field populated + verdict respects |
| 4 | ``assert_z_gte``; without stats → fixed-threshold verdict + the |
| 5 | ``(no calibration)`` annotation surfaces in the message. |
| 6 | """ |
| 7 | |
| 8 | from __future__ import annotations |
| 9 | |
| 10 | import pytest |
| 11 | |
| 12 | from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses |
| 13 | from dlm_sway.core.result import Verdict |
| 14 | from dlm_sway.probes.base import RunContext, build_probe |
| 15 | |
| 16 | |
def _backend() -> DummyDifferentialBackend:
    """Return a dummy differential backend whose ``base`` and ``ft`` sides each get a fresh default ``DummyResponses``."""
    base_side = DummyResponses()
    ft_side = DummyResponses()
    return DummyDifferentialBackend(base=base_side, ft=ft_side)
| 19 | |
| 20 | |
| 21 | class TestNoCalibrationAnnotation: |
| 22 | """With no null stats in the run context, PASS/FAIL probe messages must carry the ``no calibration`` note.""" |
| 23 | |
| 24 | @pytest.mark.parametrize( |
| 25 | ("kind", "spec_kwargs"), |
| 26 | [ |
| 27 | ("delta_kl", {"prompts": ["q1", "q2"]}), |
| 28 | ( |
| 29 | "paraphrase_invariance", |
| 30 | { |
| 31 | "cases": [ |
| 32 | {"prompt": "q", "gold": "a", "paraphrases": ["p1", "p2"]}, |
| 33 | ] |
| 34 | }, |
| 35 | ), |
| 36 | ( |
| 37 | "preference_flip", |
| 38 | { |
| 39 | "triples": [ |
| 40 | {"prompt": "q1", "chosen": "a", "rejected": "b"}, |
| 41 | {"prompt": "q2", "chosen": "c", "rejected": "d"}, |
| 42 | {"prompt": "q3", "chosen": "e", "rejected": "f"}, |
| 43 | {"prompt": "q4", "chosen": "g", "rejected": "h"}, |
| 44 | ] |
| 45 | }, |
| 46 | ), |
| 47 | ("calibration_drift", {"items_limit": 5}), |
| 48 | ], |
| 49 | ) |
| 50 | def test_no_calibration_note_in_message(self, kind: str, spec_kwargs: dict) -> None: |
| 51 | probe, spec = build_probe({"name": "p", "kind": kind, **spec_kwargs}) |
| 52 | ctx = RunContext(backend=_backend()) |
| 53 | result = probe.run(spec, ctx) |
| 54 | # Only a PASS/FAIL verdict that also carries a raw value took the fixed-threshold path; that is the case that must surface the annotation. |
| 55 | # NOTE(review): any other outcome skips the annotation check, so it can pass vacuously - confirm the dummy backend actually reaches PASS/FAIL for each kind. |
| 56 | if result.verdict in (Verdict.PASS, Verdict.FAIL) and result.raw is not None: |
| 57 | assert "no calibration" in result.message.lower(), ( |
| 58 | f"{kind} did not surface the no-calibration annotation; message={result.message!r}" |
| 59 | ) |
| 60 | assert result.z_score is None |
| 61 | |
| 62 | def test_section_internalization_no_calibration(self) -> None: |
| 63 | from dlm_sway.core.sections import Section |
| 64 | |
| 65 | sections = [ |
| 66 | Section(id="s1", kind="prose", content="alpha beta gamma.", tag=None), |
| 67 | Section(id="s2", kind="prose", content="delta epsilon zeta.", tag=None), |
| 68 | ] |
| 69 | probe, spec = build_probe({"name": "p", "kind": "section_internalization"}) |
| 70 | result = probe.run(spec, RunContext(backend=_backend(), sections=sections)) |
| 71 | if result.verdict in (Verdict.PASS, Verdict.FAIL) and result.raw is not None: |
| 72 | assert "no calibration" in result.message.lower() |
| 73 | assert result.z_score is None |
| 74 | |
| 75 | |
class TestStatsThreadedToZScore:
    """Numeric probes given ``ctx.null_stats`` must z-score and fill in ``z_score``."""

    def test_delta_kl_emits_z_score(self) -> None:
        """A ``delta_kl`` run with null stats reports a z-score and a ``vs null`` message."""
        # The -50.0 floor is deliberately permissive so we always PASS.
        probe_config = {
            "name": "dk",
            "kind": "delta_kl",
            "prompts": ["p1", "p2"],
            "assert_z_gte": -50.0,
        }
        probe, spec = build_probe(probe_config)

        # Null distribution for this probe kind, keyed by the probe's ``kind``.
        delta_kl_null = {"mean": 0.0, "std": 0.01, "n": 3.0}
        stats = {"delta_kl": delta_kl_null}
        ctx = RunContext(backend=_backend(), null_stats=stats)

        result = probe.run(spec, ctx)

        assert result.z_score is not None
        assert "vs null" in result.message