Python · 3682 bytes Raw Blame History
1 """End-to-end tests that every numeric probe threads null_stats correctly.
2
3 Covers: with stats → ``z_score`` field populated + verdict respects
4 ``assert_z_gte``; without stats → fixed-threshold verdict + the
5 ``(no calibration)`` annotation surfaces in the message.
6 """
7
8 from __future__ import annotations
9
10 import pytest
11
12 from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
13 from dlm_sway.core.result import Verdict
14 from dlm_sway.probes.base import RunContext, build_probe
15
16
def _backend() -> DummyDifferentialBackend:
    """Return a dummy differential backend with default responses on both sides.

    Both the ``base`` and ``ft`` slots receive their own freshly constructed
    ``DummyResponses()`` instance.
    """
    base_responses = DummyResponses()
    ft_responses = DummyResponses()
    return DummyDifferentialBackend(base=base_responses, ft=ft_responses)
19
20
class TestNoCalibrationAnnotation:
    """Probes run without null stats must say so in their result message."""

    @pytest.mark.parametrize(
        ("kind", "spec_kwargs"),
        [
            ("delta_kl", {"prompts": ["q1", "q2"]}),
            (
                "paraphrase_invariance",
                {"cases": [{"prompt": "q", "gold": "a", "paraphrases": ["p1", "p2"]}]},
            ),
            (
                "preference_flip",
                {
                    "triples": [
                        {"prompt": "q1", "chosen": "a", "rejected": "b"},
                        {"prompt": "q2", "chosen": "c", "rejected": "d"},
                        {"prompt": "q3", "chosen": "e", "rejected": "f"},
                        {"prompt": "q4", "chosen": "g", "rejected": "h"},
                    ]
                },
            ),
            ("calibration_drift", {"items_limit": 5}),
        ],
    )
    def test_no_calibration_note_in_message(self, kind: str, spec_kwargs: dict) -> None:
        """Build a probe of ``kind``, run it with a stats-free context, check the note.

        The assertions only apply when the probe returned PASS or FAIL together
        with a non-``None`` ``raw`` value; any other outcome is accepted as-is.
        """
        probe_config = {"name": "p", "kind": kind}
        probe_config.update(spec_kwargs)
        probe, spec = build_probe(probe_config)
        result = probe.run(spec, RunContext(backend=_backend()))

        # A PASS/FAIL verdict carrying a raw value means the probe went down
        # the fixed-threshold path, which is the path that must be annotated.
        decisive = result.verdict in (Verdict.PASS, Verdict.FAIL)
        if not (decisive and result.raw is not None):
            return

        assert "no calibration" in result.message.lower(), (
            f"{kind} did not surface the no-calibration annotation; message={result.message!r}"
        )
        assert result.z_score is None

    def test_section_internalization_no_calibration(self) -> None:
        """Same check for ``section_internalization``, which also needs sections in the context."""
        from dlm_sway.core.sections import Section

        first = Section(id="s1", kind="prose", content="alpha beta gamma.", tag=None)
        second = Section(id="s2", kind="prose", content="delta epsilon zeta.", tag=None)
        sections = [first, second]

        probe, spec = build_probe({"name": "p", "kind": "section_internalization"})
        ctx = RunContext(backend=_backend(), sections=sections)
        result = probe.run(spec, ctx)

        # Only a PASS/FAIL verdict with a raw value is subject to the check.
        decisive = result.verdict in (Verdict.PASS, Verdict.FAIL)
        if not (decisive and result.raw is not None):
            return

        assert "no calibration" in result.message.lower()
        assert result.z_score is None
74
75
class TestStatsThreadedToZScore:
    """Probes given ``null_stats`` through the run context must report a z-score."""

    def test_delta_kl_emits_z_score(self) -> None:
        """Run ``delta_kl`` with null stats supplied and check the z-score surfaces.

        Expects ``result.z_score`` to be set and the message to contain
        ``"vs null"``.
        """
        null_stats = {"delta_kl": {"mean": 0.0, "std": 0.01, "n": 3.0}}
        probe_config = {
            "name": "dk",
            "kind": "delta_kl",
            "prompts": ["p1", "p2"],
            # Permissive threshold so we always PASS.
            "assert_z_gte": -50.0,
        }
        probe, spec = build_probe(probe_config)

        result = probe.run(spec, RunContext(backend=_backend(), null_stats=null_stats))

        assert result.z_score is not None
        assert "vs null" in result.message