sway Public

Watch 0 Fork 0 Star 0

Python · 3682 bytes Raw Blame History

  
        1
        """End-to-end tests that every numeric probe threads null_stats correctly.
      
        2
        
        3
        Covers: with stats → ``z_score`` field populated + verdict respects
      
        4
        ``assert_z_gte``; without stats → fixed-threshold verdict + the
      
        5
        ``(no calibration)`` annotation surfaces in the message.
      
        6
        """
      
        7
        
        8
        from __future__ import annotations
      
        9
        
        10
        import pytest
      
        11
        
        12
        from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
      
        13
        from dlm_sway.core.result import Verdict
      
        14
        from dlm_sway.probes.base import RunContext, build_probe
      
        15
        
        16
        
        17
        def _backend() -> DummyDifferentialBackend:
            """Return a dummy differential backend for the tests in this module.

            Both the ``base`` and ``ft`` sides are default-constructed
            ``DummyResponses`` objects.
            """
            base_side = DummyResponses()
            ft_side = DummyResponses()
            return DummyDifferentialBackend(base=base_side, ft=ft_side)
      
        19
        
        20
        
        21
        class TestNoCalibrationAnnotation:
            """Probes run without null stats must say so in their message.

            Each test builds a probe, runs it against the dummy backend with a
            ``RunContext`` that carries no ``null_stats``, and -- when the probe
            returned a PASS/FAIL verdict together with a raw value -- checks that
            the message contains "no calibration" and that ``z_score`` is ``None``.
            """

            @pytest.mark.parametrize(
                ("kind", "spec_kwargs"),
                [
                    ("delta_kl", {"prompts": ["q1", "q2"]}),
                    (
                        "paraphrase_invariance",
                        {"cases": [{"prompt": "q", "gold": "a", "paraphrases": ["p1", "p2"]}]},
                    ),
                    (
                        "preference_flip",
                        {
                            "triples": [
                                {"prompt": "q1", "chosen": "a", "rejected": "b"},
                                {"prompt": "q2", "chosen": "c", "rejected": "d"},
                                {"prompt": "q3", "chosen": "e", "rejected": "f"},
                                {"prompt": "q4", "chosen": "g", "rejected": "h"},
                            ]
                        },
                    ),
                    ("calibration_drift", {"items_limit": 5}),
                ],
            )
            def test_no_calibration_note_in_message(self, kind: str, spec_kwargs: dict) -> None:
                """Run one parametrized probe kind without stats and check the annotation.

                Args:
                    kind: Probe kind passed to ``build_probe``.
                    spec_kwargs: Extra spec fields merged into the probe config.
                """
                probe_config = {"name": "p", "kind": kind, **spec_kwargs}
                probe, spec = build_probe(probe_config)
                outcome = probe.run(spec, RunContext(backend=_backend()))

                # Only a PASS/FAIL verdict that carries a raw value means the probe
                # went down the fixed-threshold path; otherwise there is nothing
                # to check for this kind.
                if outcome.verdict not in (Verdict.PASS, Verdict.FAIL) or outcome.raw is None:
                    return

                assert "no calibration" in outcome.message.lower(), (
                    f"{kind} did not surface the no-calibration annotation; "
                    f"message={outcome.message!r}"
                )
                assert outcome.z_score is None

            def test_section_internalization_no_calibration(self) -> None:
                """Same check for ``section_internalization``, which needs sections in the context."""
                from dlm_sway.core.sections import Section

                sections = [
                    Section(id=section_id, kind="prose", content=text, tag=None)
                    for section_id, text in (
                        ("s1", "alpha beta gamma."),
                        ("s2", "delta epsilon zeta."),
                    )
                ]
                probe, spec = build_probe({"name": "p", "kind": "section_internalization"})
                run_ctx = RunContext(backend=_backend(), sections=sections)
                outcome = probe.run(spec, run_ctx)

                # Skip the assertions unless the probe reached a PASS/FAIL verdict
                # with a raw value.
                if outcome.verdict not in (Verdict.PASS, Verdict.FAIL) or outcome.raw is None:
                    return

                assert "no calibration" in outcome.message.lower()
                assert outcome.z_score is None
      
        74
        
        75
        
        76
        class TestStatsThreadedToZScore:
            """When ``null_stats`` is supplied, a numeric probe reports a z-score."""

            def test_delta_kl_emits_z_score(self) -> None:
                """Run ``delta_kl`` with null stats and check ``z_score`` and the message.

                The context carries a ``"delta_kl"`` entry with ``mean``, ``std`` and
                ``n``; the test asserts that the result has a non-``None`` ``z_score``
                and that its message contains "vs null".
                """
                null_stats = {"delta_kl": {"mean": 0.0, "std": 0.01, "n": 3.0}}
                probe_config = {
                    "name": "dk",
                    "kind": "delta_kl",
                    "prompts": ["p1", "p2"],
                    # Very low z-score floor, intended to be permissive.
                    "assert_z_gte": -50.0,
                }
                probe, spec = build_probe(probe_config)
                run_ctx = RunContext(backend=_backend(), null_stats=null_stats)
                outcome = probe.run(spec, run_ctx)

                assert outcome.z_score is not None
                assert "vs null" in outcome.message

1	"""End-to-end tests that every numeric probe threads null_stats correctly.
2
3	Covers: with stats → ``z_score`` field populated + verdict respects
4	``assert_z_gte``; without stats → fixed-threshold verdict + the
5	``(no calibration)`` annotation surfaces in the message.
6	"""
7
8	from __future__ import annotations
9
10	import pytest
11
12	from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
13	from dlm_sway.core.result import Verdict
14	from dlm_sway.probes.base import RunContext, build_probe
15
16
def _backend() -> DummyDifferentialBackend:
    """Return a dummy differential backend for the tests in this module.

    Both the ``base`` and ``ft`` sides are default-constructed
    ``DummyResponses`` objects.
    """
    base_side = DummyResponses()
    ft_side = DummyResponses()
    return DummyDifferentialBackend(base=base_side, ft=ft_side)
19
20
class TestNoCalibrationAnnotation:
    """Probes run without null stats must say so in their message.

    Each test builds a probe, runs it against the dummy backend with a
    ``RunContext`` that carries no ``null_stats``, and -- when the probe
    returned a PASS/FAIL verdict together with a raw value -- checks that
    the message contains "no calibration" and that ``z_score`` is ``None``.
    """

    @pytest.mark.parametrize(
        ("kind", "spec_kwargs"),
        [
            ("delta_kl", {"prompts": ["q1", "q2"]}),
            (
                "paraphrase_invariance",
                {"cases": [{"prompt": "q", "gold": "a", "paraphrases": ["p1", "p2"]}]},
            ),
            (
                "preference_flip",
                {
                    "triples": [
                        {"prompt": "q1", "chosen": "a", "rejected": "b"},
                        {"prompt": "q2", "chosen": "c", "rejected": "d"},
                        {"prompt": "q3", "chosen": "e", "rejected": "f"},
                        {"prompt": "q4", "chosen": "g", "rejected": "h"},
                    ]
                },
            ),
            ("calibration_drift", {"items_limit": 5}),
        ],
    )
    def test_no_calibration_note_in_message(self, kind: str, spec_kwargs: dict) -> None:
        """Run one parametrized probe kind without stats and check the annotation.

        Args:
            kind: Probe kind passed to ``build_probe``.
            spec_kwargs: Extra spec fields merged into the probe config.
        """
        probe_config = {"name": "p", "kind": kind, **spec_kwargs}
        probe, spec = build_probe(probe_config)
        outcome = probe.run(spec, RunContext(backend=_backend()))

        # Only a PASS/FAIL verdict that carries a raw value means the probe
        # went down the fixed-threshold path; otherwise there is nothing
        # to check for this kind.
        if outcome.verdict not in (Verdict.PASS, Verdict.FAIL) or outcome.raw is None:
            return

        assert "no calibration" in outcome.message.lower(), (
            f"{kind} did not surface the no-calibration annotation; "
            f"message={outcome.message!r}"
        )
        assert outcome.z_score is None

    def test_section_internalization_no_calibration(self) -> None:
        """Same check for ``section_internalization``, which needs sections in the context."""
        from dlm_sway.core.sections import Section

        sections = [
            Section(id=section_id, kind="prose", content=text, tag=None)
            for section_id, text in (
                ("s1", "alpha beta gamma."),
                ("s2", "delta epsilon zeta."),
            )
        ]
        probe, spec = build_probe({"name": "p", "kind": "section_internalization"})
        run_ctx = RunContext(backend=_backend(), sections=sections)
        outcome = probe.run(spec, run_ctx)

        # Skip the assertions unless the probe reached a PASS/FAIL verdict
        # with a raw value.
        if outcome.verdict not in (Verdict.PASS, Verdict.FAIL) or outcome.raw is None:
            return

        assert "no calibration" in outcome.message.lower()
        assert outcome.z_score is None
74
75
class TestStatsThreadedToZScore:
    """When ``null_stats`` is supplied, a numeric probe reports a z-score."""

    def test_delta_kl_emits_z_score(self) -> None:
        """Run ``delta_kl`` with null stats and check ``z_score`` and the message.

        The context carries a ``"delta_kl"`` entry with ``mean``, ``std`` and
        ``n``; the test asserts that the result has a non-``None`` ``z_score``
        and that its message contains "vs null".
        """
        null_stats = {"delta_kl": {"mean": 0.0, "std": 0.01, "n": 3.0}}
        probe_config = {
            "name": "dk",
            "kind": "delta_kl",
            "prompts": ["p1", "p2"],
            # Very low z-score floor, intended to be permissive.
            "assert_z_gte": -50.0,
        }
        probe, spec = build_probe(probe_config)
        run_ctx = RunContext(backend=_backend(), null_stats=null_stats)
        outcome = probe.run(spec, run_ctx)

        assert outcome.z_score is not None
        assert "vs null" in outcome.message