# NOTE: scraped page header preserved as a comment — "Python · 7350 bytes Raw Blame History"
1 """Unit tests for Sprint 42's external CLI judge runtime."""
2
3 from __future__ import annotations
4
5 import math
6 import subprocess
7 from pathlib import Path
8 from unittest.mock import patch
9
10 import pytest
11
12 from dlm.preference import (
13 CliJudge,
14 HfRewardModelJudge,
15 InvalidJudgeSpecError,
16 JudgeInvocationError,
17 JudgeUnavailableError,
18 SwayJudge,
19 build_judge,
20 )
21 from dlm.preference.judge import _combine_reasoning, _parse_cli_candidate_score
22
23
24 def _proc(
25 *,
26 returncode: int = 0,
27 stdout: str = "",
28 stderr: str = "",
29 ) -> subprocess.CompletedProcess[str]:
30 return subprocess.CompletedProcess(
31 args=["judge-bin"],
32 returncode=returncode,
33 stdout=stdout,
34 stderr=stderr,
35 )
36
37
class TestCliJudge:
    """Behavioral tests for the external-process CLI judge."""

    def test_blank_command_is_rejected(self) -> None:
        # A whitespace-only spec cannot produce an argv.
        with pytest.raises(InvalidJudgeSpecError, match="include a command"):
            CliJudge(" ")

    def test_empty_argv_after_split_is_rejected(self) -> None:
        # Even a non-blank spec is rejected when shlex yields no tokens.
        with patch("dlm.preference.judge.shlex.split", return_value=[]):
            with pytest.raises(InvalidJudgeSpecError, match="include a command"):
                CliJudge("judge-bin")

    def test_non_positive_timeout_is_rejected(self) -> None:
        with pytest.raises(ValueError, match="timeout must be > 0"):
            CliJudge("judge-bin", timeout=0.0)

    def test_scores_pair_via_two_json_round_trips(self) -> None:
        captured: list[str] = []

        def scripted_run(*args: object, **kwargs: object) -> subprocess.CompletedProcess[str]:
            # The judge must invoke the exact argv parsed from its spec.
            assert args[0] == ["judge-bin", "--json"]
            stdin_text = kwargs["input"]
            assert isinstance(stdin_text, str)
            captured.append(stdin_text)
            # First call scores candidate A, second scores candidate B.
            if len(captured) == 1:
                return _proc(stdout='{"score": 0.2, "reasoning": "too vague"}')
            return _proc(stdout='{"score": 0.9, "reasoning": "specific and correct"}')

        judge = CliJudge("judge-bin --json")
        with patch("dlm.preference.judge.subprocess.run", side_effect=scripted_run):
            score = judge.score_pair("What is DGEMM?", "bad", "good")

        assert score.score_a == pytest.approx(0.2)
        assert score.score_b == pytest.approx(0.9)
        assert score.preferred == "b"
        combined = score.reasoning or ""
        assert "a: too vague" in combined
        assert "b: specific and correct" in combined
        # Each round trip serializes the prompt plus one candidate as JSON.
        assert '"prompt": "What is DGEMM?"' in captured[0]
        assert '"candidate": "bad"' in captured[0]
        assert '"candidate": "good"' in captured[1]

    def test_non_zero_exit_raises(self) -> None:
        failing = _proc(returncode=7, stderr="bad model")
        with patch("dlm.preference.judge.subprocess.run", return_value=failing):
            with pytest.raises(JudgeInvocationError, match="exited 7: bad model"):
                CliJudge("judge-bin").score_pair("p", "a", "b")

    def test_invalid_json_raises(self) -> None:
        garbage = _proc(stdout="not-json")
        with patch("dlm.preference.judge.subprocess.run", return_value=garbage):
            with pytest.raises(JudgeInvocationError, match="invalid JSON"):
                CliJudge("judge-bin").score_pair("p", "a", "b")

    def test_missing_numeric_score_raises(self) -> None:
        no_score = _proc(stdout='{"reasoning": "oops"}')
        with patch("dlm.preference.judge.subprocess.run", return_value=no_score):
            with pytest.raises(JudgeInvocationError, match="numeric `score`"):
                CliJudge("judge-bin").score_pair("p", "a", "b")

    def test_missing_binary_raises_unavailable(self) -> None:
        boom = FileNotFoundError("judge-bin")
        with patch("dlm.preference.judge.subprocess.run", side_effect=boom):
            with pytest.raises(JudgeUnavailableError, match="not available on PATH"):
                CliJudge("judge-bin").score_pair("p", "a", "b")

    def test_timeout_raises_invocation_error(self) -> None:
        expired = subprocess.TimeoutExpired(cmd="judge-bin", timeout=1.5)
        with patch("dlm.preference.judge.subprocess.run", side_effect=expired):
            with pytest.raises(JudgeInvocationError, match="timed out after 1.5s"):
                CliJudge("judge-bin", timeout=1.5).score_pair("p", "a", "b")

    def test_oserror_raises_unavailable_error(self) -> None:
        denied = OSError("permission denied")
        with patch("dlm.preference.judge.subprocess.run", side_effect=denied):
            with pytest.raises(JudgeUnavailableError, match="could not start"):
                CliJudge("judge-bin").score_pair("p", "a", "b")
145
146
class TestCliJudgeHelpers:
    """Tests for the module-private CLI parsing/formatting helpers."""

    def test_empty_stdout_is_rejected(self) -> None:
        with pytest.raises(JudgeInvocationError, match="empty stdout"):
            _parse_cli_candidate_score(" ")

    def test_json_must_be_object(self) -> None:
        with pytest.raises(JudgeInvocationError, match="JSON object"):
            _parse_cli_candidate_score('["not", "an", "object"]')

    @pytest.mark.parametrize("score", [float("nan"), float("inf"), -float("inf")])
    def test_score_must_be_finite(self, score: float) -> None:
        # Render the non-finite float in JSON's non-standard literal form.
        if math.isnan(score):
            rendered = "NaN"
        elif score > 0:
            rendered = "Infinity"
        else:
            rendered = "-Infinity"
        with pytest.raises(JudgeInvocationError, match="must be finite"):
            _parse_cli_candidate_score(f'{{"score": {rendered}}}')

    def test_reasoning_must_be_string_when_present(self) -> None:
        with pytest.raises(JudgeInvocationError, match="must be a string"):
            _parse_cli_candidate_score('{"score": 1.0, "reasoning": 7}')

    @pytest.mark.parametrize(
        ("left", "right", "expected"),
        [
            ("why a", None, "a: why a"),
            (None, "why b", "b: why b"),
            ("why a", "why b", "a: why a | b: why b"),
            (None, None, None),
        ],
    )
    def test_combine_reasoning_formats_present_parts(
        self,
        left: str | None,
        right: str | None,
        expected: str | None,
    ) -> None:
        combined = _combine_reasoning(left, right)
        assert combined == expected
182
183
class TestBuildJudge:
    """Tests for the judge-reference factory."""

    def test_cli_ref_builds_concrete_cli_judge(self) -> None:
        built = build_judge("cli:judge-bin --json")

        assert isinstance(built, CliJudge)
        assert built.name == "cli:judge-bin --json"

    def test_hf_ref_builds_concrete_hf_judge(self) -> None:
        built = build_judge("hf:reward/model")

        assert isinstance(built, HfRewardModelJudge)
        assert built.name == "hf:reward/model"

    def test_sway_ref_builds_concrete_sway_judge(self) -> None:
        built = build_judge("sway", dlm_path=Path("/tmp/example.dlm"))

        assert isinstance(built, SwayJudge)
        assert built.name == "sway:preference_judge"

    def test_sway_ref_requires_dlm_path_context(self) -> None:
        # Without the .dlm path the sway judge cannot be constructed.
        with pytest.raises(JudgeUnavailableError, match="requires the .dlm path context"):
            build_judge("sway")