# NOTE: scraped page header preserved as a comment — "Python · 7350 bytes Raw Blame History"
1 """Unit tests for Sprint 42's external CLI judge runtime."""
2
3 from __future__ import annotations
4
5 import math
6 import subprocess
7 from pathlib import Path
8 from unittest.mock import patch
9
10 import pytest
11
12 from dlm.preference import (
13 CliJudge,
14 HfRewardModelJudge,
15 InvalidJudgeSpecError,
16 JudgeInvocationError,
17 JudgeUnavailableError,
18 SwayJudge,
19 build_judge,
20 )
21 from dlm.preference.judge import _combine_reasoning, _parse_cli_candidate_score
22
23
24 def _proc(
25 *,
26 returncode: int = 0,
27 stdout: str = "",
28 stderr: str = "",
29 ) -> subprocess.CompletedProcess[str]:
30 return subprocess.CompletedProcess(
31 args=["judge-bin"],
32 returncode=returncode,
33 stdout=stdout,
34 stderr=stderr,
35 )
36
37
class TestCliJudge:
    """Behavioral tests for the external-process CLI judge."""

    def test_blank_command_is_rejected(self) -> None:
        # A whitespace-only spec cannot produce an argv.
        with pytest.raises(InvalidJudgeSpecError, match="include a command"):
            CliJudge(" ")

    def test_empty_argv_after_split_is_rejected(self) -> None:
        # Even a non-blank spec is rejected when shlex yields no tokens.
        with patch("dlm.preference.judge.shlex.split", return_value=[]):
            with pytest.raises(InvalidJudgeSpecError, match="include a command"):
                CliJudge("judge-bin")

    def test_non_positive_timeout_is_rejected(self) -> None:
        with pytest.raises(ValueError, match="timeout must be > 0"):
            CliJudge("judge-bin", timeout=0.0)

    def test_scores_pair_via_two_json_round_trips(self) -> None:
        captured: list[str] = []

        def scripted_run(*args: object, **kwargs: object) -> subprocess.CompletedProcess[str]:
            # The judge must invoke the exact argv parsed from its spec.
            assert args[0] == ["judge-bin", "--json"]
            stdin_text = kwargs["input"]
            assert isinstance(stdin_text, str)
            captured.append(stdin_text)
            # First call scores candidate A, second scores candidate B.
            if len(captured) == 1:
                return _proc(stdout='{"score": 0.2, "reasoning": "too vague"}')
            return _proc(stdout='{"score": 0.9, "reasoning": "specific and correct"}')

        judge = CliJudge("judge-bin --json")
        with patch("dlm.preference.judge.subprocess.run", side_effect=scripted_run):
            score = judge.score_pair("What is DGEMM?", "bad", "good")

        assert score.score_a == pytest.approx(0.2)
        assert score.score_b == pytest.approx(0.9)
        assert score.preferred == "b"
        combined = score.reasoning or ""
        assert "a: too vague" in combined
        assert "b: specific and correct" in combined
        # Each round trip serializes the prompt plus one candidate as JSON.
        assert '"prompt": "What is DGEMM?"' in captured[0]
        assert '"candidate": "bad"' in captured[0]
        assert '"candidate": "good"' in captured[1]

    def test_non_zero_exit_raises(self) -> None:
        failing = _proc(returncode=7, stderr="bad model")
        with patch("dlm.preference.judge.subprocess.run", return_value=failing):
            with pytest.raises(JudgeInvocationError, match="exited 7: bad model"):
                CliJudge("judge-bin").score_pair("p", "a", "b")

    def test_invalid_json_raises(self) -> None:
        garbage = _proc(stdout="not-json")
        with patch("dlm.preference.judge.subprocess.run", return_value=garbage):
            with pytest.raises(JudgeInvocationError, match="invalid JSON"):
                CliJudge("judge-bin").score_pair("p", "a", "b")

    def test_missing_numeric_score_raises(self) -> None:
        no_score = _proc(stdout='{"reasoning": "oops"}')
        with patch("dlm.preference.judge.subprocess.run", return_value=no_score):
            with pytest.raises(JudgeInvocationError, match="numeric `score`"):
                CliJudge("judge-bin").score_pair("p", "a", "b")

    def test_missing_binary_raises_unavailable(self) -> None:
        boom = FileNotFoundError("judge-bin")
        with patch("dlm.preference.judge.subprocess.run", side_effect=boom):
            with pytest.raises(JudgeUnavailableError, match="not available on PATH"):
                CliJudge("judge-bin").score_pair("p", "a", "b")

    def test_timeout_raises_invocation_error(self) -> None:
        expired = subprocess.TimeoutExpired(cmd="judge-bin", timeout=1.5)
        with patch("dlm.preference.judge.subprocess.run", side_effect=expired):
            with pytest.raises(JudgeInvocationError, match="timed out after 1.5s"):
                CliJudge("judge-bin", timeout=1.5).score_pair("p", "a", "b")

    def test_oserror_raises_unavailable_error(self) -> None:
        denied = OSError("permission denied")
        with patch("dlm.preference.judge.subprocess.run", side_effect=denied):
            with pytest.raises(JudgeUnavailableError, match="could not start"):
                CliJudge("judge-bin").score_pair("p", "a", "b")
145
146
class TestCliJudgeHelpers:
    """Tests for the module-private CLI parsing/formatting helpers."""

    def test_empty_stdout_is_rejected(self) -> None:
        with pytest.raises(JudgeInvocationError, match="empty stdout"):
            _parse_cli_candidate_score(" ")

    def test_json_must_be_object(self) -> None:
        with pytest.raises(JudgeInvocationError, match="JSON object"):
            _parse_cli_candidate_score('["not", "an", "object"]')

    @pytest.mark.parametrize("score", [float("nan"), float("inf"), -float("inf")])
    def test_score_must_be_finite(self, score: float) -> None:
        # Render the non-finite float in JSON's non-standard literal form.
        if math.isnan(score):
            rendered = "NaN"
        elif score > 0:
            rendered = "Infinity"
        else:
            rendered = "-Infinity"
        with pytest.raises(JudgeInvocationError, match="must be finite"):
            _parse_cli_candidate_score(f'{{"score": {rendered}}}')

    def test_reasoning_must_be_string_when_present(self) -> None:
        with pytest.raises(JudgeInvocationError, match="must be a string"):
            _parse_cli_candidate_score('{"score": 1.0, "reasoning": 7}')

    @pytest.mark.parametrize(
        ("left", "right", "expected"),
        [
            ("why a", None, "a: why a"),
            (None, "why b", "b: why b"),
            ("why a", "why b", "a: why a | b: why b"),
            (None, None, None),
        ],
    )
    def test_combine_reasoning_formats_present_parts(
        self,
        left: str | None,
        right: str | None,
        expected: str | None,
    ) -> None:
        combined = _combine_reasoning(left, right)
        assert combined == expected
182
183
class TestBuildJudge:
    """Tests for the judge-reference factory."""

    def test_cli_ref_builds_concrete_cli_judge(self) -> None:
        built = build_judge("cli:judge-bin --json")

        assert isinstance(built, CliJudge)
        assert built.name == "cli:judge-bin --json"

    def test_hf_ref_builds_concrete_hf_judge(self) -> None:
        built = build_judge("hf:reward/model")

        assert isinstance(built, HfRewardModelJudge)
        assert built.name == "hf:reward/model"

    def test_sway_ref_builds_concrete_sway_judge(self) -> None:
        built = build_judge("sway", dlm_path=Path("/tmp/example.dlm"))

        assert isinstance(built, SwayJudge)
        assert built.name == "sway:preference_judge"

    def test_sway_ref_requires_dlm_path_context(self) -> None:
        # Without the .dlm path the sway judge cannot be constructed.
        with pytest.raises(JudgeUnavailableError, match="requires the .dlm path context"):
            build_judge("sway")