Python · 9860 bytes Raw Blame History
1 """Unit tests for Sprint 42's pure preference-mining backend."""
2
3 from __future__ import annotations
4
5 from collections import deque
6
7 import pytest
8
9 from dlm.doc.parser import parse_text
10 from dlm.preference import (
11 PreferenceMineSkipReason,
12 build_mine_plan,
13 render_mine_plan,
14 )
15 from dlm.preference.judge import PairScore
16 from dlm.preference.mine import _best_pair, _first_line, _resolve_pair, _unique_nonempty
17
18 _FRONTMATTER = """---
19 dlm_id: 01KPQ9X1000000000000000000
20 dlm_version: 14
21 base_model: smollm2-135m
22 ---
23 """
24
25
class StubBackend:
    """Scripted generation backend that replays canned responses per prompt."""

    def __init__(self, responses: dict[str, list[str]]) -> None:
        """Store a private FIFO queue of responses for every prompt.

        Each list is copied into its own deque, so the caller's lists are
        never mutated when responses are consumed.
        """
        queues = {}
        for prompt, items in responses.items():
            queues[prompt] = deque(items)
        self._responses = queues

    def generate(self, prompt: str, **_gen_kwargs: object) -> str:
        """Return the next queued response for *prompt*.

        Any generation keyword arguments are accepted and ignored. The dict
        lookup raises ``KeyError`` for a prompt that was never scripted, and
        ``popleft`` raises ``IndexError`` once the prompt's queue is empty.
        """
        pending = self._responses[prompt]
        return pending.popleft()
32
33
class StubJudge:
    """Scripted judge that returns a pre-registered score for an exact triple."""

    # Identifier the tests expect to find recorded on mined sections.
    name = "stub:judge"
    # NOTE(review): presumably the default score-margin threshold used when the
    # caller passes none — confirm against build_mine_plan.
    suggested_threshold = 0.10

    def __init__(self, scores: dict[tuple[str, str, str], PairScore]) -> None:
        """Keep the mapping of ``(prompt, candidate_a, candidate_b)`` to score."""
        self._scores = scores

    def score_pair(self, prompt: str, candidate_a: str, candidate_b: str) -> PairScore:
        """Look up the score registered for this exact, ordered triple.

        Raises ``KeyError`` when the triple (in this candidate order) was not
        registered.
        """
        lookup_key = (prompt, candidate_a, candidate_b)
        return self._scores[lookup_key]
43
44
def _parsed(body: str):
    """Parse *body* as a full document by prepending the shared frontmatter."""
    document_text = _FRONTMATTER + body
    return parse_text(document_text)
47
48
class TestBuildMinePlan:
    """End-to-end checks of ``build_mine_plan`` using scripted backend/judge stubs."""

    # Smallest fixture body used by several tests: one instruction section
    # whose prompt is "question?".
    _SINGLE_QA_BODY = "::instruction::\n### Q\nquestion?\n### A\nreference\n"

    def test_validates_numeric_limits(self) -> None:
        """Every out-of-range numeric argument is rejected with a ValueError."""
        doc = _parsed(self._SINGLE_QA_BODY)
        backend = StubBackend({"question?": ["one", "two"]})
        judge = StubJudge({("question?", "one", "two"): PairScore(score_a=1.0, score_b=0.0)})

        # (expected message pattern, keyword arguments for the offending call)
        invalid_calls = (
            ("samples must be >= 2", {"samples": 1}),
            ("max_pairs must be >= 1", {"samples": 2, "max_pairs": 0}),
            ("threshold must be >= 0.0", {"samples": 2, "threshold": -0.1}),
            ("max_new_tokens must be >= 1", {"samples": 2, "max_new_tokens": 0}),
        )
        for message, overrides in invalid_calls:
            with pytest.raises(ValueError, match=message):
                build_mine_plan(doc, backend, judge, mined_run_id=1, **overrides)

    def test_materializes_auto_mined_preference_section(self) -> None:
        """A clear winner yields one addition carrying the full mining metadata."""
        doc = _parsed(self._SINGLE_QA_BODY)
        backend = StubBackend({"question?": ["bad answer", "good answer"]})
        verdict = PairScore(
            score_a=0.1,
            score_b=0.9,
            reasoning="good answer wins clearly",
        )
        judge = StubJudge({("question?", "bad answer", "good answer"): verdict})

        plan = build_mine_plan(
            doc,
            backend,
            judge,
            mined_run_id=7,
            samples=2,
            mined_at="2026-04-23T20:00:00Z",
        )

        assert len(plan.additions) == 1
        assert plan.skipped == ()

        addition = plan.additions[0]
        assert addition.chosen == "good answer"
        assert addition.rejected == "bad answer"
        assert addition.source.prompt == "question?"

        section = addition.section
        assert section.auto_mined is True
        assert section.judge_name == "stub:judge"
        assert section.judge_score_chosen == 0.9
        assert section.judge_score_rejected == 0.1
        assert section.mined_at == "2026-04-23T20:00:00Z"
        assert section.mined_run_id == 7
        assert section.content == (
            "### Prompt\nquestion?\n"
            "### Chosen\ngood answer\n"
            "### Rejected\nbad answer"
        )

    def test_probe_markers_are_normalized_and_duplicate_prompts_skip(self) -> None:
        """The ``!probe`` heading marker is dropped and a repeated prompt is skipped."""
        probe_section = (
            "::instruction::\n### Q !probe\nWhat is DGEMM?\n### A\nA matrix multiply.\n"
        )
        repeat_section = (
            "::instruction::\n### Q\nWhat is DGEMM?\n### A\nStill a matrix multiply.\n"
        )
        # Sections are separated by a single blank line.
        doc = _parsed(probe_section + "\n" + repeat_section)
        backend = StubBackend({"What is DGEMM?": ["weak", "strong"]})
        verdict = PairScore(score_a=0.2, score_b=0.8)
        judge = StubJudge({("What is DGEMM?", "weak", "strong"): verdict})

        plan = build_mine_plan(doc, backend, judge, mined_run_id=3, samples=2)

        assert len(plan.additions) == 1
        assert len(plan.skipped) == 1
        skip = plan.skipped[0]
        assert skip.reason is PreferenceMineSkipReason.DUPLICATE_PROMPT
        assert plan.additions[0].source.prompt == "What is DGEMM?"

    def test_existing_preference_is_not_remined(self) -> None:
        """A prompt that already has a preference section is skipped, not re-mined."""
        existing_preference = (
            "::preference::\n"
            "### Prompt\nquestion?\n"
            "### Chosen\ngood answer\n"
            "### Rejected\nbad answer\n"
        )
        doc = _parsed(self._SINGLE_QA_BODY + "\n" + existing_preference)
        backend = StubBackend({"question?": ["bad answer", "good answer"]})
        verdict = PairScore(score_a=0.2, score_b=0.9)
        judge = StubJudge({("question?", "bad answer", "good answer"): verdict})

        plan = build_mine_plan(doc, backend, judge, mined_run_id=5, samples=2)

        assert plan.additions == ()
        assert len(plan.skipped) == 1
        skip = plan.skipped[0]
        assert skip.reason is PreferenceMineSkipReason.ALREADY_PRESENT

    def test_near_identical_candidates_are_rejected(self) -> None:
        """Candidates differing only in final punctuation are skipped as too similar."""
        doc = _parsed(self._SINGLE_QA_BODY)
        stem = "This answer explains the algorithm step by step in a calm, direct tone"
        backend = StubBackend({"question?": [stem + ".", stem + "!"]})
        # No scores registered: the stub judge raises KeyError if it is consulted.
        judge = StubJudge({})

        plan = build_mine_plan(doc, backend, judge, mined_run_id=9, samples=2)

        assert plan.additions == ()
        assert len(plan.skipped) == 1
        skip = plan.skipped[0]
        assert skip.reason is PreferenceMineSkipReason.TOO_SIMILAR

    def test_below_threshold_pairs_are_skipped_and_rendered(self) -> None:
        """A narrow score margin is skipped and shows up in the rendered plan."""
        doc = _parsed(self._SINGLE_QA_BODY)
        backend = StubBackend({"question?": ["candidate one", "candidate two"]})
        verdict = PairScore(score_a=0.52, score_b=0.48)
        judge = StubJudge({("question?", "candidate one", "candidate two"): verdict})

        plan = build_mine_plan(doc, backend, judge, mined_run_id=11, samples=2)
        rendered = render_mine_plan(plan)

        assert plan.additions == ()
        assert len(plan.skipped) == 1
        skip = plan.skipped[0]
        assert skip.reason is PreferenceMineSkipReason.BELOW_THRESHOLD
        assert "preference mine plan: 0 add, 1 skip" in rendered
        assert "below_threshold" in rendered

    def test_malformed_instruction_section_is_reported(self) -> None:
        """An instruction section without an answer heading is reported as malformed."""
        doc = _parsed("::instruction::\n### Q\nunterminated question\n")
        # Empty stubs: any generate/score call would raise KeyError.
        backend = StubBackend({})
        judge = StubJudge({})

        plan = build_mine_plan(doc, backend, judge, mined_run_id=1, samples=2)

        assert plan.additions == ()
        assert len(plan.skipped) == 1
        skip = plan.skipped[0]
        assert skip.reason is PreferenceMineSkipReason.MALFORMED_INSTRUCTION

    def test_stops_collecting_once_max_pairs_is_reached(self) -> None:
        """With ``max_pairs=1`` only the first section's pair is kept."""
        first_section = "::instruction::\n### Q\nquestion one?\n### A\nreference\n"
        second_section = "::instruction::\n### Q\nquestion two?\n### A\nreference\n"
        doc = _parsed(first_section + "\n" + second_section)
        backend = StubBackend(
            {
                "question one?": ["bad one", "good one"],
                "question two?": ["bad two", "good two"],
            }
        )
        judge = StubJudge(
            {
                ("question one?", "bad one", "good one"): PairScore(score_a=0.1, score_b=0.9),
                ("question two?", "bad two", "good two"): PairScore(score_a=0.1, score_b=0.9),
            }
        )

        plan = build_mine_plan(
            doc,
            backend,
            judge,
            mined_run_id=4,
            samples=2,
            max_pairs=1,
        )

        assert len(plan.additions) == 1
        assert plan.additions[0].source.prompt == "question one?"

    def test_insufficient_variety_is_reported(self) -> None:
        """Blank and duplicate samples leave too few candidates to form a pair."""
        doc = _parsed(self._SINGLE_QA_BODY)
        samples = [" same ", "", "same", " "]
        backend = StubBackend({"question?": samples})
        judge = StubJudge({})

        plan = build_mine_plan(doc, backend, judge, mined_run_id=6, samples=4)

        assert plan.additions == ()
        assert len(plan.skipped) == 1
        skip = plan.skipped[0]
        assert skip.reason is PreferenceMineSkipReason.INSUFFICIENT_VARIETY
        assert "need at least 2 unique non-empty candidates" in skip.detail
247
248
class TestMineHelpers:
    """Direct checks of the private helpers imported from ``dlm.preference.mine``."""

    def test_unique_nonempty_strips_blanks_and_duplicates(self) -> None:
        """Blank entries vanish and whitespace-only differences collapse."""
        raw = ["", " alpha ", "alpha", "beta", " "]
        cleaned = _unique_nonempty(raw)
        assert cleaned == ["alpha", "beta"]

    def test_best_pair_skips_ties(self) -> None:
        """Equal scores for the only candidate pair produce no best pair."""
        tied = PairScore(score_a=0.4, score_b=0.4)
        judge = StubJudge({("prompt", "a", "b"): tied})

        outcome = _best_pair("prompt", ["a", "b"], judge=judge)

        assert outcome is None

    def test_resolve_pair_returns_none_for_ties(self) -> None:
        """A tied score cannot be resolved into chosen/rejected."""
        tied = PairScore(score_a=0.2, score_b=0.2)
        assert _resolve_pair("a", "b", tied) is None

    def test_first_line_truncates_long_text(self) -> None:
        """Over-long text is cut so that text plus ellipsis fits ``max_chars``."""
        long_text = "x" * 90
        shortened = _first_line(long_text, max_chars=20)
        # 19 kept characters + the one-character ellipsis = 20.
        assert shortened == ("x" * 19) + "…"