documentlanguagemodel Public

Watch 0 Fork 0 Star 0

Python · 9860 bytes Raw Blame History

  
        1
        """Unit tests for Sprint 42's pure preference-mining backend."""
      
        2
        
        3
        from __future__ import annotations
      
        4
        
        5
        from collections import deque
      
        6
        
        7
        import pytest
      
        8
        
        9
        from dlm.doc.parser import parse_text
      
        10
        from dlm.preference import (
      
        11
            PreferenceMineSkipReason,
      
        12
            build_mine_plan,
      
        13
            render_mine_plan,
      
        14
        )
      
        15
        from dlm.preference.judge import PairScore
      
        16
        from dlm.preference.mine import _best_pair, _first_line, _resolve_pair, _unique_nonempty
      
        17
        
        18
        _FRONTMATTER = """---
      
        19
        dlm_id: 01KPQ9X1000000000000000000
      
        20
        dlm_version: 14
      
        21
        base_model: smollm2-135m
      
        22
        ---
      
        23
        """
      
        24
        
        25
        
        26
        class StubBackend:
            """In-memory generation backend that replays scripted responses per prompt."""

            def __init__(self, responses: dict[str, list[str]]) -> None:
                # One FIFO queue per prompt, so repeated generate() calls walk the
                # scripted samples in the order they were listed.
                self._responses = {prompt: deque(items) for prompt, items in responses.items()}

            def generate(self, prompt: str, **_gen_kwargs: object) -> str:
                """Return the next scripted response for ``prompt``.

                Extra generation keyword arguments are accepted and ignored.

                Raises:
                    KeyError: ``prompt`` was never scripted.
                    IndexError: every scripted response for ``prompt`` was consumed.
                """
                return self._responses[prompt].popleft()
      
        32
        
        33
        
        34
        class StubJudge:
            """Judge double that returns pre-registered scores for exact pair lookups."""

            name = "stub:judge"
            suggested_threshold = 0.10

            def __init__(self, scores: dict[tuple[str, str, str], PairScore]) -> None:
                # Keyed by (prompt, candidate_a, candidate_b); argument order matters.
                self._scores = scores

            def score_pair(self, prompt: str, candidate_a: str, candidate_b: str) -> PairScore:
                """Return the registered score; raise ``KeyError`` for an unknown triple."""
                return self._scores[(prompt, candidate_a, candidate_b)]
      
        43
        
        44
        
        45
        def _parsed(body: str):
            """Parse ``body`` as a document by prefixing the shared test frontmatter."""
            return parse_text(_FRONTMATTER + body)
      
        47
        
        48
        
        49
        class TestBuildMinePlan:
            """Behavioural tests for ``build_mine_plan`` driven by stub backend and judge."""

            def test_validates_numeric_limits(self) -> None:
                """Each out-of-range numeric argument raises ValueError with its own message."""
                parsed = _parsed("::instruction::\n### Q\nquestion?\n### A\nreference\n")
                backend = StubBackend({"question?": ["one", "two"]})
                judge = StubJudge({("question?", "one", "two"): PairScore(score_a=1.0, score_b=0.0)})

                with pytest.raises(ValueError, match="samples must be >= 2"):
                    build_mine_plan(parsed, backend, judge, mined_run_id=1, samples=1)
                with pytest.raises(ValueError, match="max_pairs must be >= 1"):
                    build_mine_plan(parsed, backend, judge, mined_run_id=1, samples=2, max_pairs=0)
                with pytest.raises(ValueError, match="threshold must be >= 0.0"):
                    build_mine_plan(parsed, backend, judge, mined_run_id=1, samples=2, threshold=-0.1)
                with pytest.raises(ValueError, match="max_new_tokens must be >= 1"):
                    build_mine_plan(parsed, backend, judge, mined_run_id=1, samples=2, max_new_tokens=0)

            def test_materializes_auto_mined_preference_section(self) -> None:
                """A clear judge win yields one addition carrying the mining provenance."""
                parsed = _parsed("::instruction::\n### Q\nquestion?\n### A\nreference\n")
                backend = StubBackend({"question?": ["bad answer", "good answer"]})
                judge = StubJudge(
                    {
                        ("question?", "bad answer", "good answer"): PairScore(
                            score_a=0.1,
                            score_b=0.9,
                            reasoning="good answer wins clearly",
                        )
                    }
                )

                plan = build_mine_plan(
                    parsed,
                    backend,
                    judge,
                    mined_run_id=7,
                    samples=2,
                    mined_at="2026-04-23T20:00:00Z",
                )

                assert len(plan.additions) == 1
                assert plan.skipped == ()
                addition = plan.additions[0]
                # Candidate B scored higher, so it must come out as the chosen side.
                assert addition.chosen == "good answer"
                assert addition.rejected == "bad answer"
                assert addition.source.prompt == "question?"
                assert addition.section.auto_mined is True
                assert addition.section.judge_name == "stub:judge"
                assert addition.section.judge_score_chosen == 0.9
                assert addition.section.judge_score_rejected == 0.1
                assert addition.section.mined_at == "2026-04-23T20:00:00Z"
                assert addition.section.mined_run_id == 7
                assert addition.section.content == (
                    "### Prompt\nquestion?\n### Chosen\ngood answer\n### Rejected\nbad answer"
                )

            def test_probe_markers_are_normalized_and_duplicate_prompts_skip(self) -> None:
                """A ``!probe`` question and its plain twin yield one addition and one skip."""
                parsed = _parsed(
                    "::instruction::\n"
                    "### Q !probe\n"
                    "What is DGEMM?\n"
                    "### A\n"
                    "A matrix multiply.\n\n"
                    "::instruction::\n"
                    "### Q\n"
                    "What is DGEMM?\n"
                    "### A\n"
                    "Still a matrix multiply.\n"
                )
                # Only two responses are scripted: sampling the prompt a second time
                # would exhaust the stub's queue.
                backend = StubBackend({"What is DGEMM?": ["weak", "strong"]})
                judge = StubJudge(
                    {
                        ("What is DGEMM?", "weak", "strong"): PairScore(
                            score_a=0.2,
                            score_b=0.8,
                        )
                    }
                )

                plan = build_mine_plan(parsed, backend, judge, mined_run_id=3, samples=2)

                assert len(plan.additions) == 1
                assert len(plan.skipped) == 1
                assert plan.skipped[0].reason is PreferenceMineSkipReason.DUPLICATE_PROMPT
                assert plan.additions[0].source.prompt == "What is DGEMM?"

            def test_existing_preference_is_not_remined(self) -> None:
                """A prompt that already has a preference section is skipped, not re-mined."""
                parsed = _parsed(
                    "::instruction::\n"
                    "### Q\n"
                    "question?\n"
                    "### A\n"
                    "reference\n\n"
                    "::preference::\n"
                    "### Prompt\n"
                    "question?\n"
                    "### Chosen\n"
                    "good answer\n"
                    "### Rejected\n"
                    "bad answer\n"
                )
                backend = StubBackend({"question?": ["bad answer", "good answer"]})
                judge = StubJudge(
                    {
                        ("question?", "bad answer", "good answer"): PairScore(
                            score_a=0.2,
                            score_b=0.9,
                        )
                    }
                )

                plan = build_mine_plan(parsed, backend, judge, mined_run_id=5, samples=2)

                assert plan.additions == ()
                assert len(plan.skipped) == 1
                assert plan.skipped[0].reason is PreferenceMineSkipReason.ALREADY_PRESENT

            def test_near_identical_candidates_are_rejected(self) -> None:
                """Candidates differing only in final punctuation are skipped as too similar."""
                parsed = _parsed("::instruction::\n### Q\nquestion?\n### A\nreference\n")
                backend = StubBackend(
                    {
                        "question?": [
                            "This answer explains the algorithm step by step in a calm, direct tone.",
                            "This answer explains the algorithm step by step in a calm, direct tone!",
                        ]
                    }
                )
                # The judge has no registered scores, so any score_pair call would raise
                # KeyError: the skip has to happen without consulting the judge.
                judge = StubJudge({})

                plan = build_mine_plan(parsed, backend, judge, mined_run_id=9, samples=2)

                assert plan.additions == ()
                assert len(plan.skipped) == 1
                assert plan.skipped[0].reason is PreferenceMineSkipReason.TOO_SIMILAR

            def test_below_threshold_pairs_are_skipped_and_rendered(self) -> None:
                """A narrow score margin is skipped and the skip appears in the rendered plan."""
                parsed = _parsed("::instruction::\n### Q\nquestion?\n### A\nreference\n")
                backend = StubBackend({"question?": ["candidate one", "candidate two"]})
                # NOTE(review): the 0.04 margin is presumably measured against the judge's
                # suggested_threshold of 0.10 when no threshold is passed; confirm in mine.py.
                judge = StubJudge(
                    {
                        ("question?", "candidate one", "candidate two"): PairScore(
                            score_a=0.52,
                            score_b=0.48,
                        )
                    }
                )

                plan = build_mine_plan(parsed, backend, judge, mined_run_id=11, samples=2)
                rendered = render_mine_plan(plan)

                assert plan.additions == ()
                assert len(plan.skipped) == 1
                assert plan.skipped[0].reason is PreferenceMineSkipReason.BELOW_THRESHOLD
                assert "preference mine plan: 0 add, 1 skip" in rendered
                assert "below_threshold" in rendered

            def test_malformed_instruction_section_is_reported(self) -> None:
                """An instruction with a question but no answer is reported as malformed."""
                parsed = _parsed("::instruction::\n### Q\nunterminated question\n")
                # Empty stubs: any generate() or score_pair() call would raise KeyError.
                backend = StubBackend({})
                judge = StubJudge({})

                plan = build_mine_plan(parsed, backend, judge, mined_run_id=1, samples=2)

                assert plan.additions == ()
                assert len(plan.skipped) == 1
                assert plan.skipped[0].reason is PreferenceMineSkipReason.MALFORMED_INSTRUCTION

            def test_stops_collecting_once_max_pairs_is_reached(self) -> None:
                """With ``max_pairs=1`` only the first prompt produces an addition."""
                parsed = _parsed(
                    "::instruction::\n### Q\nquestion one?\n### A\nreference\n\n"
                    "::instruction::\n### Q\nquestion two?\n### A\nreference\n"
                )
                backend = StubBackend(
                    {
                        "question one?": ["bad one", "good one"],
                        "question two?": ["bad two", "good two"],
                    }
                )
                judge = StubJudge(
                    {
                        ("question one?", "bad one", "good one"): PairScore(score_a=0.1, score_b=0.9),
                        ("question two?", "bad two", "good two"): PairScore(score_a=0.1, score_b=0.9),
                    }
                )

                plan = build_mine_plan(parsed, backend, judge, mined_run_id=4, samples=2, max_pairs=1)

                assert len(plan.additions) == 1
                assert plan.additions[0].source.prompt == "question one?"

            def test_insufficient_variety_is_reported(self) -> None:
                """Samples that are all blank or the same text are skipped for lack of variety."""
                parsed = _parsed("::instruction::\n### Q\nquestion?\n### A\nreference\n")
                backend = StubBackend({"question?": [" same ", "", "same", "   "]})
                judge = StubJudge({})

                plan = build_mine_plan(parsed, backend, judge, mined_run_id=6, samples=4)

                assert plan.additions == ()
                assert len(plan.skipped) == 1
                assert plan.skipped[0].reason is PreferenceMineSkipReason.INSUFFICIENT_VARIETY
                assert "need at least 2 unique non-empty candidates" in plan.skipped[0].detail
      
        247
        
        248
        
        249
        class TestMineHelpers:
            """Direct tests for the private helpers imported from ``dlm.preference.mine``."""

            def test_unique_nonempty_strips_blanks_and_duplicates(self) -> None:
                """Blank entries are dropped and stripped duplicates collapse, keeping order."""
                assert _unique_nonempty(["", " alpha ", "alpha", "beta", "   "]) == ["alpha", "beta"]

            def test_best_pair_skips_ties(self) -> None:
                """A tied score for the only candidate pair yields no best pair."""
                judge = StubJudge({("prompt", "a", "b"): PairScore(score_a=0.4, score_b=0.4)})

                assert _best_pair("prompt", ["a", "b"], judge=judge) is None

            def test_resolve_pair_returns_none_for_ties(self) -> None:
                """Equal scores resolve to ``None``."""
                assert _resolve_pair("a", "b", PairScore(score_a=0.2, score_b=0.2)) is None

            def test_first_line_truncates_long_text(self) -> None:
                """Over-long text is cut to ``max_chars`` total, ending in an ellipsis."""
                rendered = _first_line("x" * 90, max_chars=20)
                assert rendered == ("x" * 19) + "…"

1	"""Unit tests for Sprint 42's pure preference-mining backend."""
2
3	from __future__ import annotations
4
5	from collections import deque
6
7	import pytest
8
9	from dlm.doc.parser import parse_text
10	from dlm.preference import (
11	PreferenceMineSkipReason,
12	build_mine_plan,
13	render_mine_plan,
14	)
15	from dlm.preference.judge import PairScore
16	from dlm.preference.mine import _best_pair, _first_line, _resolve_pair, _unique_nonempty
17
18	_FRONTMATTER = """---
19	dlm_id: 01KPQ9X1000000000000000000
20	dlm_version: 14
21	base_model: smollm2-135m
22	---
23	"""
24
25
class StubBackend:
    """In-memory generation backend that replays scripted responses per prompt."""

    def __init__(self, responses: dict[str, list[str]]) -> None:
        # One FIFO queue per prompt, so repeated generate() calls walk the
        # scripted samples in the order they were listed.
        self._responses = {prompt: deque(items) for prompt, items in responses.items()}

    def generate(self, prompt: str, **_gen_kwargs: object) -> str:
        """Return the next scripted response for ``prompt``.

        Extra generation keyword arguments are accepted and ignored.

        Raises:
            KeyError: ``prompt`` was never scripted.
            IndexError: every scripted response for ``prompt`` was consumed.
        """
        return self._responses[prompt].popleft()
32
33
class StubJudge:
    """Judge double that returns pre-registered scores for exact pair lookups."""

    name = "stub:judge"
    suggested_threshold = 0.10

    def __init__(self, scores: dict[tuple[str, str, str], PairScore]) -> None:
        # Keyed by (prompt, candidate_a, candidate_b); argument order matters.
        self._scores = scores

    def score_pair(self, prompt: str, candidate_a: str, candidate_b: str) -> PairScore:
        """Return the registered score; raise ``KeyError`` for an unknown triple."""
        return self._scores[(prompt, candidate_a, candidate_b)]
43
44
def _parsed(body: str):
    """Parse ``body`` as a document by prefixing the shared test frontmatter."""
    return parse_text(_FRONTMATTER + body)
47
48
class TestBuildMinePlan:
    """Behavioural tests for ``build_mine_plan`` driven by stub backend and judge."""

    def test_validates_numeric_limits(self) -> None:
        """Each out-of-range numeric argument raises ValueError with its own message."""
        parsed = _parsed("::instruction::\n### Q\nquestion?\n### A\nreference\n")
        backend = StubBackend({"question?": ["one", "two"]})
        judge = StubJudge({("question?", "one", "two"): PairScore(score_a=1.0, score_b=0.0)})

        with pytest.raises(ValueError, match="samples must be >= 2"):
            build_mine_plan(parsed, backend, judge, mined_run_id=1, samples=1)
        with pytest.raises(ValueError, match="max_pairs must be >= 1"):
            build_mine_plan(parsed, backend, judge, mined_run_id=1, samples=2, max_pairs=0)
        with pytest.raises(ValueError, match="threshold must be >= 0.0"):
            build_mine_plan(parsed, backend, judge, mined_run_id=1, samples=2, threshold=-0.1)
        with pytest.raises(ValueError, match="max_new_tokens must be >= 1"):
            build_mine_plan(parsed, backend, judge, mined_run_id=1, samples=2, max_new_tokens=0)

    def test_materializes_auto_mined_preference_section(self) -> None:
        """A clear judge win yields one addition carrying the mining provenance."""
        parsed = _parsed("::instruction::\n### Q\nquestion?\n### A\nreference\n")
        backend = StubBackend({"question?": ["bad answer", "good answer"]})
        judge = StubJudge(
            {
                ("question?", "bad answer", "good answer"): PairScore(
                    score_a=0.1,
                    score_b=0.9,
                    reasoning="good answer wins clearly",
                )
            }
        )

        plan = build_mine_plan(
            parsed,
            backend,
            judge,
            mined_run_id=7,
            samples=2,
            mined_at="2026-04-23T20:00:00Z",
        )

        assert len(plan.additions) == 1
        assert plan.skipped == ()
        addition = plan.additions[0]
        # Candidate B scored higher, so it must come out as the chosen side.
        assert addition.chosen == "good answer"
        assert addition.rejected == "bad answer"
        assert addition.source.prompt == "question?"
        assert addition.section.auto_mined is True
        assert addition.section.judge_name == "stub:judge"
        assert addition.section.judge_score_chosen == 0.9
        assert addition.section.judge_score_rejected == 0.1
        assert addition.section.mined_at == "2026-04-23T20:00:00Z"
        assert addition.section.mined_run_id == 7
        assert addition.section.content == (
            "### Prompt\nquestion?\n### Chosen\ngood answer\n### Rejected\nbad answer"
        )

    def test_probe_markers_are_normalized_and_duplicate_prompts_skip(self) -> None:
        """A ``!probe`` question and its plain twin yield one addition and one skip."""
        parsed = _parsed(
            "::instruction::\n"
            "### Q !probe\n"
            "What is DGEMM?\n"
            "### A\n"
            "A matrix multiply.\n\n"
            "::instruction::\n"
            "### Q\n"
            "What is DGEMM?\n"
            "### A\n"
            "Still a matrix multiply.\n"
        )
        # Only two responses are scripted: sampling the prompt a second time
        # would exhaust the stub's queue.
        backend = StubBackend({"What is DGEMM?": ["weak", "strong"]})
        judge = StubJudge(
            {
                ("What is DGEMM?", "weak", "strong"): PairScore(
                    score_a=0.2,
                    score_b=0.8,
                )
            }
        )

        plan = build_mine_plan(parsed, backend, judge, mined_run_id=3, samples=2)

        assert len(plan.additions) == 1
        assert len(plan.skipped) == 1
        assert plan.skipped[0].reason is PreferenceMineSkipReason.DUPLICATE_PROMPT
        assert plan.additions[0].source.prompt == "What is DGEMM?"

    def test_existing_preference_is_not_remined(self) -> None:
        """A prompt that already has a preference section is skipped, not re-mined."""
        parsed = _parsed(
            "::instruction::\n"
            "### Q\n"
            "question?\n"
            "### A\n"
            "reference\n\n"
            "::preference::\n"
            "### Prompt\n"
            "question?\n"
            "### Chosen\n"
            "good answer\n"
            "### Rejected\n"
            "bad answer\n"
        )
        backend = StubBackend({"question?": ["bad answer", "good answer"]})
        judge = StubJudge(
            {
                ("question?", "bad answer", "good answer"): PairScore(
                    score_a=0.2,
                    score_b=0.9,
                )
            }
        )

        plan = build_mine_plan(parsed, backend, judge, mined_run_id=5, samples=2)

        assert plan.additions == ()
        assert len(plan.skipped) == 1
        assert plan.skipped[0].reason is PreferenceMineSkipReason.ALREADY_PRESENT

    def test_near_identical_candidates_are_rejected(self) -> None:
        """Candidates differing only in final punctuation are skipped as too similar."""
        parsed = _parsed("::instruction::\n### Q\nquestion?\n### A\nreference\n")
        backend = StubBackend(
            {
                "question?": [
                    "This answer explains the algorithm step by step in a calm, direct tone.",
                    "This answer explains the algorithm step by step in a calm, direct tone!",
                ]
            }
        )
        # The judge has no registered scores, so any score_pair call would raise
        # KeyError: the skip has to happen without consulting the judge.
        judge = StubJudge({})

        plan = build_mine_plan(parsed, backend, judge, mined_run_id=9, samples=2)

        assert plan.additions == ()
        assert len(plan.skipped) == 1
        assert plan.skipped[0].reason is PreferenceMineSkipReason.TOO_SIMILAR

    def test_below_threshold_pairs_are_skipped_and_rendered(self) -> None:
        """A narrow score margin is skipped and the skip appears in the rendered plan."""
        parsed = _parsed("::instruction::\n### Q\nquestion?\n### A\nreference\n")
        backend = StubBackend({"question?": ["candidate one", "candidate two"]})
        # NOTE(review): the 0.04 margin is presumably measured against the judge's
        # suggested_threshold of 0.10 when no threshold is passed; confirm in mine.py.
        judge = StubJudge(
            {
                ("question?", "candidate one", "candidate two"): PairScore(
                    score_a=0.52,
                    score_b=0.48,
                )
            }
        )

        plan = build_mine_plan(parsed, backend, judge, mined_run_id=11, samples=2)
        rendered = render_mine_plan(plan)

        assert plan.additions == ()
        assert len(plan.skipped) == 1
        assert plan.skipped[0].reason is PreferenceMineSkipReason.BELOW_THRESHOLD
        assert "preference mine plan: 0 add, 1 skip" in rendered
        assert "below_threshold" in rendered

    def test_malformed_instruction_section_is_reported(self) -> None:
        """An instruction with a question but no answer is reported as malformed."""
        parsed = _parsed("::instruction::\n### Q\nunterminated question\n")
        # Empty stubs: any generate() or score_pair() call would raise KeyError.
        backend = StubBackend({})
        judge = StubJudge({})

        plan = build_mine_plan(parsed, backend, judge, mined_run_id=1, samples=2)

        assert plan.additions == ()
        assert len(plan.skipped) == 1
        assert plan.skipped[0].reason is PreferenceMineSkipReason.MALFORMED_INSTRUCTION

    def test_stops_collecting_once_max_pairs_is_reached(self) -> None:
        """With ``max_pairs=1`` only the first prompt produces an addition."""
        parsed = _parsed(
            "::instruction::\n### Q\nquestion one?\n### A\nreference\n\n"
            "::instruction::\n### Q\nquestion two?\n### A\nreference\n"
        )
        backend = StubBackend(
            {
                "question one?": ["bad one", "good one"],
                "question two?": ["bad two", "good two"],
            }
        )
        judge = StubJudge(
            {
                ("question one?", "bad one", "good one"): PairScore(score_a=0.1, score_b=0.9),
                ("question two?", "bad two", "good two"): PairScore(score_a=0.1, score_b=0.9),
            }
        )

        plan = build_mine_plan(parsed, backend, judge, mined_run_id=4, samples=2, max_pairs=1)

        assert len(plan.additions) == 1
        assert plan.additions[0].source.prompt == "question one?"

    def test_insufficient_variety_is_reported(self) -> None:
        """Samples that are all blank or the same text are skipped for lack of variety."""
        parsed = _parsed("::instruction::\n### Q\nquestion?\n### A\nreference\n")
        backend = StubBackend({"question?": [" same ", "", "same", " "]})
        judge = StubJudge({})

        plan = build_mine_plan(parsed, backend, judge, mined_run_id=6, samples=4)

        assert plan.additions == ()
        assert len(plan.skipped) == 1
        assert plan.skipped[0].reason is PreferenceMineSkipReason.INSUFFICIENT_VARIETY
        assert "need at least 2 unique non-empty candidates" in plan.skipped[0].detail
247
248
class TestMineHelpers:
    """Direct tests for the private helpers imported from ``dlm.preference.mine``."""

    def test_unique_nonempty_strips_blanks_and_duplicates(self) -> None:
        """Blank entries are dropped and stripped duplicates collapse, keeping order."""
        assert _unique_nonempty(["", " alpha ", "alpha", "beta", " "]) == ["alpha", "beta"]

    def test_best_pair_skips_ties(self) -> None:
        """A tied score for the only candidate pair yields no best pair."""
        judge = StubJudge({("prompt", "a", "b"): PairScore(score_a=0.4, score_b=0.4)})

        assert _best_pair("prompt", ["a", "b"], judge=judge) is None

    def test_resolve_pair_returns_none_for_ties(self) -> None:
        """Equal scores resolve to ``None``."""
        assert _resolve_pair("a", "b", PairScore(score_a=0.2, score_b=0.2)) is None

    def test_first_line_truncates_long_text(self) -> None:
        """Over-long text is cut to ``max_chars`` total, ending in an ellipsis."""
        rendered = _first_line("x" * 90, max_chars=20)
        assert rendered == ("x" * 19) + "…"