| 1 |
"""Unit tests for Sprint 42's pure preference-mining backend.""" |
| 2 |
|
| 3 |
from __future__ import annotations |
| 4 |
|
| 5 |
from collections import deque |
| 6 |
|
| 7 |
import pytest |
| 8 |
|
| 9 |
from dlm.doc.parser import parse_text |
| 10 |
from dlm.preference import ( |
| 11 |
PreferenceMineSkipReason, |
| 12 |
build_mine_plan, |
| 13 |
render_mine_plan, |
| 14 |
) |
| 15 |
from dlm.preference.judge import PairScore |
| 16 |
from dlm.preference.mine import _best_pair, _first_line, _resolve_pair, _unique_nonempty |
| 17 |
|
| 18 |
_FRONTMATTER = """--- |
| 19 |
dlm_id: 01KPQ9X1000000000000000000 |
| 20 |
dlm_version: 14 |
| 21 |
base_model: smollm2-135m |
| 22 |
--- |
| 23 |
""" |
| 24 |
|
| 25 |
|
| 26 |
class StubBackend:
    """Generation backend double that replays canned completions per prompt."""

    def __init__(self, responses: dict[str, list[str]]) -> None:
        """Copy each prompt's scripted completions into its own FIFO queue."""
        queues = {}
        for prompt, items in responses.items():
            queues[prompt] = deque(items)
        self._responses = queues

    def generate(self, prompt: str, **_gen_kwargs: object) -> str:
        """Return the next scripted completion for ``prompt``.

        Extra generation keyword arguments are accepted and ignored. An
        unscripted prompt raises ``KeyError``; an exhausted queue raises
        ``IndexError``.
        """
        pending = self._responses[prompt]
        return pending.popleft()
| 32 |
|
| 33 |
|
| 34 |
class StubJudge:
    """Judge double that answers from a fixed ``(prompt, a, b)`` score table."""

    # Class-level attributes of the stub. NOTE(review): presumably the miner
    # reads these to label mined sections and to pick a default threshold --
    # confirm against dlm.preference.mine.
    name = "stub:judge"
    suggested_threshold = 0.10

    def __init__(self, scores: dict[tuple[str, str, str], PairScore]) -> None:
        """Keep the table of scripted verdicts keyed by ordered triple."""
        self._scores = scores

    def score_pair(self, prompt: str, candidate_a: str, candidate_b: str) -> PairScore:
        """Return the scripted score for this exact ordered triple.

        Raises ``KeyError`` when the triple was not scripted.
        """
        lookup_key = (prompt, candidate_a, candidate_b)
        return self._scores[lookup_key]
| 43 |
|
| 44 |
|
| 45 |
def _parsed(body: str):
    """Parse ``body`` as a document prefixed with the shared test frontmatter."""
    document_text = _FRONTMATTER + body
    return parse_text(document_text)
| 47 |
|
| 48 |
|
| 49 |
class TestBuildMinePlan:
    """Tests for ``build_mine_plan`` driven by a stub backend and a stub judge."""

    def test_validates_numeric_limits(self) -> None:
        """Each out-of-range numeric argument is rejected with ``ValueError``."""
        parsed = _parsed("::instruction::\n### Q\nquestion?\n### A\nreference\n")
        backend = StubBackend({"question?": ["one", "two"]})
        judge = StubJudge({("question?", "one", "two"): PairScore(score_a=1.0, score_b=0.0)})

        with pytest.raises(ValueError, match="samples must be >= 2"):
            build_mine_plan(parsed, backend, judge, mined_run_id=1, samples=1)
        with pytest.raises(ValueError, match="max_pairs must be >= 1"):
            build_mine_plan(parsed, backend, judge, mined_run_id=1, samples=2, max_pairs=0)
        # ``match`` is applied as a regular expression, so the dot in "0.0"
        # is escaped to require the literal message text.
        with pytest.raises(ValueError, match=r"threshold must be >= 0\.0"):
            build_mine_plan(parsed, backend, judge, mined_run_id=1, samples=2, threshold=-0.1)
        with pytest.raises(ValueError, match="max_new_tokens must be >= 1"):
            build_mine_plan(parsed, backend, judge, mined_run_id=1, samples=2, max_new_tokens=0)

    def test_materializes_auto_mined_preference_section(self) -> None:
        """A clear judge win yields one addition carrying full mining metadata."""
        parsed = _parsed("::instruction::\n### Q\nquestion?\n### A\nreference\n")
        backend = StubBackend({"question?": ["bad answer", "good answer"]})
        judge = StubJudge(
            {
                ("question?", "bad answer", "good answer"): PairScore(
                    score_a=0.1,
                    score_b=0.9,
                    reasoning="good answer wins clearly",
                )
            }
        )

        plan = build_mine_plan(
            parsed,
            backend,
            judge,
            mined_run_id=7,
            samples=2,
            mined_at="2026-04-23T20:00:00Z",
        )

        assert len(plan.additions) == 1
        assert plan.skipped == ()
        addition = plan.additions[0]
        # The higher-scored candidate (score_b) must become "chosen".
        assert addition.chosen == "good answer"
        assert addition.rejected == "bad answer"
        assert addition.source.prompt == "question?"
        assert addition.section.auto_mined is True
        assert addition.section.judge_name == "stub:judge"
        assert addition.section.judge_score_chosen == 0.9
        assert addition.section.judge_score_rejected == 0.1
        assert addition.section.mined_at == "2026-04-23T20:00:00Z"
        assert addition.section.mined_run_id == 7
        assert addition.section.content == (
            "### Prompt\nquestion?\n### Chosen\ngood answer\n### Rejected\nbad answer"
        )

    def test_probe_markers_are_normalized_and_duplicate_prompts_skip(self) -> None:
        """A ``!probe``-marked question and its plain twin count as one prompt."""
        parsed = _parsed(
            "::instruction::\n"
            "### Q !probe\n"
            "What is DGEMM?\n"
            "### A\n"
            "A matrix multiply.\n\n"
            "::instruction::\n"
            "### Q\n"
            "What is DGEMM?\n"
            "### A\n"
            "Still a matrix multiply.\n"
        )
        # Only two completions are scripted, so the duplicate prompt cannot
        # be sampled a second time without the stub raising.
        backend = StubBackend({"What is DGEMM?": ["weak", "strong"]})
        judge = StubJudge(
            {
                ("What is DGEMM?", "weak", "strong"): PairScore(
                    score_a=0.2,
                    score_b=0.8,
                )
            }
        )

        plan = build_mine_plan(parsed, backend, judge, mined_run_id=3, samples=2)

        assert len(plan.additions) == 1
        assert len(plan.skipped) == 1
        assert plan.skipped[0].reason is PreferenceMineSkipReason.DUPLICATE_PROMPT
        assert plan.additions[0].source.prompt == "What is DGEMM?"

    def test_existing_preference_is_not_remined(self) -> None:
        """A prompt that already has a preference section is skipped."""
        parsed = _parsed(
            "::instruction::\n"
            "### Q\n"
            "question?\n"
            "### A\n"
            "reference\n\n"
            "::preference::\n"
            "### Prompt\n"
            "question?\n"
            "### Chosen\n"
            "good answer\n"
            "### Rejected\n"
            "bad answer\n"
        )
        backend = StubBackend({"question?": ["bad answer", "good answer"]})
        judge = StubJudge(
            {
                ("question?", "bad answer", "good answer"): PairScore(
                    score_a=0.2,
                    score_b=0.9,
                )
            }
        )

        plan = build_mine_plan(parsed, backend, judge, mined_run_id=5, samples=2)

        assert plan.additions == ()
        assert len(plan.skipped) == 1
        assert plan.skipped[0].reason is PreferenceMineSkipReason.ALREADY_PRESENT

    def test_near_identical_candidates_are_rejected(self) -> None:
        """Candidates differing only in final punctuation are too similar to mine."""
        parsed = _parsed("::instruction::\n### Q\nquestion?\n### A\nreference\n")
        backend = StubBackend(
            {
                "question?": [
                    "This answer explains the algorithm step by step in a calm, direct tone.",
                    "This answer explains the algorithm step by step in a calm, direct tone!",
                ]
            }
        )
        # Empty score table: any call to the judge would raise KeyError.
        judge = StubJudge({})

        plan = build_mine_plan(parsed, backend, judge, mined_run_id=9, samples=2)

        assert plan.additions == ()
        assert len(plan.skipped) == 1
        assert plan.skipped[0].reason is PreferenceMineSkipReason.TOO_SIMILAR

    def test_below_threshold_pairs_are_skipped_and_rendered(self) -> None:
        """A narrow score margin is skipped and shows up in the rendered plan."""
        parsed = _parsed("::instruction::\n### Q\nquestion?\n### A\nreference\n")
        backend = StubBackend({"question?": ["candidate one", "candidate two"]})
        judge = StubJudge(
            {
                ("question?", "candidate one", "candidate two"): PairScore(
                    score_a=0.52,
                    score_b=0.48,
                )
            }
        )

        plan = build_mine_plan(parsed, backend, judge, mined_run_id=11, samples=2)
        rendered = render_mine_plan(plan)

        assert plan.additions == ()
        assert len(plan.skipped) == 1
        assert plan.skipped[0].reason is PreferenceMineSkipReason.BELOW_THRESHOLD
        assert "preference mine plan: 0 add, 1 skip" in rendered
        assert "below_threshold" in rendered

    def test_malformed_instruction_section_is_reported(self) -> None:
        """An instruction section without an answer is reported, not raised."""
        parsed = _parsed("::instruction::\n### Q\nunterminated question\n")
        backend = StubBackend({})
        judge = StubJudge({})

        plan = build_mine_plan(parsed, backend, judge, mined_run_id=1, samples=2)

        assert plan.additions == ()
        assert len(plan.skipped) == 1
        assert plan.skipped[0].reason is PreferenceMineSkipReason.MALFORMED_INSTRUCTION

    def test_stops_collecting_once_max_pairs_is_reached(self) -> None:
        """With ``max_pairs=1`` only the first minable prompt is added."""
        parsed = _parsed(
            "::instruction::\n### Q\nquestion one?\n### A\nreference\n\n"
            "::instruction::\n### Q\nquestion two?\n### A\nreference\n"
        )
        backend = StubBackend(
            {
                "question one?": ["bad one", "good one"],
                "question two?": ["bad two", "good two"],
            }
        )
        judge = StubJudge(
            {
                ("question one?", "bad one", "good one"): PairScore(score_a=0.1, score_b=0.9),
                ("question two?", "bad two", "good two"): PairScore(score_a=0.1, score_b=0.9),
            }
        )

        plan = build_mine_plan(parsed, backend, judge, mined_run_id=4, samples=2, max_pairs=1)

        assert len(plan.additions) == 1
        assert plan.additions[0].source.prompt == "question one?"

    def test_insufficient_variety_is_reported(self) -> None:
        """Blank and repeated samples leave too few unique candidates."""
        parsed = _parsed("::instruction::\n### Q\nquestion?\n### A\nreference\n")
        backend = StubBackend({"question?": [" same ", "", "same", " "]})
        judge = StubJudge({})

        plan = build_mine_plan(parsed, backend, judge, mined_run_id=6, samples=4)

        assert plan.additions == ()
        assert len(plan.skipped) == 1
        assert plan.skipped[0].reason is PreferenceMineSkipReason.INSUFFICIENT_VARIETY
        assert "need at least 2 unique non-empty candidates" in plan.skipped[0].detail
| 247 |
|
| 248 |
|
| 249 |
class TestMineHelpers:
    """Direct tests for the private helpers imported from ``dlm.preference.mine``."""

    def test_unique_nonempty_strips_blanks_and_duplicates(self) -> None:
        """Blank entries vanish and stripped duplicates collapse, order kept."""
        raw_candidates = ["", " alpha ", "alpha", "beta", " "]
        cleaned = _unique_nonempty(raw_candidates)
        assert cleaned == ["alpha", "beta"]

    def test_best_pair_skips_ties(self) -> None:
        """Equal scores for the only pair mean no best pair is returned."""
        tied = PairScore(score_a=0.4, score_b=0.4)
        judge = StubJudge({("prompt", "a", "b"): tied})

        winner = _best_pair("prompt", ["a", "b"], judge=judge)

        assert winner is None

    def test_resolve_pair_returns_none_for_ties(self) -> None:
        """A tied score cannot be resolved into chosen and rejected."""
        tied = PairScore(score_a=0.2, score_b=0.2)
        resolved = _resolve_pair("a", "b", tied)
        assert resolved is None

    def test_first_line_truncates_long_text(self) -> None:
        """Over-long text is cut to ``max_chars`` including the ellipsis."""
        long_text = "x" * 90
        expected = ("x" * 19) + "…"
        rendered = _first_line(long_text, max_chars=20)
        assert rendered == expected