Python · 16054 bytes Raw Blame History
1 """Parser + serializer integration: parse, tokenize, round-trip, encoding."""
2
3 from __future__ import annotations
4
5 from pathlib import Path
6
7 import pytest
8 import yaml
9
10 from dlm.doc.errors import (
11 DlmVersionError,
12 FenceError,
13 FrontmatterError,
14 SchemaValidationError,
15 )
16 from dlm.doc.parser import (
17 ParsedDlm,
18 _parse_auto_mined_marker,
19 _parse_auto_synth_marker,
20 _yaml_error_location,
21 parse_file,
22 parse_text,
23 )
24 from dlm.doc.sections import SectionType
25 from dlm.doc.serializer import serialize
26 from dlm.io.text import DlmEncodingError
27 from tests.fixtures.dlm_factory import instruction, make_dlm, preference, prose
28
29 VALID_ULID = "01HZ4X7TGZM3J1A2B3C4D5E6F7"
30
31
class TestParseValidDocuments:
    """Happy-path parsing: well-formed documents yield the expected sections."""

    def test_factory_default_parses(self) -> None:
        """The factory's default document parses and exposes its frontmatter."""
        result = parse_text(make_dlm(dlm_id=VALID_ULID))
        assert isinstance(result, ParsedDlm)
        frontmatter = result.frontmatter
        assert frontmatter.dlm_id == VALID_ULID
        assert frontmatter.base_model == "smollm2-135m"
        assert len(result.sections) >= 1

    def test_prose_only_document(self) -> None:
        """A single prose body becomes exactly one PROSE section."""
        document = make_dlm(
            dlm_id=VALID_ULID,
            sections=[prose("# intro\n\nJust text here.\n")],
        )
        sections = parse_text(document).sections
        assert len(sections) == 1
        only = sections[0]
        assert only.type == SectionType.PROSE
        assert "Just text here." in only.content

    def test_instruction_section_roundtrips_content(self) -> None:
        """Two Q/A pairs in one instruction body parse as one INSTRUCTION section."""
        document = make_dlm(
            dlm_id=VALID_ULID,
            sections=[instruction(("Q1?", "A1."), ("Q2?", "A2."))],
        )
        sections = parse_text(document).sections
        assert len(sections) == 1
        assert sections[0].type == SectionType.INSTRUCTION

    def test_preference_section_accepted_in_v1(self) -> None:
        """A preference triple parses as a single PREFERENCE section."""
        document = make_dlm(
            dlm_id=VALID_ULID,
            sections=[preference(("p", "c", "r"))],
        )
        sections = parse_text(document).sections
        assert len(sections) == 1
        assert sections[0].type == SectionType.PREFERENCE

    def test_mixed_sections_preserve_order(self) -> None:
        """Section types come back in the same order they were authored."""
        document = make_dlm(
            dlm_id=VALID_ULID,
            sections=[
                prose("intro.\n"),
                instruction(("q", "a")),
                preference(("p", "c", "r")),
            ],
        )
        expected_order = [
            SectionType.PROSE,
            SectionType.INSTRUCTION,
            SectionType.PREFERENCE,
        ]
        observed_order = [section.type for section in parse_text(document).sections]
        assert observed_order == expected_order
85
86
class TestFrontmatterErrors:
    """Malformed frontmatter is rejected with the matching error type."""

    def test_missing_opening_delimiter(self) -> None:
        """Text without a leading delimiter fails and reports line 1."""
        with pytest.raises(FrontmatterError) as caught:
            parse_text("no frontmatter here\n")
        # Checked after the block exits: statements following the raising
        # call inside the `with` body would never run.
        assert caught.value.line == 1

    def test_missing_closing_delimiter(self) -> None:
        """An opened but never closed frontmatter block is rejected."""
        unterminated = "---\ndlm_id: " + VALID_ULID + "\n"
        with pytest.raises(FrontmatterError, match="no closing"):
            parse_text(unterminated)

    def test_non_mapping_yaml(self) -> None:
        """A YAML list where a mapping is required is rejected."""
        list_frontmatter = "---\n- just\n- a\n- list\n---\n"
        with pytest.raises(FrontmatterError, match="must be a mapping"):
            parse_text(list_frontmatter)

    def test_unknown_top_level_key_reports_schema_error(self) -> None:
        """An unexpected key is named in the schema validation error."""
        with_extra_key = (
            f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\nsurprise: 1\n---\n"
        )
        with pytest.raises(SchemaValidationError, match="surprise"):
            parse_text(with_extra_key)

    def test_invalid_yaml_surfaces_location(self) -> None:
        """Syntactically broken YAML is reported as a frontmatter error."""
        broken_yaml = "---\ndlm_id: [unclosed\n---\n"
        with pytest.raises(FrontmatterError, match="invalid YAML"):
            parse_text(broken_yaml)

    def test_yaml_error_without_marks_returns_zero_location(self) -> None:
        """A bare YAMLError (no position marks) maps to the (0, 0) location."""
        markless_error = yaml.YAMLError("plain boom")
        location = _yaml_error_location(markless_error)
        assert location == (0, 0)
116
117
class TestVersionGating:
    """The parser refuses documents whose `dlm_version` it cannot handle."""

    def test_future_version_refused(self) -> None:
        """A `dlm_version` far above the current schema raises DlmVersionError."""
        text = f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\ndlm_version: 999\n---\n"
        with pytest.raises(DlmVersionError, match="newer than this parser"):
            parse_text(text)

    def test_sub_current_version_without_migrator_refuses(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """Sprint 12b: sub-CURRENT documents route through the migration
        dispatcher; a gap in the `MIGRATORS` registry raises
        `UnsupportedMigrationError` (subclass of `DlmVersionError`).

        Simulate CURRENT=2 with no v1 migrator registered; the dispatcher
        refuses to silently accept a v1 dict. The coverage test in
        `tests/unit/doc/test_migrations.py` is the static enforcement;
        this is the runtime gate.

        The schema version is bumped by one through the `monkeypatch`
        fixture, so the module global is restored at test teardown even
        when the assertion fails.
        """
        from dlm.doc import versioned as versioned_module
        from dlm.doc.errors import UnsupportedMigrationError

        text = f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\ndlm_version: 1\n---\n"
        # Raise CURRENT one step above its real value so the v1 document
        # above is strictly below it.
        monkeypatch.setattr(
            versioned_module,
            "CURRENT_SCHEMA_VERSION",
            versioned_module.CURRENT_SCHEMA_VERSION + 1,
        )
        with pytest.raises(UnsupportedMigrationError, match="no migrator"):
            parse_text(text)
145
146
class TestFenceGrammar:
    """Section-fence recognition, code-block shielding, and blank-prose elision."""

    def test_unknown_fence_raises(self) -> None:
        """A fence with an unrecognised name is rejected."""
        valid_document = make_dlm(
            dlm_id=VALID_ULID,
            sections=[prose("before fence\n")],
        )
        # Tack a bogus fence onto otherwise parseable content.
        with_bogus_fence = valid_document + "\n::weird::\n\nbody\n"
        with pytest.raises(FenceError, match="unknown section fence"):
            parse_text(with_bogus_fence)

    def test_fence_inside_code_block_is_literal(self) -> None:
        """Fence syntax inside a ``` block stays part of the prose."""
        body = (
            "Some prose.\n"
            "\n"
            "```\n"
            "::instruction::\n"  # should NOT be parsed as a fence
            "```\n"
        )
        sections = parse_text(make_dlm(dlm_id=VALID_ULID, sections=[prose(body)])).sections
        assert len(sections) == 1
        assert sections[0].type == SectionType.PROSE

    def test_unterminated_code_block_raises(self) -> None:
        """A ``` block that is never closed is rejected."""
        # Hand-written document: the factory would close the code block.
        never_closed = (
            f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\n---\n\n```\nforever open\n"
        )
        with pytest.raises(FenceError, match="unterminated"):
            parse_text(never_closed)

    def test_whitespace_only_prose_between_fences_is_elided(self) -> None:
        """Blank lines between two fenced sections do not create a prose section."""
        document = (
            f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\n---\n\n"
            "::instruction::\n"
            "### Q\n"
            "q\n"
            "### A\n"
            "a\n"
            "\n \n\t\n\n"
            "::preference::\n"
            "### Prompt\n"
            "p\n"
            "### Chosen\n"
            "c\n"
            "### Rejected\n"
            "r\n"
        )
        observed = [s.type for s in parse_text(document).sections]
        assert observed == [
            SectionType.INSTRUCTION,
            SectionType.PREFERENCE,
        ]

    def test_whitespace_only_trailing_prose_is_elided(self) -> None:
        """Trailing whitespace-only lines do not create a prose section."""
        document = (
            f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\n---\n\n"
            "::instruction::\n"
            "### Q\n"
            "q\n"
            "### A\n"
            "a\n"
            "\n"
            " \n"
            "\t\n"
        )
        observed = [s.type for s in parse_text(document).sections]
        assert observed == [SectionType.INSTRUCTION]

    def test_whitespace_only_prose_before_first_fence_is_elided(self) -> None:
        """Leading whitespace-only lines do not create a prose section."""
        document = (
            f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\n---\n\n"
            " \n"
            "\t\n"
            "::instruction::\n"
            "### Q\n"
            "q\n"
            "### A\n"
            "a\n"
        )
        observed = [s.type for s in parse_text(document).sections]
        assert observed == [SectionType.INSTRUCTION]

    def test_unknown_attribute_fence_raises(self) -> None:
        """An attribute-style fence with an unrecognised name is rejected."""
        document = (
            f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\n---\n\n"
            '::widget path="image.png"::\n'
        )
        with pytest.raises(FenceError, match="unknown attribute fence"):
            parse_text(document)

    def test_non_attribute_fence_rejects_attribute_form(self) -> None:
        """Attributes on the `instruction` fence are rejected."""
        document = (
            f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\n---\n\n"
            '::instruction path="nope"::\n'
        )
        with pytest.raises(FenceError, match="does not take attributes"):
            parse_text(document)
244
245
class TestParsedDlmImmutability:
    """The parsed result exposes its sections as an immutable sequence type."""

    def test_sections_is_tuple(self) -> None:
        """`sections` is a tuple rather than a list."""
        sections = parse_text(make_dlm(dlm_id=VALID_ULID)).sections
        assert isinstance(sections, tuple)
250
251
class TestRoundTrip:
    """parse -> serialize -> parse keeps the document stable."""

    def test_parse_then_serialize_then_parse_matches(self) -> None:
        """Frontmatter, section types, and section content survive a round trip."""
        source = make_dlm(
            dlm_id=VALID_ULID,
            sections=[
                prose("# Heading\n\nParagraph.\n"),
                instruction(("What is 2+2?", "4."), ("And 3+3?", "6.")),
                preference(("Greet.", "Hi!", "hey.")),
            ],
            training_overrides={"lora_r": 16, "num_epochs": 2},
            system_prompt="You are helpful.\nAlways be concise.",
        )
        before = parse_text(source)
        after = parse_text(serialize(before))

        # Frontmatter must compare equal as a whole.
        assert before.frontmatter == after.frontmatter
        # Sections must agree position by position on both type and content.
        assert [sec.type for sec in before.sections] == [sec.type for sec in after.sections]
        assert [sec.content for sec in before.sections] == [
            sec.content for sec in after.sections
        ]

    def test_second_pass_is_byte_identical(self) -> None:
        """Serializing an already-serialized document changes nothing."""
        source = make_dlm(
            dlm_id=VALID_ULID,
            sections=[prose("content\n"), instruction(("q", "a"))],
        )
        first_pass = serialize(parse_text(source))
        second_pass = serialize(parse_text(first_pass))
        assert first_pass == second_pass  # idempotent
281
282
class TestSectionIdStabilityUnderEdits:
    """Whitespace-only edits in one section must not change another section's ID."""

    def test_editing_one_prose_does_not_change_other_ids(self) -> None:
        """The instruction section's ID ignores whitespace edits to the prose before it."""

        def instruction_section_after(prose_body):
            # Same instruction pair each time; only the preceding prose varies.
            document = make_dlm(
                dlm_id=VALID_ULID,
                sections=[
                    prose(prose_body),
                    instruction(("stable q", "stable a")),
                ],
            )
            return parse_text(document).sections[1]

        untouched = instruction_section_after("first section\n")
        edited = instruction_section_after("first section \n")  # trailing spaces added
        assert untouched.section_id == edited.section_id
304
305
class TestEncodingContract:
    """Audit F15: UTF-8 strict, BOM strip, CRLF normalization."""

    def test_parse_file_strips_bom(self, tmp_path: Path) -> None:
        """A file that starts with a UTF-8 BOM still parses."""
        payload = b"\xef\xbb\xbf" + make_dlm(dlm_id=VALID_ULID).encode("utf-8")
        target = tmp_path / "with_bom.dlm"
        target.write_bytes(payload)
        assert parse_file(target).frontmatter.dlm_id == VALID_ULID

    def test_parse_file_normalizes_crlf(self, tmp_path: Path) -> None:
        """A file written with CRLF line endings still parses."""
        crlf_text = make_dlm(dlm_id=VALID_ULID).replace("\n", "\r\n")
        target = tmp_path / "crlf.dlm"
        target.write_bytes(crlf_text.encode("utf-8"))
        assert parse_file(target).frontmatter.dlm_id == VALID_ULID

    def test_invalid_utf8_raises_encoding_error(self, tmp_path: Path) -> None:
        """An invalid byte raises DlmEncodingError carrying its offset and the path."""
        target = tmp_path / "bad.dlm"
        target.write_bytes(b"---\nbase_model: \xff\n---\n")
        with pytest.raises(DlmEncodingError) as caught:
            parse_file(target)
        # Audit-02 minor: the parser path must surface byte_offset
        # end-to-end (not just the io-layer test).
        error = caught.value
        assert error.byte_offset == len("---\nbase_model: ")
        assert error.path == target

    def test_crlf_and_lf_produce_identical_section_ids(self, tmp_path: Path) -> None:
        """Windows and Unix edits of the same content must hash-identically."""
        lf_text = make_dlm(
            dlm_id=VALID_ULID,
            sections=[prose("body line 1\nbody line 2\n")],
        )
        lf_file = tmp_path / "lf.dlm"
        lf_file.write_bytes(lf_text.encode("utf-8"))
        crlf_file = tmp_path / "crlf.dlm"
        crlf_file.write_bytes(lf_text.replace("\n", "\r\n").encode("utf-8"))

        ids_from_lf = [sec.section_id for sec in parse_file(lf_file).sections]
        ids_from_crlf = [sec.section_id for sec in parse_file(crlf_file).sections]
        assert ids_from_lf == ids_from_crlf
348
349
class TestAutoMarkerValidation:
    """Each malformed auto-marker attribute blob raises FenceError with a matching message."""

    # (attribute blob, pattern expected in the error message) for the
    # dlm-auto-mined marker parser.
    _AUTO_MINED_CASES = [
        (' judge_name="sway', "invalid dlm-auto-mined marker syntax"),
        (
            ' judge_name="sway" judge_name="other" '
            'judge_score_chosen="1.0" judge_score_rejected="0.5" '
            'mined_at="2026-04-24T00:00:00Z" mined_run_id="1"',
            "repeats attribute",
        ),
        (
            ' judge_name="sway" judge_score_chosen="1.0" judge_score_rejected="0.5" '
            'mined_at="2026-04-24T00:00:00Z" mined_run_id="1" extra="nope"',
            "unknown attribute",
        ),
        (
            ' judge_name="sway" judge_score_chosen="1.0" '
            'judge_score_rejected="0.5" mined_at="2026-04-24T00:00:00Z"',
            "missing required attribute",
        ),
        (
            ' judge_name="sway" judge_score_chosen="nope" '
            'judge_score_rejected="0.5" mined_at="2026-04-24T00:00:00Z" mined_run_id="1"',
            "judge scores must be floats",
        ),
        (
            ' judge_name="sway" judge_score_chosen="1.0" '
            'judge_score_rejected="0.5" mined_at="not-a-date" mined_run_id="1"',
            "mined_at must be ISO-8601",
        ),
        (
            ' judge_name="sway" judge_score_chosen="1.0" '
            'judge_score_rejected="0.5" mined_at="2026-04-24T00:00:00Z" mined_run_id="abc"',
            "mined_run_id must be an integer",
        ),
        (
            ' judge_name="sway" judge_score_chosen="1.0" '
            'judge_score_rejected="0.5" mined_at="2026-04-24T00:00:00Z" mined_run_id="0"',
            "mined_run_id must be >= 1",
        ),
    ]

    # Same layout for the dlm-auto-synth marker parser.
    _AUTO_SYNTH_CASES = [
        (' synth_teacher="self', "invalid dlm-auto-synth marker syntax"),
        (
            ' synth_teacher="self" synth_teacher="other" '
            'synth_strategy="extraction" synth_at="2026-04-24T00:00:00Z" '
            'source_section_id="0123456789abcdef"',
            "repeats attribute",
        ),
        (
            ' synth_teacher="self" synth_strategy="extraction" '
            'synth_at="2026-04-24T00:00:00Z" source_section_id="0123456789abcdef" '
            'extra="nope"',
            "unknown attribute",
        ),
        (
            ' synth_teacher="self" synth_strategy="extraction" synth_at="2026-04-24T00:00:00Z"',
            "missing required attribute",
        ),
        (
            ' synth_teacher="self" synth_strategy="extraction" '
            'synth_at="not-a-date" source_section_id="0123456789abcdef"',
            "synth_at must be ISO-8601",
        ),
    ]

    @pytest.mark.parametrize(("blob", "message"), _AUTO_MINED_CASES)
    def test_auto_mined_marker_validation_errors(self, blob: str, message: str) -> None:
        """Every malformed auto-mined blob is rejected with its expected message."""
        with pytest.raises(FenceError, match=message):
            _parse_auto_mined_marker(blob, path=None, line=7)

    @pytest.mark.parametrize(("blob", "message"), _AUTO_SYNTH_CASES)
    def test_auto_synth_marker_validation_errors(self, blob: str, message: str) -> None:
        """Every malformed auto-synth blob is rejected with its expected message."""
        with pytest.raises(FenceError, match=message):
            _parse_auto_synth_marker(blob, path=None, line=11)