documentlanguagemodel Public

Watch 0 Fork 0 Star 0

Python · 16054 bytes Raw Blame History

  
        1
        """Parser + serializer integration: parse, tokenize, round-trip, encoding."""
      
        2
        
        3
        from __future__ import annotations
      
        4
        
        5
        from pathlib import Path
      
        6
        
        7
        import pytest
      
        8
        import yaml
      
        9
        
        10
        from dlm.doc.errors import (
      
        11
            DlmVersionError,
      
        12
            FenceError,
      
        13
            FrontmatterError,
      
        14
            SchemaValidationError,
      
        15
        )
      
        16
        from dlm.doc.parser import (
      
        17
            ParsedDlm,
      
        18
            _parse_auto_mined_marker,
      
        19
            _parse_auto_synth_marker,
      
        20
            _yaml_error_location,
      
        21
            parse_file,
      
        22
            parse_text,
      
        23
        )
      
        24
        from dlm.doc.sections import SectionType
      
        25
        from dlm.doc.serializer import serialize
      
        26
        from dlm.io.text import DlmEncodingError
      
        27
        from tests.fixtures.dlm_factory import instruction, make_dlm, preference, prose
      
        28
        
        29
        # Document id shared by the tests below: passed as `dlm_id` to `make_dlm`
        # and interpolated into the hand-written frontmatter strings.
        VALID_ULID = "01HZ4X7TGZM3J1A2B3C4D5E6F7"
      
        30
        
        31
        
        32
        class TestParseValidDocuments:
            """Well-formed documents parse into the expected frontmatter and sections."""

            def test_factory_default_parses(self) -> None:
                """The factory's default document parses and exposes its frontmatter."""
                document = parse_text(make_dlm(dlm_id=VALID_ULID))
                assert isinstance(document, ParsedDlm)
                frontmatter = document.frontmatter
                assert frontmatter.dlm_id == VALID_ULID
                assert frontmatter.base_model == "smollm2-135m"
                assert len(document.sections) >= 1

            def test_prose_only_document(self) -> None:
                """A single prose section parses to exactly one PROSE section with its text."""
                source = make_dlm(
                    dlm_id=VALID_ULID,
                    sections=[prose("# intro\n\nJust text here.\n")],
                )
                sections = parse_text(source).sections
                assert len(sections) == 1
                only_section = sections[0]
                assert only_section.type == SectionType.PROSE
                assert "Just text here." in only_section.content

            def test_instruction_section_roundtrips_content(self) -> None:
                """Two Q/A pairs in one instruction block yield one INSTRUCTION section."""
                qa_pairs = [("Q1?", "A1."), ("Q2?", "A2.")]
                source = make_dlm(dlm_id=VALID_ULID, sections=[instruction(*qa_pairs)])
                sections = parse_text(source).sections
                assert len(sections) == 1
                assert sections[0].type == SectionType.INSTRUCTION

            def test_preference_section_accepted_in_v1(self) -> None:
                """A preference block yields one PREFERENCE section."""
                triple = ("p", "c", "r")
                source = make_dlm(dlm_id=VALID_ULID, sections=[preference(triple)])
                sections = parse_text(source).sections
                assert len(sections) == 1
                assert sections[0].type == SectionType.PREFERENCE

            def test_mixed_sections_preserve_order(self) -> None:
                """Sections come back in the order they were authored."""
                authored = [
                    prose("intro.\n"),
                    instruction(("q", "a")),
                    preference(("p", "c", "r")),
                ]
                document = parse_text(make_dlm(dlm_id=VALID_ULID, sections=authored))
                seen_types = [section.type for section in document.sections]
                assert seen_types == [
                    SectionType.PROSE,
                    SectionType.INSTRUCTION,
                    SectionType.PREFERENCE,
                ]
      
        85
        
        86
        
        87
        class TestFrontmatterErrors:
            """Malformed or schema-invalid frontmatter is rejected with the matching error."""

            def test_missing_opening_delimiter(self) -> None:
                """Text with no frontmatter block fails and reports line 1."""
                with pytest.raises(FrontmatterError) as caught:
                    parse_text("no frontmatter here\n")
                assert caught.value.line == 1

            def test_missing_closing_delimiter(self) -> None:
                """An opened but never closed frontmatter block is rejected."""
                unterminated = f"---\ndlm_id: {VALID_ULID}\n"
                with pytest.raises(FrontmatterError, match="no closing"):
                    parse_text(unterminated)

            def test_non_mapping_yaml(self) -> None:
                """Frontmatter whose YAML is a list rather than a mapping is rejected."""
                list_frontmatter = "---\n- just\n- a\n- list\n---\n"
                with pytest.raises(FrontmatterError, match="must be a mapping"):
                    parse_text(list_frontmatter)

            def test_unknown_top_level_key_reports_schema_error(self) -> None:
                """An unexpected key is named in the schema validation error."""
                with_extra_key = (
                    f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\nsurprise: 1\n---\n"
                )
                with pytest.raises(SchemaValidationError, match="surprise"):
                    parse_text(with_extra_key)

            def test_invalid_yaml_surfaces_location(self) -> None:
                """Syntactically broken YAML is reported as a frontmatter error."""
                broken_yaml = "---\ndlm_id: [unclosed\n---\n"
                with pytest.raises(FrontmatterError, match="invalid YAML"):
                    parse_text(broken_yaml)

            def test_yaml_error_without_marks_returns_zero_location(self) -> None:
                """A bare `YAMLError` built from a message maps to location (0, 0)."""
                bare_error = yaml.YAMLError("plain boom")
                location = _yaml_error_location(bare_error)
                assert location == (0, 0)
      
        116
        
        117
        
        118
        class TestVersionGating:
            """Documents are accepted or refused based on their `dlm_version`."""

            def test_future_version_refused(self) -> None:
                """A `dlm_version` of 999 is refused as newer than this parser."""
                from_the_future = (
                    f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\ndlm_version: 999\n---\n"
                )
                with pytest.raises(DlmVersionError, match="newer than this parser"):
                    parse_text(from_the_future)

            def test_sub_current_version_without_migrator_refuses(self) -> None:
                """Runtime gate for documents older than the current schema version.

                Sprint 12b: sub-CURRENT documents are routed through the migration
                dispatcher, and a gap in the `MIGRATORS` registry raises
                `UnsupportedMigrationError` (a subclass of `DlmVersionError`).

                The test bumps `CURRENT_SCHEMA_VERSION` by one so the v1 document
                below has no registered migrator; the dispatcher must refuse it
                rather than silently accept a v1 dict. The coverage test in
                `tests/unit/doc/test_migrations.py` is the static enforcement;
                this one is the runtime gate.
                """
                from dlm.doc import versioned as versioned_module
                from dlm.doc.errors import UnsupportedMigrationError

                v1_document = (
                    f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\ndlm_version: 1\n---\n"
                )
                saved_version = versioned_module.CURRENT_SCHEMA_VERSION
                versioned_module.CURRENT_SCHEMA_VERSION = saved_version + 1
                try:
                    with pytest.raises(UnsupportedMigrationError, match="no migrator"):
                        parse_text(v1_document)
                finally:
                    # Restore the module global even when the assertion above fails.
                    versioned_module.CURRENT_SCHEMA_VERSION = saved_version
      
        145
        
        146
        
        147
        class TestFenceGrammar:
            """Section-fence handling: unknown fences, code blocks, blank prose, attributes."""

            def test_unknown_fence_raises(self) -> None:
                """A `::weird::` fence is rejected as an unknown section fence."""
                valid_part = make_dlm(dlm_id=VALID_ULID, sections=[prose("before fence\n")])
                # The bogus fence is appended after parsing-friendly content.
                document = valid_part + "\n::weird::\n\nbody\n"
                with pytest.raises(FenceError, match="unknown section fence"):
                    parse_text(document)

            def test_fence_inside_code_block_is_literal(self) -> None:
                """Fence-looking text inside a code block stays part of the prose section."""
                prose_lines = [
                    "Some prose.\n",
                    "\n",
                    "```\n",
                    "::instruction::\n",  # should NOT be parsed as a fence
                    "```\n",
                ]
                document = make_dlm(dlm_id=VALID_ULID, sections=[prose("".join(prose_lines))])
                sections = parse_text(document).sections
                assert len(sections) == 1
                assert sections[0].type == SectionType.PROSE

            def test_unterminated_code_block_raises(self) -> None:
                """A code block that is opened but never closed is rejected."""
                # Written by hand so the factory doesn't terminate the block.
                never_closed = (
                    f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\n---\n\n```\nforever open\n"
                )
                with pytest.raises(FenceError, match="unterminated"):
                    parse_text(never_closed)

            def test_whitespace_only_prose_between_fences_is_elided(self) -> None:
                """Blank/whitespace lines between two fences produce no extra section."""
                pieces = [
                    f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\n---\n\n",
                    "::instruction::\n",
                    "### Q\n",
                    "q\n",
                    "### A\n",
                    "a\n",
                    "\n \n\t\n\n",  # whitespace-only gap between the fences
                    "::preference::\n",
                    "### Prompt\n",
                    "p\n",
                    "### Chosen\n",
                    "c\n",
                    "### Rejected\n",
                    "r\n",
                ]
                document = parse_text("".join(pieces))
                seen_types = [section.type for section in document.sections]
                assert seen_types == [
                    SectionType.INSTRUCTION,
                    SectionType.PREFERENCE,
                ]

            def test_whitespace_only_trailing_prose_is_elided(self) -> None:
                """Whitespace-only lines after the last section produce no extra section."""
                pieces = [
                    f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\n---\n\n",
                    "::instruction::\n",
                    "### Q\n",
                    "q\n",
                    "### A\n",
                    "a\n",
                    "\n",
                    "   \n",
                    "\t\n",
                ]
                document = parse_text("".join(pieces))
                seen_types = [section.type for section in document.sections]
                assert seen_types == [SectionType.INSTRUCTION]

            def test_whitespace_only_prose_before_first_fence_is_elided(self) -> None:
                """Whitespace-only lines before the first fence produce no extra section."""
                pieces = [
                    f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\n---\n\n",
                    "   \n",
                    "\t\n",
                    "::instruction::\n",
                    "### Q\n",
                    "q\n",
                    "### A\n",
                    "a\n",
                ]
                document = parse_text("".join(pieces))
                seen_types = [section.type for section in document.sections]
                assert seen_types == [SectionType.INSTRUCTION]

            def test_unknown_attribute_fence_raises(self) -> None:
                """An attribute-style fence with an unrecognised name is rejected."""
                frontmatter = f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\n---\n\n"
                document = frontmatter + '::widget path="image.png"::\n'
                with pytest.raises(FenceError, match="unknown attribute fence"):
                    parse_text(document)

            def test_non_attribute_fence_rejects_attribute_form(self) -> None:
                """Attributes on the `instruction` fence are rejected."""
                frontmatter = f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\n---\n\n"
                document = frontmatter + '::instruction path="nope"::\n'
                with pytest.raises(FenceError, match="does not take attributes"):
                    parse_text(document)
      
        244
        
        245
        
        246
        class TestParsedDlmImmutability:
            """The parsed document exposes its sections as a tuple."""

            def test_sections_is_tuple(self) -> None:
                """`sections` on a parsed default document is a `tuple` instance."""
                document = parse_text(make_dlm(dlm_id=VALID_ULID))
                sections = document.sections
                assert isinstance(sections, tuple)
      
        250
        
        251
        
        252
        class TestRoundTrip:
            """parse -> serialize -> parse keeps the document stable."""

            def test_parse_then_serialize_then_parse_matches(self) -> None:
                """Frontmatter, section types and section content survive a round trip."""
                authored_sections = [
                    prose("# Heading\n\nParagraph.\n"),
                    instruction(("What is 2+2?", "4."), ("And 3+3?", "6.")),
                    preference(("Greet.", "Hi!", "hey.")),
                ]
                source = make_dlm(
                    dlm_id=VALID_ULID,
                    sections=authored_sections,
                    training_overrides={"lora_r": 16, "num_epochs": 2},
                    system_prompt="You are helpful.\nAlways be concise.",
                )
                before = parse_text(source)
                after = parse_text(serialize(before))

                # Frontmatter must compare equal as a whole.
                assert before.frontmatter == after.frontmatter
                # Sections must agree pairwise on both type and content.
                assert [s.type for s in before.sections] == [s.type for s in after.sections]
                assert [s.content for s in before.sections] == [s.content for s in after.sections]

            def test_second_pass_is_byte_identical(self) -> None:
                """Serializing an already-serialized document changes nothing."""
                source = make_dlm(
                    dlm_id=VALID_ULID,
                    sections=[prose("content\n"), instruction(("q", "a"))],
                )
                first_pass = serialize(parse_text(source))
                second_pass = serialize(parse_text(first_pass))
                assert first_pass == second_pass  # idempotent
      
        281
        
        282
        
        283
        class TestSectionIdStabilityUnderEdits:
            """Whitespace-only edits in one section must not change another section's ID."""

            def test_editing_one_prose_does_not_change_other_ids(self) -> None:
                """Trailing spaces added to the prose leave the instruction section's ID alone."""

                def build_document(prose_body: str) -> str:
                    # Only the prose body varies; the instruction section is held fixed.
                    return make_dlm(
                        dlm_id=VALID_ULID,
                        sections=[
                            prose(prose_body),
                            instruction(("stable q", "stable a")),
                        ],
                    )

                untouched = parse_text(build_document("first section\n")).sections[1]
                # Same document, but with trailing spaces added to the prose line.
                after_edit = parse_text(build_document("first section   \n")).sections[1]
                assert untouched.section_id == after_edit.section_id
      
        304
        
        305
        
        306
        class TestEncodingContract:
            """Audit F15: UTF-8 strict, BOM strip, CRLF normalization."""

            def test_parse_file_strips_bom(self, tmp_path: Path) -> None:
                """A file that starts with a UTF-8 BOM still parses to the expected id."""
                bom_file = tmp_path / "with_bom.dlm"
                encoded = make_dlm(dlm_id=VALID_ULID).encode("utf-8")
                bom_file.write_bytes(b"\xef\xbb\xbf" + encoded)
                assert parse_file(bom_file).frontmatter.dlm_id == VALID_ULID

            def test_parse_file_normalizes_crlf(self, tmp_path: Path) -> None:
                """A file written with CRLF line endings still parses to the expected id."""
                crlf_file = tmp_path / "crlf.dlm"
                crlf_text = make_dlm(dlm_id=VALID_ULID).replace("\n", "\r\n")
                crlf_file.write_bytes(crlf_text.encode("utf-8"))
                assert parse_file(crlf_file).frontmatter.dlm_id == VALID_ULID

            def test_invalid_utf8_raises_encoding_error(self, tmp_path: Path) -> None:
                """An invalid byte raises `DlmEncodingError` carrying offset and path."""
                broken_file = tmp_path / "bad.dlm"
                broken_file.write_bytes(b"---\nbase_model: \xff\n---\n")
                with pytest.raises(DlmEncodingError) as caught:
                    parse_file(broken_file)
                error = caught.value
                # Audit-02 minor: byte_offset has to surface end-to-end through
                # the parser path, not only in the io-layer test.
                assert error.byte_offset == len("---\nbase_model: ")
                assert error.path == broken_file

            def test_crlf_and_lf_produce_identical_section_ids(self, tmp_path: Path) -> None:
                """Windows and Unix edits of the same content must hash-identically."""
                source = make_dlm(
                    dlm_id=VALID_ULID,
                    sections=[prose("body line 1\nbody line 2\n")],
                )
                variants = {
                    "lf.dlm": source,
                    "crlf.dlm": source.replace("\n", "\r\n"),
                }
                ids_by_file = {}
                for file_name, contents in variants.items():
                    target = tmp_path / file_name
                    target.write_bytes(contents.encode("utf-8"))
                    ids_by_file[file_name] = [s.section_id for s in parse_file(target).sections]
                assert ids_by_file["lf.dlm"] == ids_by_file["crlf.dlm"]
      
        348
        
        349
        
        350
        class TestAutoMarkerValidation:
            """Invalid auto-mined / auto-synth marker blobs raise `FenceError` with a specific message."""

            # (blob, expected message fragment) pairs for the auto-mined marker.
            # Each blob is spelled one attribute per line; adjacent literals concatenate.
            _MINED_CASES = [
                (' judge_name="sway', "invalid dlm-auto-mined marker syntax"),
                (
                    ' judge_name="sway"'
                    ' judge_name="other"'
                    ' judge_score_chosen="1.0"'
                    ' judge_score_rejected="0.5"'
                    ' mined_at="2026-04-24T00:00:00Z"'
                    ' mined_run_id="1"',
                    "repeats attribute",
                ),
                (
                    ' judge_name="sway"'
                    ' judge_score_chosen="1.0"'
                    ' judge_score_rejected="0.5"'
                    ' mined_at="2026-04-24T00:00:00Z"'
                    ' mined_run_id="1"'
                    ' extra="nope"',
                    "unknown attribute",
                ),
                (
                    ' judge_name="sway"'
                    ' judge_score_chosen="1.0"'
                    ' judge_score_rejected="0.5"'
                    ' mined_at="2026-04-24T00:00:00Z"',
                    "missing required attribute",
                ),
                (
                    ' judge_name="sway"'
                    ' judge_score_chosen="nope"'
                    ' judge_score_rejected="0.5"'
                    ' mined_at="2026-04-24T00:00:00Z"'
                    ' mined_run_id="1"',
                    "judge scores must be floats",
                ),
                (
                    ' judge_name="sway"'
                    ' judge_score_chosen="1.0"'
                    ' judge_score_rejected="0.5"'
                    ' mined_at="not-a-date"'
                    ' mined_run_id="1"',
                    "mined_at must be ISO-8601",
                ),
                (
                    ' judge_name="sway"'
                    ' judge_score_chosen="1.0"'
                    ' judge_score_rejected="0.5"'
                    ' mined_at="2026-04-24T00:00:00Z"'
                    ' mined_run_id="abc"',
                    "mined_run_id must be an integer",
                ),
                (
                    ' judge_name="sway"'
                    ' judge_score_chosen="1.0"'
                    ' judge_score_rejected="0.5"'
                    ' mined_at="2026-04-24T00:00:00Z"'
                    ' mined_run_id="0"',
                    "mined_run_id must be >= 1",
                ),
            ]

            # (blob, expected message fragment) pairs for the auto-synth marker.
            _SYNTH_CASES = [
                (' synth_teacher="self', "invalid dlm-auto-synth marker syntax"),
                (
                    ' synth_teacher="self"'
                    ' synth_teacher="other"'
                    ' synth_strategy="extraction"'
                    ' synth_at="2026-04-24T00:00:00Z"'
                    ' source_section_id="0123456789abcdef"',
                    "repeats attribute",
                ),
                (
                    ' synth_teacher="self"'
                    ' synth_strategy="extraction"'
                    ' synth_at="2026-04-24T00:00:00Z"'
                    ' source_section_id="0123456789abcdef"'
                    ' extra="nope"',
                    "unknown attribute",
                ),
                (
                    ' synth_teacher="self"'
                    ' synth_strategy="extraction"'
                    ' synth_at="2026-04-24T00:00:00Z"',
                    "missing required attribute",
                ),
                (
                    ' synth_teacher="self"'
                    ' synth_strategy="extraction"'
                    ' synth_at="not-a-date"'
                    ' source_section_id="0123456789abcdef"',
                    "synth_at must be ISO-8601",
                ),
            ]

            @pytest.mark.parametrize(("blob", "message"), _MINED_CASES)
            def test_auto_mined_marker_validation_errors(self, blob: str, message: str) -> None:
                """Each malformed auto-mined blob raises `FenceError` matching `message`."""
                with pytest.raises(FenceError, match=message):
                    _parse_auto_mined_marker(blob, path=None, line=7)

            @pytest.mark.parametrize(("blob", "message"), _SYNTH_CASES)
            def test_auto_synth_marker_validation_errors(self, blob: str, message: str) -> None:
                """Each malformed auto-synth blob raises `FenceError` matching `message`."""
                with pytest.raises(FenceError, match=message):
                    _parse_auto_synth_marker(blob, path=None, line=11)

1	"""Parser + serializer integration: parse, tokenize, round-trip, encoding."""
2
3	from __future__ import annotations
4
5	from pathlib import Path
6
7	import pytest
8	import yaml
9
10	from dlm.doc.errors import (
11	DlmVersionError,
12	FenceError,
13	FrontmatterError,
14	SchemaValidationError,
15	)
16	from dlm.doc.parser import (
17	ParsedDlm,
18	_parse_auto_mined_marker,
19	_parse_auto_synth_marker,
20	_yaml_error_location,
21	parse_file,
22	parse_text,
23	)
24	from dlm.doc.sections import SectionType
25	from dlm.doc.serializer import serialize
26	from dlm.io.text import DlmEncodingError
27	from tests.fixtures.dlm_factory import instruction, make_dlm, preference, prose
28
29	VALID_ULID = "01HZ4X7TGZM3J1A2B3C4D5E6F7"
30
31
class TestParseValidDocuments:
    """Well-formed documents parse into the expected frontmatter and sections."""

    def test_factory_default_parses(self) -> None:
        """The factory's default document parses and exposes its frontmatter."""
        document = parse_text(make_dlm(dlm_id=VALID_ULID))
        assert isinstance(document, ParsedDlm)
        frontmatter = document.frontmatter
        assert frontmatter.dlm_id == VALID_ULID
        assert frontmatter.base_model == "smollm2-135m"
        assert len(document.sections) >= 1

    def test_prose_only_document(self) -> None:
        """A single prose section parses to exactly one PROSE section with its text."""
        source = make_dlm(
            dlm_id=VALID_ULID,
            sections=[prose("# intro\n\nJust text here.\n")],
        )
        sections = parse_text(source).sections
        assert len(sections) == 1
        only_section = sections[0]
        assert only_section.type == SectionType.PROSE
        assert "Just text here." in only_section.content

    def test_instruction_section_roundtrips_content(self) -> None:
        """Two Q/A pairs in one instruction block yield one INSTRUCTION section."""
        qa_pairs = [("Q1?", "A1."), ("Q2?", "A2.")]
        source = make_dlm(dlm_id=VALID_ULID, sections=[instruction(*qa_pairs)])
        sections = parse_text(source).sections
        assert len(sections) == 1
        assert sections[0].type == SectionType.INSTRUCTION

    def test_preference_section_accepted_in_v1(self) -> None:
        """A preference block yields one PREFERENCE section."""
        triple = ("p", "c", "r")
        source = make_dlm(dlm_id=VALID_ULID, sections=[preference(triple)])
        sections = parse_text(source).sections
        assert len(sections) == 1
        assert sections[0].type == SectionType.PREFERENCE

    def test_mixed_sections_preserve_order(self) -> None:
        """Sections come back in the order they were authored."""
        authored = [
            prose("intro.\n"),
            instruction(("q", "a")),
            preference(("p", "c", "r")),
        ]
        document = parse_text(make_dlm(dlm_id=VALID_ULID, sections=authored))
        seen_types = [section.type for section in document.sections]
        assert seen_types == [
            SectionType.PROSE,
            SectionType.INSTRUCTION,
            SectionType.PREFERENCE,
        ]
85
86
class TestFrontmatterErrors:
    """Malformed or schema-invalid frontmatter is rejected with the matching error."""

    def test_missing_opening_delimiter(self) -> None:
        """Text with no frontmatter block fails and reports line 1."""
        with pytest.raises(FrontmatterError) as caught:
            parse_text("no frontmatter here\n")
        assert caught.value.line == 1

    def test_missing_closing_delimiter(self) -> None:
        """An opened but never closed frontmatter block is rejected."""
        unterminated = f"---\ndlm_id: {VALID_ULID}\n"
        with pytest.raises(FrontmatterError, match="no closing"):
            parse_text(unterminated)

    def test_non_mapping_yaml(self) -> None:
        """Frontmatter whose YAML is a list rather than a mapping is rejected."""
        list_frontmatter = "---\n- just\n- a\n- list\n---\n"
        with pytest.raises(FrontmatterError, match="must be a mapping"):
            parse_text(list_frontmatter)

    def test_unknown_top_level_key_reports_schema_error(self) -> None:
        """An unexpected key is named in the schema validation error."""
        with_extra_key = (
            f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\nsurprise: 1\n---\n"
        )
        with pytest.raises(SchemaValidationError, match="surprise"):
            parse_text(with_extra_key)

    def test_invalid_yaml_surfaces_location(self) -> None:
        """Syntactically broken YAML is reported as a frontmatter error."""
        broken_yaml = "---\ndlm_id: [unclosed\n---\n"
        with pytest.raises(FrontmatterError, match="invalid YAML"):
            parse_text(broken_yaml)

    def test_yaml_error_without_marks_returns_zero_location(self) -> None:
        """A bare `YAMLError` built from a message maps to location (0, 0)."""
        bare_error = yaml.YAMLError("plain boom")
        location = _yaml_error_location(bare_error)
        assert location == (0, 0)
116
117
class TestVersionGating:
    """Documents are accepted or refused based on their `dlm_version`."""

    def test_future_version_refused(self) -> None:
        """A `dlm_version` of 999 is refused as newer than this parser."""
        from_the_future = (
            f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\ndlm_version: 999\n---\n"
        )
        with pytest.raises(DlmVersionError, match="newer than this parser"):
            parse_text(from_the_future)

    def test_sub_current_version_without_migrator_refuses(self) -> None:
        """Runtime gate for documents older than the current schema version.

        Sprint 12b: sub-CURRENT documents are routed through the migration
        dispatcher, and a gap in the `MIGRATORS` registry raises
        `UnsupportedMigrationError` (a subclass of `DlmVersionError`).

        The test bumps `CURRENT_SCHEMA_VERSION` by one so the v1 document
        below has no registered migrator; the dispatcher must refuse it
        rather than silently accept a v1 dict. The coverage test in
        `tests/unit/doc/test_migrations.py` is the static enforcement;
        this one is the runtime gate.
        """
        from dlm.doc import versioned as versioned_module
        from dlm.doc.errors import UnsupportedMigrationError

        v1_document = (
            f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\ndlm_version: 1\n---\n"
        )
        saved_version = versioned_module.CURRENT_SCHEMA_VERSION
        versioned_module.CURRENT_SCHEMA_VERSION = saved_version + 1
        try:
            with pytest.raises(UnsupportedMigrationError, match="no migrator"):
                parse_text(v1_document)
        finally:
            # Restore the module global even when the assertion above fails.
            versioned_module.CURRENT_SCHEMA_VERSION = saved_version
145
146
class TestFenceGrammar:
    """Section-fence recognition: unknown fences, code blocks, blank prose, attributes."""

    def test_unknown_fence_raises(self) -> None:
        """A `::weird::` fence after valid content is rejected as unknown."""
        valid_part = make_dlm(
            dlm_id=VALID_ULID,
            sections=[prose("before fence\n")],
        )
        # Append a bogus fence after parsing-friendly content.
        with_bogus_fence = valid_part + "\n::weird::\n\nbody\n"
        with pytest.raises(FenceError, match="unknown section fence"):
            parse_text(with_bogus_fence)

    def test_fence_inside_code_block_is_literal(self) -> None:
        """A fence line inside a ``` block stays part of the prose section."""
        prose_body = (
            "Some prose.\n"
            "\n"
            "```\n"
            "::instruction::\n"  # should NOT be parsed as a fence
            "```\n"
        )
        document = make_dlm(dlm_id=VALID_ULID, sections=[prose(prose_body)])
        result = parse_text(document)
        assert len(result.sections) == 1
        assert result.sections[0].type == SectionType.PROSE

    def test_unterminated_code_block_raises(self) -> None:
        """A ``` block that is never closed raises an "unterminated" fence error."""
        # Author the text manually so the factory doesn't terminate it.
        never_closed = (
            "---\n"
            f"dlm_id: {VALID_ULID}\n"
            "base_model: smollm2-135m\n"
            "---\n"
            "\n"
            "```\n"
            "forever open\n"
        )
        with pytest.raises(FenceError, match="unterminated"):
            parse_text(never_closed)

    def test_whitespace_only_prose_between_fences_is_elided(self) -> None:
        """Blank/whitespace lines between two fenced sections yield no prose section."""
        document = (
            f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\n---\n\n"
            "::instruction::\n"
            "### Q\n"
            "q\n"
            "### A\n"
            "a\n"
            "\n \n\t\n\n"
            "::preference::\n"
            "### Prompt\n"
            "p\n"
            "### Chosen\n"
            "c\n"
            "### Rejected\n"
            "r\n"
        )
        result = parse_text(document)
        section_types = [section.type for section in result.sections]
        assert section_types == [
            SectionType.INSTRUCTION,
            SectionType.PREFERENCE,
        ]

    def test_whitespace_only_trailing_prose_is_elided(self) -> None:
        """Whitespace-only lines after the last section yield no prose section."""
        document = (
            f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\n---\n\n"
            "::instruction::\n"
            "### Q\n"
            "q\n"
            "### A\n"
            "a\n"
            "\n"
            " \n"
            "\t\n"
        )
        result = parse_text(document)
        section_types = [section.type for section in result.sections]
        assert section_types == [SectionType.INSTRUCTION]

    def test_whitespace_only_prose_before_first_fence_is_elided(self) -> None:
        """Whitespace-only lines before the first fence yield no prose section."""
        document = (
            f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\n---\n\n"
            " \n"
            "\t\n"
            "::instruction::\n"
            "### Q\n"
            "q\n"
            "### A\n"
            "a\n"
        )
        result = parse_text(document)
        section_types = [section.type for section in result.sections]
        assert section_types == [SectionType.INSTRUCTION]

    def test_unknown_attribute_fence_raises(self) -> None:
        """An attribute-style fence with an unrecognized name is rejected."""
        document = (
            f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\n---\n\n"
            '::widget path="image.png"::\n'
        )
        with pytest.raises(FenceError, match="unknown attribute fence"):
            parse_text(document)

    def test_non_attribute_fence_rejects_attribute_form(self) -> None:
        """`::instruction ...::` written with attributes is rejected."""
        document = (
            f"---\ndlm_id: {VALID_ULID}\nbase_model: smollm2-135m\n---\n\n"
            '::instruction path="nope"::\n'
        )
        with pytest.raises(FenceError, match="does not take attributes"):
            parse_text(document)
244
245
class TestParsedDlmImmutability:
    """The parsed document exposes its sections as a tuple."""

    def test_sections_is_tuple(self) -> None:
        """`sections` of a parsed factory-default document is a `tuple`."""
        document = make_dlm(dlm_id=VALID_ULID)
        sections = parse_text(document).sections
        assert isinstance(sections, tuple)
250
251
class TestRoundTrip:
    """parse -> serialize -> parse keeps the document stable."""

    def test_parse_then_serialize_then_parse_matches(self) -> None:
        """Frontmatter, section types and section content survive a round trip."""
        source_text = make_dlm(
            dlm_id=VALID_ULID,
            sections=[
                prose("# Heading\n\nParagraph.\n"),
                instruction(("What is 2+2?", "4."), ("And 3+3?", "6.")),
                preference(("Greet.", "Hi!", "hey.")),
            ],
            training_overrides={"lora_r": 16, "num_epochs": 2},
            system_prompt="You are helpful.\nAlways be concise.",
        )
        before = parse_text(source_text)
        after = parse_text(serialize(before))

        # Frontmatter equality
        assert before.frontmatter == after.frontmatter

        # Section types + content equality
        types_before = [s.type for s in before.sections]
        types_after = [s.type for s in after.sections]
        assert types_before == types_after

        content_before = [s.content for s in before.sections]
        content_after = [s.content for s in after.sections]
        assert content_before == content_after

    def test_second_pass_is_byte_identical(self) -> None:
        """Serializing an already-serialized document changes nothing."""
        source_text = make_dlm(
            dlm_id=VALID_ULID,
            sections=[prose("content\n"), instruction(("q", "a"))],
        )
        first_pass = serialize(parse_text(source_text))
        second_pass = serialize(parse_text(first_pass))
        assert first_pass == second_pass  # idempotent
281
282
class TestSectionIdStabilityUnderEdits:
    """Whitespace-only edits in one section must not change another section's ID."""

    def test_editing_one_prose_does_not_change_other_ids(self) -> None:
        """The instruction section keeps its ID when the prose before it is edited."""

        def build(first_prose: str) -> str:
            # Same document each time; only the leading prose text varies.
            return make_dlm(
                dlm_id=VALID_ULID,
                sections=[
                    prose(first_prose),
                    instruction(("stable q", "stable a")),
                ],
            )

        doc_before = build("first section\n")
        doc_after = build("first section \n")  # trailing spaces added

        untouched_before = parse_text(doc_before).sections[1]
        untouched_after = parse_text(doc_after).sections[1]
        assert untouched_before.section_id == untouched_after.section_id
304
305
class TestEncodingContract:
    """Audit F15: UTF-8 strict, BOM strip, CRLF normalization."""

    def test_parse_file_strips_bom(self, tmp_path: Path) -> None:
        """A file that starts with a UTF-8 BOM still parses to the expected ID."""
        payload = make_dlm(dlm_id=VALID_ULID).encode("utf-8")
        target = tmp_path / "with_bom.dlm"
        target.write_bytes(b"\xef\xbb\xbf" + payload)
        result = parse_file(target)
        assert result.frontmatter.dlm_id == VALID_ULID

    def test_parse_file_normalizes_crlf(self, tmp_path: Path) -> None:
        """A file written with CRLF line endings still parses to the expected ID."""
        crlf_text = make_dlm(dlm_id=VALID_ULID).replace("\n", "\r\n")
        target = tmp_path / "crlf.dlm"
        target.write_bytes(crlf_text.encode("utf-8"))
        result = parse_file(target)
        assert result.frontmatter.dlm_id == VALID_ULID

    def test_invalid_utf8_raises_encoding_error(self, tmp_path: Path) -> None:
        """An invalid UTF-8 byte raises `DlmEncodingError` with offset and path."""
        target = tmp_path / "bad.dlm"
        target.write_bytes(b"---\nbase_model: \xff\n---\n")
        with pytest.raises(DlmEncodingError) as caught:
            parse_file(target)
        # Audit-02 minor: the parser path must surface byte_offset
        # end-to-end (not just the io-layer test).
        error = caught.value
        assert error.byte_offset == len("---\nbase_model: ")
        assert error.path == target

    def test_crlf_and_lf_produce_identical_section_ids(self, tmp_path: Path) -> None:
        """Windows and Unix edits of the same content must hash-identically."""
        lf_text = make_dlm(
            dlm_id=VALID_ULID,
            sections=[prose("body line 1\nbody line 2\n")],
        )
        lf_file = tmp_path / "lf.dlm"
        lf_file.write_bytes(lf_text.encode("utf-8"))
        crlf_file = tmp_path / "crlf.dlm"
        crlf_file.write_bytes(lf_text.replace("\n", "\r\n").encode("utf-8"))

        from_lf = parse_file(lf_file)
        from_crlf = parse_file(crlf_file)
        ids_from_lf = [s.section_id for s in from_lf.sections]
        ids_from_crlf = [s.section_id for s in from_crlf.sections]
        assert ids_from_lf == ids_from_crlf
348
349
class TestAutoMarkerValidation:
    """Malformed auto-mined / auto-synth marker blobs raise `FenceError`."""

    @pytest.mark.parametrize(
        ("blob", "message"),
        [
            # Unterminated quoted value.
            (' judge_name="sway', "invalid dlm-auto-mined marker syntax"),
            # `judge_name` given twice.
            (
                ' judge_name="sway" judge_name="other" '
                'judge_score_chosen="1.0" judge_score_rejected="0.5" '
                'mined_at="2026-04-24T00:00:00Z" mined_run_id="1"',
                "repeats attribute",
            ),
            # Extra attribute `extra`.
            (
                ' judge_name="sway" judge_score_chosen="1.0" judge_score_rejected="0.5" '
                'mined_at="2026-04-24T00:00:00Z" mined_run_id="1" extra="nope"',
                "unknown attribute",
            ),
            # `mined_run_id` left out.
            (
                ' judge_name="sway" judge_score_chosen="1.0" '
                'judge_score_rejected="0.5" mined_at="2026-04-24T00:00:00Z"',
                "missing required attribute",
            ),
            # Non-numeric judge score.
            (
                ' judge_name="sway" judge_score_chosen="nope" '
                'judge_score_rejected="0.5" mined_at="2026-04-24T00:00:00Z" mined_run_id="1"',
                "judge scores must be floats",
            ),
            # Timestamp that is not a date.
            (
                ' judge_name="sway" judge_score_chosen="1.0" '
                'judge_score_rejected="0.5" mined_at="not-a-date" mined_run_id="1"',
                "mined_at must be ISO-8601",
            ),
            # Non-integer run id.
            (
                ' judge_name="sway" judge_score_chosen="1.0" '
                'judge_score_rejected="0.5" mined_at="2026-04-24T00:00:00Z" mined_run_id="abc"',
                "mined_run_id must be an integer",
            ),
            # Run id below the minimum of 1.
            (
                ' judge_name="sway" judge_score_chosen="1.0" '
                'judge_score_rejected="0.5" mined_at="2026-04-24T00:00:00Z" mined_run_id="0"',
                "mined_run_id must be >= 1",
            ),
        ],
    )
    def test_auto_mined_marker_validation_errors(self, blob: str, message: str) -> None:
        """Each malformed auto-mined blob raises `FenceError` matching `message`."""
        with pytest.raises(FenceError, match=message):
            _parse_auto_mined_marker(blob, path=None, line=7)

    @pytest.mark.parametrize(
        ("blob", "message"),
        [
            # Unterminated quoted value.
            (' synth_teacher="self', "invalid dlm-auto-synth marker syntax"),
            # `synth_teacher` given twice.
            (
                ' synth_teacher="self" synth_teacher="other" '
                'synth_strategy="extraction" synth_at="2026-04-24T00:00:00Z" '
                'source_section_id="0123456789abcdef"',
                "repeats attribute",
            ),
            # Extra attribute `extra`.
            (
                ' synth_teacher="self" synth_strategy="extraction" '
                'synth_at="2026-04-24T00:00:00Z" source_section_id="0123456789abcdef" '
                'extra="nope"',
                "unknown attribute",
            ),
            # `source_section_id` left out.
            (
                ' synth_teacher="self" synth_strategy="extraction" synth_at="2026-04-24T00:00:00Z"',
                "missing required attribute",
            ),
            # Timestamp that is not a date.
            (
                ' synth_teacher="self" synth_strategy="extraction" '
                'synth_at="not-a-date" source_section_id="0123456789abcdef"',
                "synth_at must be ISO-8601",
            ),
        ],
    )
    def test_auto_synth_marker_validation_errors(self, blob: str, message: str) -> None:
        """Each malformed auto-synth blob raises `FenceError` matching `message`."""
        with pytest.raises(FenceError, match=message):
            _parse_auto_synth_marker(blob, path=None, line=11)