documentlanguagemodel Public

Watch 0 Fork 0 Star 0

Python · 9838 bytes Raw Blame History

  
        """Body sections: PROSE (default), INSTRUCTION (`::instruction::`),
        PREFERENCE (`::preference::`), IMAGE (`::image path="..." alt="..."::`),
        AUDIO (`::audio path="..." transcript="..."::`).

        Text-body sections carry their raw content verbatim plus a stable
        `section_id` derived from `sha256(type || "\\n" || normalized_content)[:16]`
        where normalization is the same LF+BOM-stripping applied by `dlm.io.text`.

        Media sections (IMAGE, AUDIO) reference a binary blob outside the
        `.dlm` file; their identity is
        `sha256(type || "\\n" || path || "\\n" || blob_sha)[:16]` once the blob
        has been ingested into the content-addressed store. The path is part
        of the hash because different logical uses of the same bytes
        (`hero.png` in section A, `same-bytes.png` in section B) should not
        collapse to one training row. Before ingestion, `media_blob_sha` is
        `None` and the path alone seeds identity — sufficient for `dlm show`
        but not for training.

        This means:

        - The section ID is stable across Windows/Unix line endings (audit F15).
        - A whitespace-only edit inside *another* section does not change this
          section's ID (content-addressing correctness).
        - Changing the section type (prose → instruction, image → audio)
          produces a different ID even for identical content (type namespaces
          are disjoint).
        - For media sections, a blob-bytes change flips the ID even if the
          path didn't move; a path change flips the ID even if the bytes are
          identical.

        AUDIO vs IMAGE: audio sections require `media_transcript` (text-side
        supervision); image sections optionally carry a caption in `content`.
        The training row for audio ties the transcript to the audio features;
        the training row for image uses the image-token placeholder and caption.
        """
      
        36
        
        37
        from __future__ import annotations
      
        38
        
        39
        import hashlib
      
        40
        import math
      
        41
        from collections.abc import Mapping
      
        42
        from dataclasses import dataclass, field
      
        43
        from enum import StrEnum
      
        44
        from types import MappingProxyType
      
        45
        
        46
        from dlm.io.text import normalize_for_hashing
      
        47
        
        48
        _EMPTY_TAGS: Mapping[str, str] = MappingProxyType({})
      
        49
        
        50
        
        51
        class SectionType(StrEnum):
      
        52
            PROSE = "prose"
      
        53
            INSTRUCTION = "instruction"
      
        54
            PREFERENCE = "preference"
      
        55
            IMAGE = "image"
      
        56
            AUDIO = "audio"
      
        57
        
        58
        
        59
        _SECTION_ID_BYTES = 8  # 16 hex chars
      
        60
        
        61
        
        62
        @dataclass(frozen=True)
      
        63
        class Section:
      
        64
            """A single body section.
      
        65
        
        66
            `start_line` is the 1-indexed line in the source where the section
      
        67
            begins (the fence line for fenced sections, the first prose line for
      
        68
            PROSE). Used for error reporting and is **not** part of the section
      
        69
            identity.
      
        70
        
        71
            `content` is the raw section body, fence-free. Fence lines are
      
        72
            stripped; leading/trailing blank lines around the content are
      
        73
            preserved as-is to keep round-trip idempotent after the first pass.
      
        74
        
        75
            `adapter` is the optional `#name` routing suffix from a fence like
      
        76
            `::instruction#tone::`. `None` means "unrouted" — the section's rows
      
        77
            flow to whichever adapter the router picks as default (the first
      
        78
            declared, in multi-adapter docs). The field is intentionally not
      
        79
            part of `section_id`: moving a section between adapters is a routing
      
        80
            change, not a content change, and retention snapshots key off the
      
        81
            content hash.
      
        82
        
        83
            `tags` is the optional free-form metadata map flowed from
      
        84
            `.dlm/training.yaml`. Consumers (weighting, filtering,
      
        85
            sway probes) read these; the trainer's row-production path
      
        86
            ignores them. Like `adapter`, tags are **not** part of `section_id`
      
        87
            — metadata churn doesn't invalidate replay identity.
      
        88
        
        89
            `auto_harvest` marks a section as written back into the `.dlm` by
      
        90
            `dlm harvest` — the pull-mode that ingests failing probes from a
      
        91
            sway report (schema v7). `harvest_source` records the source run
      
        92
            ("run_N_sway"-style opaque token) for provenance. Like `tags`,
      
        93
            neither field participates in `section_id`.
      
        94
        
        95
            `auto_mined` marks a `::preference::` section as synthesized by
      
        96
            the preference-mining loop rather than hand-authored. The
      
        97
            accompanying judge metadata (`judge_name`, `judge_score_chosen`,
      
        98
            `judge_score_rejected`, `mined_at`, `mined_run_id`) captures
      
        99
            provenance for review, metrics, and revert flows. Like harvest
      
        100
            metadata, these fields do not participate in `section_id`.
      
        101
        
        102
            `auto_synth` marks an `::instruction::` section as synthesized by
      
        103
            the instruction-generation loop rather than hand-authored. The
      
        104
            accompanying metadata (`synth_teacher`, `synth_strategy`,
      
        105
            `synth_at`, `source_section_id`) captures provenance for review,
      
        106
            metrics, and revert flows. Like the other provenance flags, these
      
        107
            fields do not participate in `section_id`.
      
        108
        
        109
            `media_path` / `media_alt` / `media_blob_sha` are media-section
      
        110
            fields (IMAGE + AUDIO) populated from the fence attributes and
      
        111
            the content-addressed blob store (after ingestion). Non-media
      
        112
            sections leave them at their `None` defaults and they do not
      
        113
            participate in identity; media sections use them as the identity
      
        114
            inputs in place of `content`. `media_alt` is IMAGE-only;
      
        115
            `media_transcript` is AUDIO-only (the audio's text-side
      
        116
            supervision, required for training).
      
        117
            """
      
        118
        
        119
            type: SectionType
      
        120
            content: str
      
        121
            start_line: int = 0
      
        122
            adapter: str | None = None
      
        123
            tags: Mapping[str, str] = field(default_factory=lambda: _EMPTY_TAGS)
      
        124
            auto_harvest: bool = False
      
        125
            harvest_source: str | None = None
      
        126
            auto_mined: bool = False
      
        127
            judge_name: str | None = None
      
        128
            judge_score_chosen: float | None = None
      
        129
            judge_score_rejected: float | None = None
      
        130
            mined_at: str | None = None
      
        131
            mined_run_id: int | None = None
      
        132
            auto_synth: bool = False
      
        133
            synth_teacher: str | None = None
      
        134
            synth_strategy: str | None = None
      
        135
            synth_at: str | None = None
      
        136
            source_section_id: str | None = None
      
        137
            media_path: str | None = None
      
        138
            media_alt: str | None = None
      
        139
            media_blob_sha: str | None = None
      
        140
            media_transcript: str | None = None
      
        141
        
        142
            def __post_init__(self) -> None:
      
        143
                if self.auto_mined:
      
        144
                    if self.type != SectionType.PREFERENCE:
      
        145
                        raise ValueError("auto_mined metadata is only valid on preference sections")
      
        146
                    missing = [
      
        147
                        name
      
        148
                        for name, value in (
      
        149
                            ("judge_name", self.judge_name),
      
        150
                            ("judge_score_chosen", self.judge_score_chosen),
      
        151
                            ("judge_score_rejected", self.judge_score_rejected),
      
        152
                            ("mined_at", self.mined_at),
      
        153
                            ("mined_run_id", self.mined_run_id),
      
        154
                        )
      
        155
                        if value is None
      
        156
                    ]
      
        157
                    if missing:
      
        158
                        raise ValueError(
      
        159
                            f"auto_mined preference sections require metadata fields {missing!r}"
      
        160
                        )
      
        161
                    assert self.judge_score_chosen is not None
      
        162
                    assert self.judge_score_rejected is not None
      
        163
                    if not math.isfinite(self.judge_score_chosen) or not math.isfinite(
      
        164
                        self.judge_score_rejected
      
        165
                    ):
      
        166
                        raise ValueError("judge scores must be finite floats")
      
        167
                    assert self.mined_run_id is not None
      
        168
                    if self.mined_run_id < 1:
      
        169
                        raise ValueError("mined_run_id must be >= 1")
      
        170
        
        171
                if self.auto_synth:
      
        172
                    if self.type != SectionType.INSTRUCTION:
      
        173
                        raise ValueError("auto_synth metadata is only valid on instruction sections")
      
        174
                    missing = [
      
        175
                        name
      
        176
                        for name, value in (
      
        177
                            ("synth_teacher", self.synth_teacher),
      
        178
                            ("synth_strategy", self.synth_strategy),
      
        179
                            ("synth_at", self.synth_at),
      
        180
                            ("source_section_id", self.source_section_id),
      
        181
                        )
      
        182
                        if value is None
      
        183
                    ]
      
        184
                    if missing:
      
        185
                        raise ValueError(
      
        186
                            f"auto_synth instruction sections require metadata fields {missing!r}"
      
        187
                        )
      
        188
                    assert self.synth_teacher is not None
      
        189
                    assert self.synth_strategy is not None
      
        190
                    assert self.synth_at is not None
      
        191
                    assert self.source_section_id is not None
      
        192
                    if not self.synth_teacher:
      
        193
                        raise ValueError("synth_teacher must be non-empty")
      
        194
                    if not self.synth_strategy:
      
        195
                        raise ValueError("synth_strategy must be non-empty")
      
        196
                    if len(self.source_section_id) != _SECTION_ID_BYTES * 2 or any(
      
        197
                        ch not in "0123456789abcdef" for ch in self.source_section_id
      
        198
                    ):
      
        199
                        raise ValueError("source_section_id must be a 16-char lowercase hex section id")
      
        200
        
        201
            @property
      
        202
            def section_id(self) -> str:
      
        203
                """Stable 16-char hex content-hash ID."""
      
        204
                h = hashlib.sha256()
      
        205
                h.update(self.type.value.encode("utf-8"))
      
        206
                h.update(b"\n")
      
        207
                if self.type in (SectionType.IMAGE, SectionType.AUDIO):
      
        208
                    # Media identity: path || blob_sha. Pre-ingest fallback
      
        209
                    # hashes path alone so `dlm show` and parser round-trips
      
        210
                    # work before the trainer writes bytes through the blob
      
        211
                    # store; the trainer always populates `media_blob_sha`
      
        212
                    # before deterministic splits see the ID. Transcript /
      
        213
                    # alt-text do not participate — they're metadata on the
      
        214
                    # section, not part of identity (edit an audio's
      
        215
                    # transcript → same section, new training pair; edit the
      
        216
                    # audio bytes → new section entirely).
      
        217
                    h.update((self.media_path or "").encode("utf-8"))
      
        218
                    if self.media_blob_sha is not None:
      
        219
                        h.update(b"\n")
      
        220
                        h.update(self.media_blob_sha.encode("utf-8"))
      
        221
                else:
      
        222
                    normalized = normalize_for_hashing(self.content)
      
        223
                    h.update(normalized.encode("utf-8"))
      
        224
                return h.hexdigest()[: _SECTION_ID_BYTES * 2]

"""Body sections: PROSE (default), INSTRUCTION (`::instruction::`),
PREFERENCE (`::preference::`), IMAGE (`::image path="..." alt="..."::`),
AUDIO (`::audio path="..." transcript="..."::`).

Text-body sections carry their raw content verbatim plus a stable
`section_id` derived from `sha256(type || "\\n" || normalized_content)[:16]`
where normalization is the same LF+BOM-stripping applied by `dlm.io.text`.

Media sections (IMAGE, AUDIO) reference a binary blob outside the
`.dlm` file; their identity is
`sha256(type || "\\n" || path || "\\n" || blob_sha)[:16]` once the blob
has been ingested into the content-addressed store. The path is part
of the hash because different logical uses of the same bytes
(`hero.png` in section A, `same-bytes.png` in section B) should not
collapse to one training row. Before ingestion, `media_blob_sha` is
`None` and the path alone seeds identity — sufficient for `dlm show`
but not for training.

This means:

- The section ID is stable across Windows/Unix line endings (audit F15).
- A whitespace-only edit inside *another* section does not change this
  section's ID (content-addressing correctness).
- Changing the section type (prose → instruction, image → audio)
  produces a different ID even for identical content (type namespaces
  are disjoint).
- For media sections, a blob-bytes change flips the ID even if the
  path didn't move; a path change flips the ID even if the bytes are
  identical.

AUDIO vs IMAGE: audio sections require `media_transcript` (text-side
supervision); image sections optionally carry a caption in `content`.
The training row for audio ties the transcript to the audio features;
the training row for image uses the image-token placeholder and caption.
"""
36
37	from __future__ import annotations
38
39	import hashlib
40	import math
41	from collections.abc import Mapping
42	from dataclasses import dataclass, field
43	from enum import StrEnum
44	from types import MappingProxyType
45
46	from dlm.io.text import normalize_for_hashing
47
48	_EMPTY_TAGS: Mapping[str, str] = MappingProxyType({})
49
50
51	class SectionType(StrEnum):
52	PROSE = "prose"
53	INSTRUCTION = "instruction"
54	PREFERENCE = "preference"
55	IMAGE = "image"
56	AUDIO = "audio"
57
58
59	_SECTION_ID_BYTES = 8 # 16 hex chars
60
61
62	@dataclass(frozen=True)
63	class Section:
64	"""A single body section.
65
66	`start_line` is the 1-indexed line in the source where the section
67	begins (the fence line for fenced sections, the first prose line for
68	PROSE). Used for error reporting and is not part of the section
69	identity.
70
71	`content` is the raw section body, fence-free. Fence lines are
72	stripped; leading/trailing blank lines around the content are
73	preserved as-is to keep round-trip idempotent after the first pass.
74
75	`adapter` is the optional `#name` routing suffix from a fence like
76	`::instruction#tone::`. `None` means "unrouted" — the section's rows
77	flow to whichever adapter the router picks as default (the first
78	declared, in multi-adapter docs). The field is intentionally not
79	part of `section_id`: moving a section between adapters is a routing
80	change, not a content change, and retention snapshots key off the
81	content hash.
82
83	`tags` is the optional free-form metadata map flowed from
84	`.dlm/training.yaml`. Consumers (weighting, filtering,
85	sway probes) read these; the trainer's row-production path
86	ignores them. Like `adapter`, tags are not part of `section_id`
87	— metadata churn doesn't invalidate replay identity.
88
89	`auto_harvest` marks a section as written back into the `.dlm` by
90	`dlm harvest` — the pull-mode that ingests failing probes from a
91	sway report (schema v7). `harvest_source` records the source run
92	("run_N_sway"-style opaque token) for provenance. Like `tags`,
93	neither field participates in `section_id`.
94
95	`auto_mined` marks a `::preference::` section as synthesized by
96	the preference-mining loop rather than hand-authored. The
97	accompanying judge metadata (`judge_name`, `judge_score_chosen`,
98	`judge_score_rejected`, `mined_at`, `mined_run_id`) captures
99	provenance for review, metrics, and revert flows. Like harvest
100	metadata, these fields do not participate in `section_id`.
101
102	`auto_synth` marks an `::instruction::` section as synthesized by
103	the instruction-generation loop rather than hand-authored. The
104	accompanying metadata (`synth_teacher`, `synth_strategy`,
105	`synth_at`, `source_section_id`) captures provenance for review,
106	metrics, and revert flows. Like the other provenance flags, these
107	fields do not participate in `section_id`.
108
109	`media_path` / `media_alt` / `media_blob_sha` are media-section
110	fields (IMAGE + AUDIO) populated from the fence attributes and
111	the content-addressed blob store (after ingestion). Non-media
112	sections leave them at their `None` defaults and they do not
113	participate in identity; media sections use them as the identity
114	inputs in place of `content`. `media_alt` is IMAGE-only;
115	`media_transcript` is AUDIO-only (the audio's text-side
116	supervision, required for training).
117	"""
118
119	type: SectionType
120	content: str
121	start_line: int = 0
122	adapter: str \| None = None
123	tags: Mapping[str, str] = field(default_factory=lambda: _EMPTY_TAGS)
124	auto_harvest: bool = False
125	harvest_source: str \| None = None
126	auto_mined: bool = False
127	judge_name: str \| None = None
128	judge_score_chosen: float \| None = None
129	judge_score_rejected: float \| None = None
130	mined_at: str \| None = None
131	mined_run_id: int \| None = None
132	auto_synth: bool = False
133	synth_teacher: str \| None = None
134	synth_strategy: str \| None = None
135	synth_at: str \| None = None
136	source_section_id: str \| None = None
137	media_path: str \| None = None
138	media_alt: str \| None = None
139	media_blob_sha: str \| None = None
140	media_transcript: str \| None = None
141
142	def __post_init__(self) -> None:
143	if self.auto_mined:
144	if self.type != SectionType.PREFERENCE:
145	raise ValueError("auto_mined metadata is only valid on preference sections")
146	missing = [
147	name
148	for name, value in (
149	("judge_name", self.judge_name),
150	("judge_score_chosen", self.judge_score_chosen),
151	("judge_score_rejected", self.judge_score_rejected),
152	("mined_at", self.mined_at),
153	("mined_run_id", self.mined_run_id),
154	)
155	if value is None
156	]
157	if missing:
158	raise ValueError(
159	f"auto_mined preference sections require metadata fields {missing!r}"
160	)
161	assert self.judge_score_chosen is not None
162	assert self.judge_score_rejected is not None
163	if not math.isfinite(self.judge_score_chosen) or not math.isfinite(
164	self.judge_score_rejected
165	):
166	raise ValueError("judge scores must be finite floats")
167	assert self.mined_run_id is not None
168	if self.mined_run_id < 1:
169	raise ValueError("mined_run_id must be >= 1")
170
171	if self.auto_synth:
172	if self.type != SectionType.INSTRUCTION:
173	raise ValueError("auto_synth metadata is only valid on instruction sections")
174	missing = [
175	name
176	for name, value in (
177	("synth_teacher", self.synth_teacher),
178	("synth_strategy", self.synth_strategy),
179	("synth_at", self.synth_at),
180	("source_section_id", self.source_section_id),
181	)
182	if value is None
183	]
184	if missing:
185	raise ValueError(
186	f"auto_synth instruction sections require metadata fields {missing!r}"
187	)
188	assert self.synth_teacher is not None
189	assert self.synth_strategy is not None
190	assert self.synth_at is not None
191	assert self.source_section_id is not None
192	if not self.synth_teacher:
193	raise ValueError("synth_teacher must be non-empty")
194	if not self.synth_strategy:
195	raise ValueError("synth_strategy must be non-empty")
196	if len(self.source_section_id) != _SECTION_ID_BYTES * 2 or any(
197	ch not in "0123456789abcdef" for ch in self.source_section_id
198	):
199	raise ValueError("source_section_id must be a 16-char lowercase hex section id")
200
201	@property
202	def section_id(self) -> str:
203	"""Stable 16-char hex content-hash ID."""
204	h = hashlib.sha256()
205	h.update(self.type.value.encode("utf-8"))
206	h.update(b"\n")
207	if self.type in (SectionType.IMAGE, SectionType.AUDIO):
208	# Media identity: path \|\| blob_sha. Pre-ingest fallback
209	# hashes path alone so `dlm show` and parser round-trips
210	# work before the trainer writes bytes through the blob
211	# store; the trainer always populates `media_blob_sha`
212	# before deterministic splits see the ID. Transcript /
213	# alt-text do not participate — they're metadata on the
214	# section, not part of identity (edit an audio's
215	# transcript → same section, new training pair; edit the
216	# audio bytes → new section entirely).
217	h.update((self.media_path or "").encode("utf-8"))
218	if self.media_blob_sha is not None:
219	h.update(b"\n")
220	h.update(self.media_blob_sha.encode("utf-8"))
221	else:
222	normalized = normalize_for_hashing(self.content)
223	h.update(normalized.encode("utf-8"))
224	return h.hexdigest()[: _SECTION_ID_BYTES * 2]