Python · 9838 bytes Raw Blame History
1 """Body sections: PROSE (default), INSTRUCTION (`::instruction::`),
2 PREFERENCE (`::preference::`), IMAGE (`::image path="..." alt="..."::`),
3 AUDIO (`::audio path="..." transcript="..."::`).
4
5 Text-body sections carry their raw content verbatim plus a stable
6 `section_id` derived from `sha256(type || "\\n" || normalized_content)[:16]`
7 where normalization is the same LF+BOM-stripping applied by `dlm.io.text`.
8
9 Media sections (IMAGE, AUDIO) reference a binary blob outside the
10 `.dlm` file; their identity is
11 `sha256(type || "\\n" || path || "\\n" || blob_sha)[:16]` once the blob
12 has been ingested into the content-addressed store. The path is part
13 of the hash because different logical uses of the same bytes
14 (`hero.png` in section A, `same-bytes.png` in section B) should not
15 collapse to one training row. Before ingestion, `media_blob_sha` is
16 `None` and the path alone seeds identity — sufficient for `dlm show`
17 but not for training.
18
19 This means:
20
21 - The section ID is stable across Windows/Unix line endings (audit F15).
22 - A whitespace-only edit inside *another* section does not change this
23 section's ID (content-addressing correctness).
24 - Changing the section type (prose → instruction, image → audio)
25 produces a different ID even for identical content (type namespaces
26 are disjoint).
27 - For media sections, a blob-bytes change flips the ID even if the
28 path didn't move; a path change flips the ID even if the bytes are
29 identical.
30
31 AUDIO vs IMAGE: audio sections require `media_transcript` (text-side
32 supervision); image sections optionally carry a caption in `content`.
33 The training row for audio ties the transcript to the audio features;
34 the training row for image uses the image-token placeholder and caption.
35 """
36
37 from __future__ import annotations
38
39 import hashlib
40 import math
41 from collections.abc import Mapping
42 from dataclasses import dataclass, field
43 from enum import StrEnum
44 from types import MappingProxyType
45
46 from dlm.io.text import normalize_for_hashing
47
# Shared, read-only empty mapping handed out as the default for `Section.tags`.
# Wrapping the dict in MappingProxyType means callers cannot mutate the one
# instance that every untagged section shares.
_EMPTY_TAGS: Mapping[str, str] = MappingProxyType({})
49
50
class SectionType(StrEnum):
    """Kinds of body section a `.dlm` document can contain.

    The member's string value is the first input fed into the
    `Section.section_id` hash, so identical content under two different
    types yields two different IDs.
    """

    # Default kind: text that is not introduced by a fence line.
    PROSE = "prose"
    # Fenced with `::instruction::`.
    INSTRUCTION = "instruction"
    # Fenced with `::preference::`.
    PREFERENCE = "preference"
    # Media section fenced with `::image path="..." alt="..."::`.
    IMAGE = "image"
    # Media section fenced with `::audio path="..." transcript="..."::`.
    AUDIO = "audio"
57
58
# Number of SHA-256 digest bytes kept for a section ID. IDs are rendered as
# hex, so their visible length is twice this value.
_SECTION_ID_BYTES = 8  # 16 hex chars
60
61
@dataclass(frozen=True)
class Section:
    """One body section of a `.dlm` document.

    Identity (`section_id`) is derived from the section type plus either
    the normalized `content` (text sections) or `media_path` /
    `media_blob_sha` (IMAGE and AUDIO sections). Every other field listed
    below is metadata and deliberately stays out of the hash.

    Core fields:

    - `type`: the section kind.
    - `content`: the raw, fence-free body. Fence lines are stripped, but
      leading/trailing blank lines around the content are kept as-is so
      round-trips are idempotent after the first pass.
    - `start_line`: 1-indexed source line where the section begins (the
      fence line for fenced sections, the first prose line for PROSE).
      Used for error reporting only.

    Routing and free-form metadata:

    - `adapter`: optional `#name` routing suffix from a fence such as
      `::instruction#tone::`. `None` means "unrouted": rows flow to
      whichever adapter the router picks as default (the first declared,
      in multi-adapter docs). Moving a section between adapters is a
      routing change, not a content change, and retention snapshots key
      off the content hash.
    - `tags`: optional free-form map flowed from `.dlm/training.yaml`.
      Consumers (weighting, filtering, sway probes) read it; the
      trainer's row-production path ignores it.

    Harvest provenance:

    - `auto_harvest`: the section was written back into the `.dlm` by
      `dlm harvest`, the pull-mode that ingests failing probes from a
      sway report (schema v7).
    - `harvest_source`: opaque "run_N_sway"-style token naming the
      source run.

    Preference-mining provenance (`::preference::` sections only):

    - `auto_mined`: the section was synthesized by the preference-mining
      loop rather than hand-authored.
    - `judge_name`, `judge_score_chosen`, `judge_score_rejected`,
      `mined_at`, `mined_run_id`: judge metadata kept for review,
      metrics, and revert flows. All are required when `auto_mined` is
      set.

    Instruction-synthesis provenance (`::instruction::` sections only):

    - `auto_synth`: the section was synthesized by the
      instruction-generation loop rather than hand-authored.
    - `synth_teacher`, `synth_strategy`, `synth_at`,
      `source_section_id`: provenance kept for review, metrics, and
      revert flows. All are required when `auto_synth` is set.

    Media fields (IMAGE and AUDIO sections):

    - `media_path`, `media_blob_sha`: populated from the fence attributes
      and, after ingestion, from the content-addressed blob store. They
      replace `content` as the identity inputs for media sections.
      Non-media sections leave them at `None`.
    - `media_alt`: IMAGE-only.
    - `media_transcript`: AUDIO-only; the audio's text-side supervision,
      required for training.
    """

    # --- core -----------------------------------------------------------
    type: SectionType
    content: str
    start_line: int = 0
    # --- routing / free-form metadata -------------------------------------
    adapter: str | None = None
    tags: Mapping[str, str] = field(default_factory=lambda: _EMPTY_TAGS)
    # --- harvest provenance -----------------------------------------------
    auto_harvest: bool = False
    harvest_source: str | None = None
    # --- preference-mining provenance -------------------------------------
    auto_mined: bool = False
    judge_name: str | None = None
    judge_score_chosen: float | None = None
    judge_score_rejected: float | None = None
    mined_at: str | None = None
    mined_run_id: int | None = None
    # --- instruction-synthesis provenance ---------------------------------
    auto_synth: bool = False
    synth_teacher: str | None = None
    synth_strategy: str | None = None
    synth_at: str | None = None
    source_section_id: str | None = None
    # --- media (IMAGE / AUDIO) --------------------------------------------
    media_path: str | None = None
    media_alt: str | None = None
    media_blob_sha: str | None = None
    media_transcript: str | None = None

    def __post_init__(self) -> None:
        """Validate provenance metadata once the instance is built.

        Mining metadata is checked before synthesis metadata, so a section
        that is wrong on both counts reports the mining problem.

        Raises:
            ValueError: a provenance flag is set on the wrong section type,
                a required companion field is `None`, or a companion value
                is malformed.
        """
        if self.auto_mined:
            self._validate_mined_metadata()
        if self.auto_synth:
            self._validate_synth_metadata()

    def _unset_fields(self, *names: str) -> list[str]:
        """Return those of `names` whose attribute is `None`, in the given order."""
        return [name for name in names if getattr(self, name) is None]

    def _validate_mined_metadata(self) -> None:
        """Check the fields that must accompany `auto_mined=True`."""
        if self.type != SectionType.PREFERENCE:
            raise ValueError("auto_mined metadata is only valid on preference sections")

        absent = self._unset_fields(
            "judge_name",
            "judge_score_chosen",
            "judge_score_rejected",
            "mined_at",
            "mined_run_id",
        )
        if absent:
            raise ValueError(
                f"auto_mined preference sections require metadata fields {absent!r}"
            )

        # The asserts below cannot fire (the check above already rejected
        # `None`); they exist so static type checkers narrow the optionals.
        assert self.judge_score_chosen is not None
        assert self.judge_score_rejected is not None
        scores_are_finite = math.isfinite(self.judge_score_chosen) and math.isfinite(
            self.judge_score_rejected
        )
        if not scores_are_finite:
            raise ValueError("judge scores must be finite floats")

        assert self.mined_run_id is not None
        if self.mined_run_id < 1:
            raise ValueError("mined_run_id must be >= 1")

    def _validate_synth_metadata(self) -> None:
        """Check the fields that must accompany `auto_synth=True`."""
        if self.type != SectionType.INSTRUCTION:
            raise ValueError("auto_synth metadata is only valid on instruction sections")

        absent = self._unset_fields(
            "synth_teacher",
            "synth_strategy",
            "synth_at",
            "source_section_id",
        )
        if absent:
            raise ValueError(
                f"auto_synth instruction sections require metadata fields {absent!r}"
            )

        # Type-narrowing only; `None` was ruled out just above.
        assert self.synth_teacher is not None
        assert self.synth_strategy is not None
        assert self.synth_at is not None
        assert self.source_section_id is not None

        if not self.synth_teacher:
            raise ValueError("synth_teacher must be non-empty")
        if not self.synth_strategy:
            raise ValueError("synth_strategy must be non-empty")

        # The referenced ID must look like something `section_id` could
        # have produced: exactly the truncated hex length, lowercase only.
        source_id = self.source_section_id
        looks_like_section_id = len(source_id) == _SECTION_ID_BYTES * 2 and all(
            ch in "0123456789abcdef" for ch in source_id
        )
        if not looks_like_section_id:
            raise ValueError("source_section_id must be a 16-char lowercase hex section id")

    @property
    def section_id(self) -> str:
        """Stable 16-char hex content-hash ID.

        The hash input is a sequence of UTF-8 encoded parts separated by a
        single newline byte: the section type value first, then the
        type-specific identity parts.
        """
        parts = [self.type.value.encode("utf-8")]
        if self.type in (SectionType.IMAGE, SectionType.AUDIO):
            # Media identity is path + blob sha. Before ingestion there is
            # no blob sha yet, so the path alone is hashed; that keeps
            # `dlm show` and parser round-trips working, and the trainer
            # always fills in `media_blob_sha` before deterministic splits
            # look at the ID. Transcript and alt-text are metadata, not
            # identity: editing an audio's transcript keeps the section
            # (new training pair), editing the audio bytes makes a new one.
            parts.append((self.media_path or "").encode("utf-8"))
            if self.media_blob_sha is not None:
                parts.append(self.media_blob_sha.encode("utf-8"))
        else:
            # Text identity is the normalized body.
            parts.append(normalize_for_hashing(self.content).encode("utf-8"))
        digest = hashlib.sha256(b"\n".join(parts)).hexdigest()
        return digest[: _SECTION_ID_BYTES * 2]