| 1 |
"""Resolve `training.sources` directives into synthesized Sections. |
| 2 |
|
| 3 |
Entry point: `expand_sources(parsed, base_path)`. Walks every |
| 4 |
`SourceDirective` on the parsed frontmatter, reads matching files |
| 5 |
through `dlm.io.text.read_text` (so UTF-8 strict + BOM + CRLF |
| 6 |
hygiene are identical to in-body sections), and returns a tuple of |
| 7 |
`Section(SectionType.PROSE, ...)` ready to be concatenated with |
| 8 |
`parsed.sections` before `build_dataset`. |
| 9 |
|
| 10 |
Discovery + merge: when a directive points at a directory, |
| 11 |
`discover_configs` finds every `.dlm/training.yaml` + `.dlm/ignore` |
| 12 |
inside the tree; for each candidate file, `effective_config_for` |
| 13 |
resolves the merged include/exclude verdict and the metadata tags to |
| 14 |
flow onto the synthesized Section. |
| 15 |
|
| 16 |
Content-hash collision defense: every synthesized Section's content |
| 17 |
is prefixed with a canonical `# source: <relpath>\\n\\n` header. Two |
| 18 |
different files with identical bodies therefore produce distinct |
| 19 |
`section_id`s — the path becomes part of identity, matching the |
| 20 |
file-granular semantics users expect for tree ingestion. Tags are |
| 21 |
not part of `section_id`, so metadata churn doesn't invalidate the |
| 22 |
replay corpus. |
| 23 |
|
| 24 |
Per-source provenance is returned alongside the sections so |
| 25 |
`TrainingRunSummary.source_directives` can record what was ingested, |
| 26 |
what was skipped, and why. |
| 27 |
""" |
| 28 |
|
| 29 |
from __future__ import annotations |
| 30 |
|
| 31 |
import logging |
| 32 |
from collections.abc import Iterator |
| 33 |
from dataclasses import dataclass |
| 34 |
from pathlib import Path |
| 35 |
from typing import Final |
| 36 |
|
| 37 |
from dlm.directives.discovery import DiscoveredConfig, discover_configs |
| 38 |
from dlm.directives.errors import DirectivePathError |
| 39 |
from dlm.directives.merge import effective_config_for |
| 40 |
from dlm.directives.safety import confine_path, is_probably_binary |
| 41 |
from dlm.doc.parser import ParsedDlm |
| 42 |
from dlm.doc.schema import SourceDirective |
| 43 |
from dlm.doc.sections import Section, SectionType |
| 44 |
from dlm.io.text import DlmEncodingError, read_text |
| 45 |
from dlm.store.blobs import BlobStore |
| 46 |
|
| 47 |
# Module-level logger; all skip/truncation decisions are logged here so
# `-v` runs show exactly why a file was dropped.
_LOG = logging.getLogger(__name__)

# File extensions dispatched to the blob store as IMAGE sections.
# Kept lowercase; comparison lowers the observed suffix.
_IMAGE_EXTENSIONS: Final[frozenset[str]] = frozenset(
    {".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff"}
)

# File extensions dispatched to the blob store as AUDIO sections.
# `.mp3` / `.m4a` deferred — soundfile can't decode them without
# libsndfile MP3 support, which isn't in our runtime dep tree.
# Users with mp3 corpora re-encode to wav/flac first.
_AUDIO_EXTENSIONS: Final[frozenset[str]] = frozenset({".wav", ".flac", ".ogg"})

# Sidecar transcript filename suffix: `clips/hello.wav` pairs with
# `clips/hello.txt`. A missing sidecar is a per-file skip (counted in
# `skipped_audio_no_transcript`), not a hard raise — audio without text
# supervision has no training signal, but a mixed corpus may hold
# "reference" audio without transcripts alongside trainable clips.
_TRANSCRIPT_SIDECAR_SUFFIX: Final[str] = ".txt"
| 67 |
|
| 68 |
|
| 69 |
@dataclass(frozen=True)
class SourceProvenance:
    """Per-directive bookkeeping for the training summary.

    `path` is the directive's raw path (user-facing, before ~ or
    symlink resolution) so it matches the frontmatter on disk.
    `file_count` / `total_bytes` reflect text sections that made it
    into the Section list. `image_count` / `image_bytes` /
    `audio_count` / `audio_bytes` reflect media sections ingested
    through the blob store. `skipped_binary` / `skipped_encoding` /
    `skipped_over_size` / `skipped_by_descent` break down the drops —
    if a directive yields zero sections, the skip counts let the user
    see why. `skipped_image_no_store` / `skipped_audio_no_store`
    count media hits dropped because no BlobStore was passed to
    `expand_sources` (tests, `dlm show`). `skipped_audio_no_transcript`
    counts audio files missing their `<stem>.txt` sidecar.
    """

    # Raw directive path exactly as written in the frontmatter.
    path: str
    # Text (non-media) sections synthesized for this directive.
    file_count: int
    # Sum of raw byte lengths of ingested text files (pre-header).
    total_bytes: int
    # Media sections ingested through the blob store, per modality.
    image_count: int = 0
    image_bytes: int = 0
    audio_count: int = 0
    audio_bytes: int = 0
    # Drop counters — one per skip reason in `_expand_one`.
    skipped_binary: int = 0
    skipped_encoding: int = 0
    skipped_over_size: int = 0
    skipped_by_descent: int = 0
    skipped_image_no_store: int = 0
    skipped_audio_no_store: int = 0
    skipped_audio_no_transcript: int = 0
| 101 |
|
| 102 |
|
| 103 |
@dataclass(frozen=True)
class ExpandResult:
    """Return value of `expand_sources` — sections + provenance."""

    # Synthesized sections, ready to concatenate with `parsed.sections`.
    sections: tuple[Section, ...]
    # One SourceProvenance per directive, in directive order.
    provenance: tuple[SourceProvenance, ...]
    # Every `.dlm/` config found while walking directory directives.
    discovered: tuple[DiscoveredConfig, ...] = ()
| 110 |
|
| 111 |
|
| 112 |
def expand_sources(
    parsed: ParsedDlm,
    *,
    base_path: Path,
    blob_store: BlobStore | None = None,
) -> ExpandResult:
    """Walk every `training.sources` directive and synthesize sections.

    `base_path` is the `.dlm` file's parent directory — the anchor for
    relative paths and the strict-policy confinement root. It is taken
    explicitly (rather than from `parsed.source_path`) because unit
    tests commonly synthesize `ParsedDlm` with `source_path=None`.

    For a `.dlm` living under a `.dlm/` metadata directory (the
    scaffolded `<corpus>/.dlm/corpus.dlm` shape), the user's intended
    anchor is the corpus directory, so the grandparent becomes the
    effective base for resolution and confinement.

    `blob_store` receives ingested media bytes. When `None`, media-
    extension files are counted in the `skipped_*_no_store` counters
    and no media section is emitted — the right shape for read-only
    paths like `dlm show`.

    Returns an `ExpandResult` with empty tuples when the frontmatter
    has no directives, so callers never need a None check.

    Raises `DirectivePathError` when a directive's resolved path does
    not exist.
    """
    training = parsed.frontmatter.training
    if not (training.sources or ()):
        # No directives: hand back empty tuples for unconditional concat.
        return ExpandResult(sections=(), provenance=(), discovered=())

    # Scaffolded `.dlm/` metadata dir → anchor on the corpus (grandparent).
    anchor = base_path.parent if base_path.name == ".dlm" else base_path
    strict_mode = training.sources_policy == "strict"

    out_sections: list[Section] = []
    out_provenance: list[SourceProvenance] = []
    out_discovered: list[DiscoveredConfig] = []

    for directive in training.sources or ():
        candidate = Path(directive.path)
        # Relative, non-tilde paths anchor on the effective base (the
        # corpus dir for scaffolded .dlms, the .dlm's parent otherwise).
        if not candidate.is_absolute() and not directive.path.startswith("~"):
            candidate = anchor / candidate

        root = confine_path(candidate, anchor, strict=strict_mode)
        if not root.exists():
            raise DirectivePathError(root, "path does not exist")

        # Only directory directives have a tree to discover configs in;
        # a single-file directive has nothing to descend into.
        found = discover_configs(root) if root.is_dir() else ()
        out_discovered.extend(found)

        expanded, prov = _expand_one(
            directive=directive,
            resolved_root=root,
            discovered=found,
            blob_store=blob_store,
        )
        out_sections.extend(expanded)
        out_provenance.append(prov)

    return ExpandResult(
        sections=tuple(out_sections),
        provenance=tuple(out_provenance),
        discovered=tuple(out_discovered),
    )
| 182 |
|
| 183 |
|
| 184 |
def _expand_one(
    *,
    directive: SourceDirective,
    resolved_root: Path,
    discovered: tuple[DiscoveredConfig, ...],
    blob_store: BlobStore | None,
) -> tuple[list[Section], SourceProvenance]:
    """Expand a single directive into sections + per-directive provenance.

    Walks `_iter_candidates(resolved_root)` in deterministic order and,
    per candidate file: enforces `max_files` (break) and
    `max_bytes_per_file` (skip), resolves the merged include/exclude
    verdict via `effective_config_for`, then dispatches by suffix —
    image/audio extensions go through `blob_store` (or are counted as
    `skipped_*_no_store` when it is None); everything else is read as
    UTF-8 text and synthesized into a PROSE section whose content is
    prefixed with a `# source: <relpath>` header.

    Returns the section list plus a `SourceProvenance` whose counters
    break down what was ingested and what was skipped, and why.
    """
    sections: list[Section] = []
    total_bytes = 0
    image_count = 0
    image_bytes = 0
    audio_count = 0
    audio_bytes = 0
    skipped_binary = 0
    skipped_encoding = 0
    skipped_over_size = 0
    skipped_by_descent = 0
    skipped_image_no_store = 0
    skipped_audio_no_store = 0
    skipped_audio_no_transcript = 0

    # Anchor for relpath-in-header. For a single-file directive the
    # header uses the file name; for a directory the relpath is the
    # tree-relative form.
    header_root = resolved_root if resolved_root.is_dir() else resolved_root.parent

    for file_path in _iter_candidates(resolved_root):
        # max_files caps total ingested sections (all modalities);
        # checked first so truncation is deterministic over the
        # sorted candidate order.
        if directive.max_files is not None and _section_cap_reached(sections, directive.max_files):
            _LOG.info(
                "directive: hit max_files=%d for %s; truncating deterministically",
                directive.max_files,
                directive.path,
            )
            break

        # Descent verdict: merge parent directive + discovered `.dlm/`
        # configs. `included=False` means a deeper config or default-
        # exclude ruled this file out.
        effective = effective_config_for(
            file_path,
            source_root=resolved_root,
            discovered=discovered,
            parent_directive=directive,
            is_dir=False,
        )
        if not effective.included:
            skipped_by_descent += 1
            continue

        # stat() failure (permission change, dangling symlink, race) is
        # a soft skip — one unreadable file shouldn't abort the tree.
        try:
            size = file_path.stat().st_size
        except OSError as exc:
            _LOG.warning("directive: stat failed for %s: %s; skipping", file_path, exc)
            continue

        if directive.max_bytes_per_file is not None and size > directive.max_bytes_per_file:
            _LOG.info(
                "directive: %s (%d bytes) exceeds max_bytes_per_file=%d; skipping",
                file_path,
                size,
                directive.max_bytes_per_file,
            )
            skipped_over_size += 1
            continue

        # Image-extension dispatch: skips the text-read + binary-skip
        # path entirely. The blob store owns ingestion; the synthesized
        # section carries only the path + blob sha.
        if file_path.suffix.lower() in _IMAGE_EXTENSIONS:
            if blob_store is None:
                _LOG.info(
                    "directive: %s is an image but no blob_store supplied; skipping",
                    file_path,
                )
                skipped_image_no_store += 1
                continue
            handle = blob_store.put(file_path)
            relpath = file_path.relative_to(header_root).as_posix()
            # Alt text defaults to the file stem — the best signal
            # available without reading the image.
            alt = file_path.stem
            sections.append(
                Section(
                    type=SectionType.IMAGE,
                    content="",
                    media_path=relpath,
                    media_alt=alt,
                    media_blob_sha=handle.sha,
                    tags=effective.tags,
                ),
            )
            image_count += 1
            image_bytes += handle.size
            continue

        # Audio-extension dispatch: same shape as image but requires
        # a `<stem>.txt` sidecar for the transcript —
        # audio without text has no training signal. Missing sidecar
        # is a skip (with an explicit counter), not a hard raise,
        # because a mixed corpus may have both "for-training" audio
        # (has .txt) and "reference" audio (no .txt) side by side.
        if file_path.suffix.lower() in _AUDIO_EXTENSIONS:
            if blob_store is None:
                _LOG.info(
                    "directive: %s is audio but no blob_store supplied; skipping",
                    file_path,
                )
                skipped_audio_no_store += 1
                continue
            transcript = _read_audio_transcript(file_path)
            if transcript is None:
                _LOG.info(
                    "directive: %s has no %s sidecar; skipping "
                    "(audio without transcript has no training signal)",
                    file_path,
                    _TRANSCRIPT_SIDECAR_SUFFIX,
                )
                skipped_audio_no_transcript += 1
                continue
            handle = blob_store.put(file_path)
            relpath = file_path.relative_to(header_root).as_posix()
            sections.append(
                Section(
                    type=SectionType.AUDIO,
                    content="",
                    media_path=relpath,
                    media_blob_sha=handle.sha,
                    media_transcript=transcript,
                    tags=effective.tags,
                ),
            )
            audio_count += 1
            audio_bytes += handle.size
            continue

        # Text path: raw bytes first (for the binary sniff + byte
        # count), then the strict UTF-8 decode through read_text.
        try:
            raw = file_path.read_bytes()
        except OSError as exc:
            _LOG.warning("directive: read failed for %s: %s; skipping", file_path, exc)
            continue

        if is_probably_binary(raw):
            _LOG.info("directive: %s looks binary (NUL in first KiB); skipping", file_path)
            skipped_binary += 1
            continue

        try:
            text = read_text(file_path)
        except DlmEncodingError:
            _LOG.warning("directive: %s is not UTF-8; skipping", file_path)
            skipped_encoding += 1
            continue

        # The `# source:` header makes the path part of section_id —
        # see the module docstring's collision-defense note.
        relpath = file_path.relative_to(header_root).as_posix()
        content = f"# source: {relpath}\n\n{text}"
        sections.append(Section(type=SectionType.PROSE, content=content, tags=effective.tags))
        total_bytes += len(raw)

    # Count prose-bearing sections separately from media so the
    # per-modality counters don't collide. PROSE + INSTRUCTION +
    # PREFERENCE → file_count; IMAGE → image_count; AUDIO →
    # audio_count.
    text_sections = sum(1 for s in sections if s.type not in (SectionType.IMAGE, SectionType.AUDIO))
    return sections, SourceProvenance(
        path=directive.path,
        file_count=text_sections,
        total_bytes=total_bytes,
        image_count=image_count,
        image_bytes=image_bytes,
        audio_count=audio_count,
        audio_bytes=audio_bytes,
        skipped_binary=skipped_binary,
        skipped_encoding=skipped_encoding,
        skipped_over_size=skipped_over_size,
        skipped_by_descent=skipped_by_descent,
        skipped_image_no_store=skipped_image_no_store,
        skipped_audio_no_store=skipped_audio_no_store,
        skipped_audio_no_transcript=skipped_audio_no_transcript,
    )
| 362 |
|
| 363 |
|
| 364 |
def _read_audio_transcript(audio_path: Path) -> str | None:
    """Return the stripped sibling `<stem>.txt` transcript, or None.

    The sidecar lives next to the audio file as
    `<stem><_TRANSCRIPT_SIDECAR_SUFFIX>` — `clips/hello.wav` pairs
    with `clips/hello.txt`. The text goes through
    `dlm.io.text.read_text` so it honors the same encoding contract
    (strict UTF-8) as every other ingested text, and is stripped of
    leading/trailing whitespace.

    None means "no usable sidecar" (missing or unreadable); an empty
    string means the sidecar exists but is blank. The caller treats
    those differently: the former is a per-file skip, the latter is an
    author bug to surface loudly at train time, since an empty
    transcript has no training signal either.
    """
    sidecar_path = audio_path.with_suffix(_TRANSCRIPT_SIDECAR_SUFFIX)
    if sidecar_path.exists():
        try:
            raw = read_text(sidecar_path)
        except (OSError, DlmEncodingError) as exc:
            _LOG.warning(
                "directive: transcript sidecar %s is unreadable: %s; skipping audio",
                sidecar_path,
                exc,
            )
        else:
            return raw.strip()
    return None
| 393 |
|
| 394 |
|
| 395 |
def _section_cap_reached(sections: list[Section], max_files: int) -> bool: |
| 396 |
"""True when the cap has been hit for the user's intent. |
| 397 |
|
| 398 |
`max_files` caps total ingested files (text + image) — the user |
| 399 |
wrote one number; both modalities count against it. |
| 400 |
""" |
| 401 |
return len(sections) >= max_files |
| 402 |
|
| 403 |
|
| 404 |
def _iter_candidates(root: Path) -> Iterator[Path]: |
| 405 |
"""Yield every file under `root` in deterministic order. |
| 406 |
|
| 407 |
Filtering moves into `effective_config_for` now that descent |
| 408 |
protocol layers rules beyond the parent directive's include / |
| 409 |
exclude. Single-file roots are yielded as the single file. |
| 410 |
""" |
| 411 |
if root.is_file(): |
| 412 |
yield root |
| 413 |
return |
| 414 |
if not root.is_dir(): |
| 415 |
return |
| 416 |
for candidate in sorted(root.rglob("*")): |
| 417 |
if candidate.is_file(): |
| 418 |
yield candidate |