1 """Pydantic models for `.dlm` frontmatter.
2
3 Every model is `extra="forbid"` and `frozen=True` — strict validation and
4 immutable values. Default values must match the shape produced by
5 `tests/fixtures/dlm_factory.py` so round-trips are stable.
6
7 Versioned schema dispatch lives in `dlm.doc.versioned`; this module
8 defines the current frontmatter shape.
9 """
10
11 from __future__ import annotations
12
13 import re
14 from typing import Final, Literal
15
16 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
17
18 # Crockford base32 alphabet used by ULID: 0-9, A-Z minus I L O U.
19 _ULID_RE: Final[re.Pattern[str]] = re.compile(r"^[0-9A-HJ-KM-NP-TV-Z]{26}$")
20
21 # Adapter names: lowercase alpha start, alphanumeric + underscore tail.
22 # Keeps store paths safe (adapter/<name>/versions/) and log lines readable.
23 _ADAPTER_NAME_RE: Final[re.Pattern[str]] = re.compile(r"^[a-z][a-z0-9_]{0,31}$")
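# Illustrative only (the names below are hypothetical, not taken from any
# shipped document): what the adapter-name pattern accepts and rejects.
#
#   _ADAPTER_NAME_RE.fullmatch("style_guide")   # matches
#   _ADAPTER_NAME_RE.fullmatch("Style-Guide")   # None: uppercase and hyphen
#   _ADAPTER_NAME_RE.fullmatch("2fast")         # None: must start with a letter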

CURRENT_SCHEMA_VERSION: Final[int] = 15
"""Schema version this parser implements.

New fields bump the version and register a migrator in the same
commit — enforced by `test_all_versions_have_migrator_up_to_latest`.

- v2 renamed `training.dpo` → `training.preference` to accommodate both
  DPO and ORPO under one `method`-switched config.
- v3 added the additive `training.cpt` block (DAPT schedule + embedding
  warm-up) for continued-pretraining refinements.
- v4 added the additive `training.adapters` map for named multi-adapter
  composition; flat `adapter`/`lora_*` keys remain the single-adapter
  shorthand.
- v5 added the additive `training.precision` override (opt-in fp16/bf16
  on MPS after the NaN-adapter bug).
- v6 adds the additive `training.sources` block — declarative file-tree
  directives that synthesize PROSE sections at train time, letting a
  `.dlm` act as a training plan over content stored elsewhere on disk.
- v10 introduces `SectionType.IMAGE` + the `::image path="..." alt="..."::`
  fence grammar for multi-modal training; the body schema is strictly
  additive and the fence extension is backward-compatible (images are
  parsed via a separate attribute grammar rather than changing the
  existing `::type#name::` form).
- v11 adds `SectionType.AUDIO` with the parallel
  `::audio path="..." transcript="..."::` fence — the transcript becomes
  the text-side supervision (unlike the optional image caption it is
  effectively required: audio without a transcript has no training
  signal).
- v12 adds the additive `training.audio` block (currently one field,
  `auto_resample: bool`) — opt-in automatic resampling when audio files
  don't match the base's pinned rate. Default False preserves the
  "refuse on SR mismatch" contract.
- v13 is an identity bump paired with the 2026 base-model registry
  refresh: the document frontmatter shape is unchanged, but the
  migration chain still advances so tooling can distinguish
  post-refresh docs from older ones.
- v14 adds additive auto-mined preference metadata on `::preference::`
  sections; the frontmatter shape remains unchanged, but the schema
  still advances so migration-aware tooling can tell pre-mining docs
  from ones that may carry mined-preference markers in the body.
- v15 adds additive auto-synth instruction metadata on `::instruction::`
  sections; the frontmatter shape remains unchanged, but the schema
  still advances so migration-aware tooling can tell pre-synth docs
  from ones that may carry synthesized-instruction markers in the body.
"""


class PreferenceHyperparams(BaseModel):
    """Hyperparameters shared across preference methods.

    Some fields are method-specific (`beta` for DPO, `alpha` for
    ORPO); the trainer reads whichever applies. Keeping both on one
    flat block simplifies migration and lets users switch methods
    without reshaping their document.
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    beta: float = Field(0.1, ge=0.0, le=1.0)
    alpha: float = Field(0.1, ge=0.0, le=1.0)
    learning_rate: float = Field(5e-6, gt=0.0)
    num_epochs: int = Field(1, ge=1)


class PreferenceConfig(BaseModel):
    """Preference-phase knobs (DPO or ORPO). Additive to `TrainingConfig`;
    default disabled. `enabled` flips to `True` automatically when the
    document contains `::preference::` sections unless the user has
    explicitly set it to `False` — the phase orchestrator reads that
    signal."""

    model_config = ConfigDict(extra="forbid", frozen=True)

    enabled: bool = False
    method: Literal["dpo", "orpo"] = "dpo"
    hyperparams: PreferenceHyperparams = Field(default_factory=lambda: PreferenceHyperparams())
    # DPO-only fields — ignored for ORPO but kept on the config so a
    # user switching methods doesn't have to delete them.
    loss_type: Literal["sigmoid", "hinge", "ipo"] = "sigmoid"
    reference: Literal["base", "pre_adapter"] = "pre_adapter"


def _default_preference() -> PreferenceConfig:
    return PreferenceConfig()
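# Illustrative sketch of the method switch described above (made-up values):
# moving a document from DPO to ORPO is a `method` flip plus, if desired, an
# `alpha` override; `beta` and the DPO-only `loss_type`/`reference` fields can
# stay at their defaults and are simply not read on the ORPO path.
#
#   orpo = PreferenceConfig(
#       enabled=True,
#       method="orpo",
#       hyperparams=PreferenceHyperparams(alpha=0.2),
#   )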


class CptConfig(BaseModel):
    """Continued-pretraining refinements.

    `schedule="auto"` lets the trainer pick: `dapt` when CPT rows
    dominate (>70% of training rows), otherwise the SFT default. A
    user who wants the DAPT curve regardless of the row mix pins
    `schedule="dapt"`; `schedule="sft"` opts out entirely.

    `embed_warmup_steps>0` unfreezes `embed_tokens` + `lm_head` for
    the first N steps and adds them to `modules_to_save`, which
    inflates adapter size by `vocab_size * hidden_dim`. The trainer
    warns loudly when this is enabled.
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    schedule: Literal["auto", "dapt", "sft"] = "auto"
    embed_warmup_steps: int = Field(0, ge=0)


def _default_cpt() -> CptConfig:
    return CptConfig()
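# A minimal sketch of the `schedule="auto"` rule documented on `CptConfig`
# (hypothetical helper; the real resolution lives in the trainer, not in this
# module): pinned values win, otherwise DAPT kicks in once CPT rows cross the
# 70% threshold described above.
#
#   def _resolve_schedule(cfg: CptConfig, cpt_rows: int, total_rows: int) -> str:
#       if cfg.schedule != "auto":
#           return cfg.schedule
#       return "dapt" if total_rows and cpt_rows / total_rows > 0.70 else "sft"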


class GateConfig(BaseModel):
    """Learned MoE-style adapter gate.

    When `enabled`, a small MLP trained post-SFT routes each prompt to
    a weighted combination of the document's named adapters. Applied
    uniformly across adapter layers (per-layer routing is the research
    follow-up).

    `cold_start_floor` is the minimum number of supervising sections
    per adapter below which gate training is skipped and inference
    defaults to uniform weights — small corpora overfit a tiny router.
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    enabled: bool = False
    hidden_proj_dim: int = Field(64, ge=8, le=2048)
    steps: int = Field(200, ge=1, le=10000)
    lr: float = Field(3e-4, gt=0.0, le=1.0)
    cold_start_floor: int = Field(4, ge=1, le=1024)
    # Entropy-regularization weight on the gate loss. Higher values
    # discourage mode collapse (one adapter takes all the weight);
    # lower values let the gate commit harder when data justifies it.
    entropy_lambda: float = Field(0.01, ge=0.0, le=1.0)


def _default_gate() -> GateConfig:
    return GateConfig()
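# Sketch of the cold-start rule from `GateConfig` (hypothetical helper; the
# real check lives with the gate trainer): when any named adapter has fewer
# supervising sections than the floor, gate training is skipped and inference
# falls back to uniform weights.
#
#   def _gate_should_train(cfg: GateConfig, sections_per_adapter: dict[str, int]) -> bool:
#       return cfg.enabled and all(
#           count >= cfg.cold_start_floor for count in sections_per_adapter.values()
#       )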


class CacheConfig(BaseModel):
    """Tokenized-section cache tuning.

    The cache lives at `~/.dlm/store/<dlm_id>/tokenized-cache/` and
    trades disk for tokenization wall-clock on directive-sourced runs.
    Defaults cover the typical case: cache on, 10 GiB cap, 90-day
    retention. Per-document overrides here let authors tune for their
    corpus size.

    All fields are independent — no cross-field validation. The three
    knobs map to three distinct operator concerns:

    - ``enabled`` is the off-switch.
    - ``max_bytes`` is the disk ceiling.
    - ``prune_older_than_days`` is the retention window.
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    enabled: bool = True
    # Default: 10 GiB (10 * 1024^3). Per-document cap that supersedes
    # the cache module's built-in default when the trainer opens the
    # cache. Lower for small personal corpora, higher for 50K+ file
    # codebases.
    max_bytes: int = Field(10 * 1024 * 1024 * 1024, ge=1)
    # Default cutoff for `dlm cache prune` when the user doesn't pass
    # `--older-than`. Overridable by the CLI flag on a per-command
    # basis.
    prune_older_than_days: int = Field(90, ge=1)


def _default_cache() -> CacheConfig:
    return CacheConfig()
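# Illustrative per-document override (made-up numbers): a small personal
# corpus that wants a tighter disk ceiling and a shorter retention window
# than the defaults above.
#
#   CacheConfig(max_bytes=1 * 1024**3, prune_older_than_days=30)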


class AudioConfig(BaseModel):
    """Audio-pipeline knobs for audio-language training (v12).

    Only meaningful when the base is an audio-language model (the
    trainer reads this block only on that branch). Default leaves
    behavior identical to v11: ``auto_resample=False`` preserves the
    "refuse on SR mismatch" contract, so adding a no-audio or
    non-audio document to v12 never changes training output.
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    # When True, audio files at a native sample rate different from the
    # base's pinned rate (e.g. 48 kHz clip with Qwen2-Audio's 16 kHz
    # pin) are resampled on the fly via `dlm.data.audio_resample`
    # instead of raising. Resampled waveforms cache separately from
    # native-rate ones (cache key carries the flag), so toggling
    # auto_resample on an existing corpus doesn't serve stale entries.
    auto_resample: bool = False


def _default_audio() -> AudioConfig:
    return AudioConfig()
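# Sketch of the sample-rate contract described on `AudioConfig` (hypothetical
# helper; the real decision lives in the audio data pipeline): a mismatch
# either resamples to the base's pinned rate or refuses, depending on the flag.
#
#   def _needs_resample(native_sr: int, pinned_sr: int, cfg: AudioConfig) -> bool:
#       if native_sr == pinned_sr:
#           return False
#       if not cfg.auto_resample:
#           raise ValueError("sample-rate mismatch and training.audio.auto_resample is false")
#       return True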


class AdapterConfig(BaseModel):
    """One named adapter in a multi-adapter document.

    A subset of the flat config — only the per-adapter LoRA knobs plus
    `learning_rate`. Hyperparameters that are intrinsically run-scoped
    (`sequence_len`, `num_epochs`, `seed`, `optimizer`, `lr_scheduler`,
    `warmup_ratio`, `micro_batch_size`, `grad_accum`) stay at the
    `TrainingConfig` top level because mixing them per-adapter makes
    schedules and batching incoherent.
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    adapter: Literal["lora", "qlora", "dora"] = "lora"
    lora_r: int = Field(8, ge=1, le=256)
    lora_alpha: int = Field(16, ge=1)
    lora_dropout: float = Field(0.05, ge=0.0, le=0.5)
    target_modules: Literal["auto"] | list[str] = "auto"
    learning_rate: float = Field(2e-4, gt=0.0)


class SourceDirective(BaseModel):
    """A directive to ingest file(s) as synthetic PROSE sections at train
    time.

    Paths resolve relative to the `.dlm` file's parent directory when
    not absolute; `~` expands via `Path.expanduser()`. Under
    `training.sources_policy="strict"` the resolved path must stay
    under the `.dlm` parent dir (symlinks included — containment is
    checked after `Path.resolve()`). `permissive` allows absolute
    paths anywhere on disk.

    `include` / `exclude` are POSIX-glob patterns relative to each
    source root (default `("**/*",)` + `()` — every file matches).
    Size caps apply per-file and per-directive; binary files (first-
    KiB NUL scan) and non-UTF-8 bytes are skipped with a log warning,
    never a fatal error, because mixed trees are the common case.
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    path: str = Field(..., min_length=1)
    include: tuple[str, ...] = ("**/*",)
    exclude: tuple[str, ...] = ()
    max_bytes_per_file: int | None = Field(default=None, ge=1)
    max_files: int | None = Field(default=None, ge=1)
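# A minimal sketch of the "strict" containment rule described above
# (hypothetical helper; the real expansion lives with
# `dlm.directives.expand_sources`): expand `~`, resolve symlinks, then require
# the result to stay under the `.dlm` parent directory.
#
#   from pathlib import Path
#
#   def _resolve_strict(raw: str, dlm_parent: Path) -> Path:
#       candidate = Path(raw).expanduser()
#       if not candidate.is_absolute():
#           candidate = dlm_parent / candidate
#       resolved = candidate.resolve()
#       if not resolved.is_relative_to(dlm_parent.resolve()):
#           raise ValueError(f"source path {raw!r} escapes the .dlm parent directory")
#       return resolved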


class TrainingConfig(BaseModel):
    """Training-time knobs. `auto` values are resolved by the hardware doctor."""

    model_config = ConfigDict(extra="forbid", frozen=True)

    adapter: Literal["lora", "qlora", "dora"] = "lora"
    lora_r: int = Field(8, ge=1, le=256)
    lora_alpha: int = Field(16, ge=1)
    lora_dropout: float = Field(0.05, ge=0.0, le=0.5)
    target_modules: Literal["auto"] | list[str] = "auto"
    sequence_len: int = Field(2048, ge=64, le=32768)
    micro_batch_size: Literal["auto"] | int = "auto"
    grad_accum: Literal["auto"] | int = "auto"
    learning_rate: float = Field(2e-4, gt=0.0)
    num_epochs: int = Field(3, ge=1)
    optimizer: Literal[
        "adamw_torch",
        "adamw_bnb_8bit",
        "paged_adamw_8bit",
        "galore_adamw",
        "galore_adamw_8bit",
    ] = "adamw_torch"
    lr_scheduler: Literal["cosine", "linear", "constant"] = "cosine"
    warmup_ratio: float = Field(0.1, ge=0.0, le=0.5)
    # Advanced: override the hardware doctor's auto-picked precision.
    # `None` (default) lets the planner pick per backend — bf16 on
    # Ampere+, fp16 on older CUDA, fp32 on MPS (the last pin is
    # defensive: MPS fp16 attention kernels produce NaN LoRA weights
    # on tiny-data runs; see `.docs/bugs/01-nan-adapter-on-mps.md`).
    # Users who want fp16 on MPS for memory (e.g. running an 8B base
    # on a 24 GB unified-memory budget) can opt in here, accepting
    # the stability risk on small datasets.
    precision: Literal["bf16", "fp16", "fp32"] | None = None
    seed: int = 42
    preference: PreferenceConfig = Field(default_factory=_default_preference)
    cpt: CptConfig = Field(default_factory=_default_cpt)
    # Learned adapter gate. Only meaningful when `adapters`
    # declares two or more named adapters — a gate over a single
    # adapter is a tautology. Enforced at validate-time below.
    gate: GateConfig = Field(default_factory=_default_gate)
    # Tokenized-section cache tuning. Defaults preserve
    # pre-v9 behavior: cache on, 10 GiB cap, 90-day prune window.
    cache: CacheConfig = Field(default_factory=_default_cache)
    # Audio-pipeline knobs (v12). Only read when the base is an
    # audio-language model. Default `auto_resample=False` preserves the
    # v11 contract (refuse on SR mismatch); set to True to enable on-
    # the-fly resampling to the base's pinned rate.
    audio: AudioConfig = Field(default_factory=_default_audio)
    # Named adapters for multi-adapter composition. When set, the flat
    # `adapter`/`lora_*`/`target_modules`/`learning_rate` fields must
    # stay at their defaults — mixing the two shapes creates ambiguous
    # "which config wins?" semantics. An empty/None `adapters` keeps
    # the single-adapter shorthand fully backward-compatible.
    adapters: dict[str, AdapterConfig] | None = None
    # Source directives (v6). Declarative file-tree ingestion — each
    # entry becomes a walk-and-read at train time, synthesizing PROSE
    # sections for the CPT path. `None` / empty keeps the `.dlm` as a
    # self-contained training corpus; populated lets the document
    # reference external codebases, notes directories, etc. See
    # `dlm.directives.expand_sources`.
    sources: tuple[SourceDirective, ...] | None = None
    # `permissive` (default) lets directive paths point anywhere on
    # disk. `strict` confines them to the `.dlm` parent subtree —
    # useful when a `.dlm` travels with a project and the author wants
    # training content to stay project-local regardless of where a
    # downstream user places the file.
    sources_policy: Literal["permissive", "strict"] = "permissive"

    @field_validator("micro_batch_size", "grad_accum")
    @classmethod
    def _validate_auto_or_positive(cls, v: int | str) -> int | str:
        if v == "auto":
            return v
        if not isinstance(v, int) or v < 1:
            raise ValueError(f"must be a positive int or 'auto', got {v!r}")
        return v

    @field_validator("adapters")
    @classmethod
    def _validate_adapter_names(
        cls, v: dict[str, AdapterConfig] | None
    ) -> dict[str, AdapterConfig] | None:
        if v is None:
            return v
        if not v:
            raise ValueError(
                "training.adapters: at least one adapter must be declared "
                "(omit the block entirely for the single-adapter shorthand)"
            )
        for name in v:
            if not _ADAPTER_NAME_RE.fullmatch(name):
                raise ValueError(
                    f"training.adapters: {name!r} is not a valid adapter "
                    f"name (must match {_ADAPTER_NAME_RE.pattern})"
                )
        return v

    @model_validator(mode="after")
    def _gate_requires_multiple_adapters(self) -> TrainingConfig:
        if self.gate.enabled and (self.adapters is None or len(self.adapters) < 2):
            raise ValueError(
                "training.gate.enabled=true requires training.adapters "
                "with two or more named adapters (a gate over a single "
                "adapter has nothing to route between)"
            )
        return self

    @model_validator(mode="after")
    def _flat_and_named_are_mutually_exclusive(self) -> TrainingConfig:
        if self.adapters is None:
            return self
        # A set flat-adapter field would silently lose to the named
        # block at train time. Refuse at parse time instead.
        flat_defaults = {
            "adapter": "lora",
            "lora_r": 8,
            "lora_alpha": 16,
            "lora_dropout": 0.05,
            "target_modules": "auto",
            "learning_rate": 2e-4,
        }
        drift = [key for key, default in flat_defaults.items() if getattr(self, key) != default]
        if drift:
            raise ValueError(
                "training.adapters is declared; flat per-adapter fields "
                f"{drift} must stay at their defaults (move them into the "
                "per-adapter block instead)"
            )
        return self
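# Illustrative only (adapter names and knob values are made up): a two-adapter
# document with the learned gate enabled. The flat `adapter`/`lora_*` fields
# stay at their defaults, and per-adapter overrides live inside the named
# blocks, as `_flat_and_named_are_mutually_exclusive` requires; two named
# adapters satisfy `_gate_requires_multiple_adapters`.
#
#   cfg = TrainingConfig(
#       adapters={
#           "code_style": AdapterConfig(lora_r=16),
#           "api_docs": AdapterConfig(learning_rate=1e-4),
#       },
#       gate=GateConfig(enabled=True),
#   )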


class ExportConfig(BaseModel):
    """Export-time defaults."""

    model_config = ConfigDict(extra="forbid", frozen=True)

    default_quant: Literal["Q4_K_M", "Q5_K_M", "Q6_K", "Q8_0"] = "Q4_K_M"
    # Optional per-document sampling overrides. When set, the Modelfile
    # emits `PARAMETER temperature <v>` / `PARAMETER top_p <v>` in place
    # of the dialect default — a Q&A document prefers temperature=0.2,
    # a creative one prefers 0.9. `None` keeps the dialect default
    # from the template registry.
    default_temperature: float | None = Field(None, gt=0.0, le=2.0)
    default_top_p: float | None = Field(None, gt=0.0, le=1.0)

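# Sketch of how the per-document overrides above might surface in a Modelfile
# (hypothetical helper; the real emission lives in the export path): `None`
# keeps the dialect default, a set value becomes a `PARAMETER` line.
#
#   def _modelfile_params(cfg: ExportConfig) -> list[str]:
#       lines: list[str] = []
#       if cfg.default_temperature is not None:
#           lines.append(f"PARAMETER temperature {cfg.default_temperature}")
#       if cfg.default_top_p is not None:
#           lines.append(f"PARAMETER top_p {cfg.default_top_p}")
#       return lines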

# Named factories so mypy can type-check the field defaults correctly.
def _default_training() -> TrainingConfig:
    return TrainingConfig()


def _default_export() -> ExportConfig:
    return ExportConfig()


class DlmFrontmatter(BaseModel):
    """Top-level frontmatter: the YAML block between `---` delimiters.

    `dlm_id` is a canonical 26-character ULID. It is assigned by
    `dlm init` and never regenerated by the parser.
    `base_model` is either a registry key (e.g. `qwen2.5-1.5b`) or an
    `hf:org/name` escape hatch — the registry validates the
    actual lookup; this module only validates that the string is non-empty.
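
    Example (illustrative only; the ULID and values below are made up, not
    taken from any real document):

        >>> fm = DlmFrontmatter(
        ...     dlm_id="01ARZ3NDEKTSV4RRFFQ69G5FAV",
        ...     base_model="qwen2.5-1.5b",
        ... )
        >>> fm.dlm_version == CURRENT_SCHEMA_VERSION
        True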
431 """
432
433 model_config = ConfigDict(extra="forbid", frozen=True)
434
435 dlm_id: str
436 dlm_version: int = CURRENT_SCHEMA_VERSION
437 base_model: str = Field(..., min_length=1)
438 training: TrainingConfig = Field(default_factory=_default_training)
439 export: ExportConfig = Field(default_factory=_default_export)
440 system_prompt: str | None = None
441
442 @field_validator("dlm_id")
443 @classmethod
444 def _validate_ulid(cls, v: str) -> str:
445 if not _ULID_RE.fullmatch(v):
446 raise ValueError(
447 f"dlm_id must be a 26-char Crockford base32 ULID, got {v!r}",
448 )
449 return v
450
451 @field_validator("dlm_version")
452 @classmethod
453 def _validate_version(cls, v: int) -> int:
454 # Defense in depth: the `versioned` dispatcher is
455 # the intended entry point, but direct `DlmFrontmatter.model_validate`
456 # callers (tests, tooling) need the same guard. Reject both
457 # under-1 and beyond-current values at the field level.
458 if v < 1:
459 raise ValueError(f"dlm_version must be ≥1, got {v}")
460 if v > CURRENT_SCHEMA_VERSION:
461 raise ValueError(
462 f"dlm_version {v} is newer than this CLI supports "
463 f"(CURRENT_SCHEMA_VERSION={CURRENT_SCHEMA_VERSION})."
464 )
465 return v