1 """Pydantic models for `.dlm` frontmatter.
2
3 Every model is `extra="forbid"` and `frozen=True` — strict validation and
4 immutable values. Default values must match the shape produced by
5 `tests/fixtures/dlm_factory.py` so round-trips are stable.
6
7 Versioned schema dispatch lives in `dlm.doc.versioned`; this module
8 defines the current frontmatter shape.
9 """
10
11 from __future__ import annotations
12
13 import re
14 from typing import Final, Literal
15
16 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
17
18 # Crockford base32 alphabet used by ULID: 0-9, A-Z minus I L O U.
19 _ULID_RE: Final[re.Pattern[str]] = re.compile(r"^[0-9A-HJ-KM-NP-TV-Z]{26}$")
20
21 # Adapter names: lowercase alpha start, alphanumeric + underscore tail.
22 # Keeps store paths safe (adapter/<name>/versions/) and log lines readable.
23 _ADAPTER_NAME_RE: Final[re.Pattern[str]] = re.compile(r"^[a-z][a-z0-9_]{0,31}$")
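# Illustrative only (the names below are hypothetical, not taken from any
# shipped document): what the adapter-name pattern accepts and rejects.
#
#   _ADAPTER_NAME_RE.fullmatch("style_guide")   # matches
#   _ADAPTER_NAME_RE.fullmatch("Style-Guide")   # None: uppercase and hyphen
#   _ADAPTER_NAME_RE.fullmatch("2fast")         # None: must start with a letter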

CURRENT_SCHEMA_VERSION: Final[int] = 15
"""Schema version this parser implements.

New fields bump the version and register a migrator in the same
commit — enforced by `test_all_versions_have_migrator_up_to_latest`.

- v2 renamed `training.dpo` → `training.preference` to accommodate both
  DPO and ORPO under one `method`-switched config.
- v3 added the additive `training.cpt` block (DAPT schedule + embedding
  warm-up) for continued-pretraining refinements.
- v4 added the additive `training.adapters` map for named multi-adapter
  composition; flat `adapter`/`lora_*` keys remain the single-adapter
  shorthand.
- v5 added the additive `training.precision` override (opt-in fp16/bf16
  on MPS after the NaN-adapter bug).
- v6 adds the additive `training.sources` block — declarative file-tree
  directives that synthesize PROSE sections at train time, letting a
  `.dlm` act as a training plan over content stored elsewhere on disk.
- v10 introduces `SectionType.IMAGE` + the `::image path="..." alt="..."::`
  fence grammar for multi-modal training; the body schema is strictly
  additive and the fence extension is backward-compatible (images are
  parsed via a separate attribute grammar rather than changing the
  existing `::type#name::` form).
- v11 adds `SectionType.AUDIO` with the parallel
  `::audio path="..." transcript="..."::` fence — the transcript becomes
  the text-side supervision (unlike the optional image caption it is
  effectively required: audio without a transcript has no training
  signal).
- v12 adds the additive `training.audio` block (currently one field,
  `auto_resample: bool`) — opt-in automatic resampling when audio files
  don't match the base's pinned rate. Default False preserves the
  "refuse on SR mismatch" contract.
- v13 is an identity bump paired with the 2026 base-model registry
  refresh: the document frontmatter shape is unchanged, but the
  migration chain still advances so tooling can distinguish
  post-refresh docs from older ones.
- v14 adds additive auto-mined preference metadata on `::preference::`
  sections; the frontmatter shape remains unchanged, but the schema
  still advances so migration-aware tooling can tell pre-mining docs
  from ones that may carry mined-preference markers in the body.
- v15 adds additive auto-synth instruction metadata on `::instruction::`
  sections; the frontmatter shape remains unchanged, but the schema
  still advances so migration-aware tooling can tell pre-synth docs
  from ones that may carry synthesized-instruction markers in the body.
"""


class PreferenceHyperparams(BaseModel):
    """Hyperparameters shared across preference methods.

    Some fields are method-specific (`beta` for DPO, `alpha` for
    ORPO); the trainer reads whichever applies. Keeping both on one
    flat block simplifies migration and lets users switch methods
    without reshaping their document.
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    beta: float = Field(0.1, ge=0.0, le=1.0)
    alpha: float = Field(0.1, ge=0.0, le=1.0)
    learning_rate: float = Field(5e-6, gt=0.0)
    num_epochs: int = Field(1, ge=1)


class PreferenceConfig(BaseModel):
    """Preference-phase knobs (DPO or ORPO). Additive to `TrainingConfig`;
    default disabled. `enabled` flips to `True` automatically when the
    document contains `::preference::` sections unless the user has
    explicitly set it to `False` — the phase orchestrator reads that
    signal."""

    model_config = ConfigDict(extra="forbid", frozen=True)

    enabled: bool = False
    method: Literal["dpo", "orpo"] = "dpo"
    hyperparams: PreferenceHyperparams = Field(default_factory=lambda: PreferenceHyperparams())
    # DPO-only fields — ignored for ORPO but kept on the config so a
    # user switching methods doesn't have to delete them.
    loss_type: Literal["sigmoid", "hinge", "ipo"] = "sigmoid"
    reference: Literal["base", "pre_adapter"] = "pre_adapter"


def _default_preference() -> PreferenceConfig:
    return PreferenceConfig()
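# Illustrative sketch of the method switch described above (made-up values):
# moving a document from DPO to ORPO is a `method` flip plus, if desired, an
# `alpha` override; `beta` and the DPO-only `loss_type`/`reference` fields can
# stay at their defaults and are simply not read on the ORPO path.
#
#   orpo = PreferenceConfig(
#       enabled=True,
#       method="orpo",
#       hyperparams=PreferenceHyperparams(alpha=0.2),
#   )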


class CptConfig(BaseModel):
    """Continued-pretraining refinements.

    `schedule="auto"` lets the trainer pick: `dapt` when CPT rows
    dominate (>70% of training rows), otherwise the SFT default. A
    user who wants the DAPT curve regardless of the row mix pins
    `schedule="dapt"`; `schedule="sft"` opts out entirely.

    `embed_warmup_steps>0` unfreezes `embed_tokens` + `lm_head` for
    the first N steps and adds them to `modules_to_save`, which
    inflates adapter size by `vocab_size * hidden_dim`. The trainer
    warns loudly when this is enabled.
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    schedule: Literal["auto", "dapt", "sft"] = "auto"
    embed_warmup_steps: int = Field(0, ge=0)


def _default_cpt() -> CptConfig:
    return CptConfig()
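# A minimal sketch of the `schedule="auto"` rule documented on `CptConfig`
# (hypothetical helper; the real resolution lives in the trainer, not in this
# module): pinned values win, otherwise DAPT kicks in once CPT rows cross the
# 70% threshold described above.
#
#   def _resolve_schedule(cfg: CptConfig, cpt_rows: int, total_rows: int) -> str:
#       if cfg.schedule != "auto":
#           return cfg.schedule
#       return "dapt" if total_rows and cpt_rows / total_rows > 0.70 else "sft"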


class GateConfig(BaseModel):
    """Learned MoE-style adapter gate.

    When `enabled`, a small MLP trained post-SFT routes each prompt to
    a weighted combination of the document's named adapters. Applied
    uniformly across adapter layers (per-layer routing is the research
    follow-up).

    `cold_start_floor` is the minimum number of supervising sections
    per adapter below which gate training is skipped and inference
    defaults to uniform weights — small corpora overfit a tiny router.
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    enabled: bool = False
    hidden_proj_dim: int = Field(64, ge=8, le=2048)
    steps: int = Field(200, ge=1, le=10000)
    lr: float = Field(3e-4, gt=0.0, le=1.0)
    cold_start_floor: int = Field(4, ge=1, le=1024)
    # Entropy-regularization weight on the gate loss. Higher values
    # discourage mode collapse (one adapter takes all the weight);
    # lower values let the gate commit harder when data justifies it.
    entropy_lambda: float = Field(0.01, ge=0.0, le=1.0)


def _default_gate() -> GateConfig:
    return GateConfig()
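# Sketch of the cold-start rule from `GateConfig` (hypothetical helper; the
# real check lives with the gate trainer): when any named adapter has fewer
# supervising sections than the floor, gate training is skipped and inference
# falls back to uniform weights.
#
#   def _gate_should_train(cfg: GateConfig, sections_per_adapter: dict[str, int]) -> bool:
#       return cfg.enabled and all(
#           count >= cfg.cold_start_floor for count in sections_per_adapter.values()
#       )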


class CacheConfig(BaseModel):
    """Tokenized-section cache tuning.

    The cache lives at `~/.dlm/store/<dlm_id>/tokenized-cache/` and
    trades disk for tokenization wall-clock on directive-sourced runs.
    Defaults cover the typical case: cache on, 10 GiB cap, 90-day
    retention. Per-document overrides here let authors tune for their
    corpus size.

    All fields are independent — no cross-field validation. The three
    knobs map to three distinct operator concerns:

    - ``enabled`` is the off-switch.
    - ``max_bytes`` is the disk ceiling.
    - ``prune_older_than_days`` is the retention window.
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    enabled: bool = True
    # Default: 10 GiB (10 * 1024^3). Per-document cap that supersedes
    # the cache module's built-in default when the trainer opens the
    # cache. Lower for small personal corpora, higher for 50K+ file
    # codebases.
    max_bytes: int = Field(10 * 1024 * 1024 * 1024, ge=1)
    # Default cutoff for `dlm cache prune` when the user doesn't pass
    # `--older-than`. Overridable by the CLI flag on a per-command
    # basis.
    prune_older_than_days: int = Field(90, ge=1)


def _default_cache() -> CacheConfig:
    return CacheConfig()
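# Illustrative per-document override (made-up numbers): a small personal
# corpus that wants a tighter disk ceiling and a shorter retention window
# than the defaults above.
#
#   CacheConfig(max_bytes=1 * 1024**3, prune_older_than_days=30)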


class AudioConfig(BaseModel):
    """Audio-pipeline knobs for audio-language training (v12).

    Only meaningful when the base is an audio-language model (the
    trainer reads this block only on that branch). Default leaves
    behavior identical to v11: ``auto_resample=False`` preserves the
    "refuse on SR mismatch" contract, so adding a no-audio or
    non-audio document to v12 never changes training output.
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    # When True, audio files at a native sample rate different from the
    # base's pinned rate (e.g. 48 kHz clip with Qwen2-Audio's 16 kHz
    # pin) are resampled on the fly via `dlm.data.audio_resample`
    # instead of raising. Resampled waveforms cache separately from
    # native-rate ones (cache key carries the flag), so toggling
    # auto_resample on an existing corpus doesn't serve stale entries.
    auto_resample: bool = False


def _default_audio() -> AudioConfig:
    return AudioConfig()
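# Sketch of the sample-rate contract described on `AudioConfig` (hypothetical
# helper; the real decision lives in the audio data pipeline): a mismatch
# either resamples to the base's pinned rate or refuses, depending on the flag.
#
#   def _needs_resample(native_sr: int, pinned_sr: int, cfg: AudioConfig) -> bool:
#       if native_sr == pinned_sr:
#           return False
#       if not cfg.auto_resample:
#           raise ValueError("sample-rate mismatch and training.audio.auto_resample is false")
#       return True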


class AdapterConfig(BaseModel):
    """One named adapter in a multi-adapter document.

    A subset of the flat config — only the per-adapter LoRA knobs plus
    `learning_rate`. Hyperparameters that are intrinsically run-scoped
    (`sequence_len`, `num_epochs`, `seed`, `optimizer`, `lr_scheduler`,
    `warmup_ratio`, `micro_batch_size`, `grad_accum`) stay at the
    `TrainingConfig` top level because mixing them per-adapter makes
    schedules and batching incoherent.
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    adapter: Literal["lora", "qlora", "dora"] = "lora"
    lora_r: int = Field(8, ge=1, le=256)
    lora_alpha: int = Field(16, ge=1)
    lora_dropout: float = Field(0.05, ge=0.0, le=0.5)
    target_modules: Literal["auto"] | list[str] = "auto"
    learning_rate: float = Field(2e-4, gt=0.0)


class SourceDirective(BaseModel):
    """A directive to ingest file(s) as synthetic PROSE sections at train
    time.

    Paths resolve relative to the `.dlm` file's parent directory when
    not absolute; `~` expands via `Path.expanduser()`. Under
    `training.sources_policy="strict"` the resolved path must stay
    under the `.dlm` parent dir (symlinks included — containment is
    checked after `Path.resolve()`). `permissive` allows absolute
    paths anywhere on disk.

    `include` / `exclude` are POSIX-glob patterns relative to each
    source root (default `("**/*",)` + `()` — every file matches).
    Size caps apply per-file and per-directive; binary files (first-
    KiB NUL scan) and non-UTF-8 bytes are skipped with a log warning,
    never a fatal error, because mixed trees are the common case.
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    path: str = Field(..., min_length=1)
    include: tuple[str, ...] = ("**/*",)
    exclude: tuple[str, ...] = ()
    max_bytes_per_file: int | None = Field(default=None, ge=1)
    max_files: int | None = Field(default=None, ge=1)
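# A minimal sketch of the "strict" containment rule described above
# (hypothetical helper; the real expansion lives with
# `dlm.directives.expand_sources`): expand `~`, resolve symlinks, then require
# the result to stay under the `.dlm` parent directory.
#
#   from pathlib import Path
#
#   def _resolve_strict(raw: str, dlm_parent: Path) -> Path:
#       candidate = Path(raw).expanduser()
#       if not candidate.is_absolute():
#           candidate = dlm_parent / candidate
#       resolved = candidate.resolve()
#       if not resolved.is_relative_to(dlm_parent.resolve()):
#           raise ValueError(f"source path {raw!r} escapes the .dlm parent directory")
#       return resolved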


class TrainingConfig(BaseModel):
    """Training-time knobs. `auto` values are resolved by the hardware doctor."""

    model_config = ConfigDict(extra="forbid", frozen=True)

    adapter: Literal["lora", "qlora", "dora"] = "lora"
    lora_r: int = Field(8, ge=1, le=256)
    lora_alpha: int = Field(16, ge=1)
    lora_dropout: float = Field(0.05, ge=0.0, le=0.5)
    target_modules: Literal["auto"] | list[str] = "auto"
    sequence_len: int = Field(2048, ge=64, le=32768)
    micro_batch_size: Literal["auto"] | int = "auto"
    grad_accum: Literal["auto"] | int = "auto"
    learning_rate: float = Field(2e-4, gt=0.0)
    num_epochs: int = Field(3, ge=1)
    optimizer: Literal[
        "adamw_torch",
        "adamw_bnb_8bit",
        "paged_adamw_8bit",
        "galore_adamw",
        "galore_adamw_8bit",
    ] = "adamw_torch"
    lr_scheduler: Literal["cosine", "linear", "constant"] = "cosine"
    warmup_ratio: float = Field(0.1, ge=0.0, le=0.5)
    # Advanced: override the hardware doctor's auto-picked precision.
    # `None` (default) lets the planner pick per backend — bf16 on
    # Ampere+, fp16 on older CUDA, fp32 on MPS (the last pin is
    # defensive: MPS fp16 attention kernels produce NaN LoRA weights
    # on tiny-data runs; see `.docs/bugs/01-nan-adapter-on-mps.md`).
    # Users who want fp16 on MPS for memory (e.g. running an 8B base
    # on a 24 GB unified-memory budget) can opt in here, accepting
    # the stability risk on small datasets.
    precision: Literal["bf16", "fp16", "fp32"] | None = None
    seed: int = 42
    preference: PreferenceConfig = Field(default_factory=_default_preference)
    cpt: CptConfig = Field(default_factory=_default_cpt)
    # Learned adapter gate. Only meaningful when `adapters`
    # declares two or more named adapters — a gate over a single
    # adapter is a tautology. Enforced at validate-time below.
    gate: GateConfig = Field(default_factory=_default_gate)
    # Tokenized-section cache tuning. Defaults preserve
    # pre-v9 behavior: cache on, 10 GiB cap, 90-day prune window.
    cache: CacheConfig = Field(default_factory=_default_cache)
    # Audio-pipeline knobs (v12). Only read when the base is an
    # audio-language model. Default `auto_resample=False` preserves the
    # v11 contract (refuse on SR mismatch); set to True to enable on-
    # the-fly resampling to the base's pinned rate.
    audio: AudioConfig = Field(default_factory=_default_audio)
    # Named adapters for multi-adapter composition. When set, the flat
    # `adapter`/`lora_*`/`target_modules`/`learning_rate` fields must
    # stay at their defaults — mixing the two shapes creates ambiguous
    # "which config wins?" semantics. An empty/None `adapters` keeps
    # the single-adapter shorthand fully backward-compatible.
    adapters: dict[str, AdapterConfig] | None = None
    # Source directives (v6). Declarative file-tree ingestion — each
    # entry becomes a walk-and-read at train time, synthesizing PROSE
    # sections for the CPT path. `None` / empty keeps the `.dlm` as a
    # self-contained training corpus; populated lets the document
    # reference external codebases, notes directories, etc. See
    # `dlm.directives.expand_sources`.
    sources: tuple[SourceDirective, ...] | None = None
    # `permissive` (default) lets directive paths point anywhere on
    # disk. `strict` confines them to the `.dlm` parent subtree —
    # useful when a `.dlm` travels with a project and the author wants
    # training content to stay project-local regardless of where a
    # downstream user places the file.
    sources_policy: Literal["permissive", "strict"] = "permissive"

    @field_validator("micro_batch_size", "grad_accum")
    @classmethod
    def _validate_auto_or_positive(cls, v: int | str) -> int | str:
        if v == "auto":
            return v
        if not isinstance(v, int) or v < 1:
            raise ValueError(f"must be a positive int or 'auto', got {v!r}")
        return v

    @field_validator("adapters")
    @classmethod
    def _validate_adapter_names(
        cls, v: dict[str, AdapterConfig] | None
    ) -> dict[str, AdapterConfig] | None:
        if v is None:
            return v
        if not v:
            raise ValueError(
                "training.adapters: at least one adapter must be declared "
                "(omit the block entirely for the single-adapter shorthand)"
            )
        for name in v:
            if not _ADAPTER_NAME_RE.fullmatch(name):
                raise ValueError(
                    f"training.adapters: {name!r} is not a valid adapter "
                    f"name (must match {_ADAPTER_NAME_RE.pattern})"
                )
        return v

    @model_validator(mode="after")
    def _gate_requires_multiple_adapters(self) -> TrainingConfig:
        if self.gate.enabled and (self.adapters is None or len(self.adapters) < 2):
            raise ValueError(
                "training.gate.enabled=true requires training.adapters "
                "with two or more named adapters (a gate over a single "
                "adapter has nothing to route between)"
            )
        return self

    @model_validator(mode="after")
    def _flat_and_named_are_mutually_exclusive(self) -> TrainingConfig:
        if self.adapters is None:
            return self
        # A set flat-adapter field would silently lose to the named
        # block at train time. Refuse at parse time instead.
        flat_defaults = {
            "adapter": "lora",
            "lora_r": 8,
            "lora_alpha": 16,
            "lora_dropout": 0.05,
            "target_modules": "auto",
            "learning_rate": 2e-4,
        }
        drift = [key for key, default in flat_defaults.items() if getattr(self, key) != default]
        if drift:
            raise ValueError(
                "training.adapters is declared; flat per-adapter fields "
                f"{drift} must stay at their defaults (move them into the "
                "per-adapter block instead)"
            )
        return self
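# Illustrative only (adapter names and knob values are made up): a two-adapter
# document with the learned gate enabled. The flat `adapter`/`lora_*` fields
# stay at their defaults, and per-adapter overrides live inside the named
# blocks, as `_flat_and_named_are_mutually_exclusive` requires; two named
# adapters satisfy `_gate_requires_multiple_adapters`.
#
#   cfg = TrainingConfig(
#       adapters={
#           "code_style": AdapterConfig(lora_r=16),
#           "api_docs": AdapterConfig(learning_rate=1e-4),
#       },
#       gate=GateConfig(enabled=True),
#   )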


class ExportConfig(BaseModel):
    """Export-time defaults."""

    model_config = ConfigDict(extra="forbid", frozen=True)

    default_quant: Literal["Q4_K_M", "Q5_K_M", "Q6_K", "Q8_0"] = "Q4_K_M"
    # Optional per-document sampling overrides. When set, the Modelfile
    # emits `PARAMETER temperature <v>` / `PARAMETER top_p <v>` in place
    # of the dialect default — a Q&A document prefers temperature=0.2,
    # a creative one prefers 0.9. `None` keeps the dialect default
    # from the template registry.
    default_temperature: float | None = Field(None, gt=0.0, le=2.0)
    default_top_p: float | None = Field(None, gt=0.0, le=1.0)

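# Sketch of how the per-document overrides above might surface in a Modelfile
# (hypothetical helper; the real emission lives in the export path): `None`
# keeps the dialect default, a set value becomes a `PARAMETER` line.
#
#   def _modelfile_params(cfg: ExportConfig) -> list[str]:
#       lines: list[str] = []
#       if cfg.default_temperature is not None:
#           lines.append(f"PARAMETER temperature {cfg.default_temperature}")
#       if cfg.default_top_p is not None:
#           lines.append(f"PARAMETER top_p {cfg.default_top_p}")
#       return lines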

# Named factories so mypy can type-check the field defaults correctly.
def _default_training() -> TrainingConfig:
    return TrainingConfig()


def _default_export() -> ExportConfig:
    return ExportConfig()


class DlmFrontmatter(BaseModel):
    """Top-level frontmatter: the YAML block between `---` delimiters.

    `dlm_id` is a canonical 26-character ULID. It is assigned by
    `dlm init` and never regenerated by the parser.
    `base_model` is either a registry key (e.g. `qwen2.5-1.5b`) or an
    `hf:org/name` escape hatch — the registry validates the
    actual lookup; this module only validates that the string is non-empty.
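
    Example (illustrative only; the ULID and values below are made up, not
    taken from any real document):

        >>> fm = DlmFrontmatter(
        ...     dlm_id="01ARZ3NDEKTSV4RRFFQ69G5FAV",
        ...     base_model="qwen2.5-1.5b",
        ... )
        >>> fm.dlm_version == CURRENT_SCHEMA_VERSION
        True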
431 """
432
433 model_config = ConfigDict(extra="forbid", frozen=True)
434
435 dlm_id: str
436 dlm_version: int = CURRENT_SCHEMA_VERSION
437 base_model: str = Field(..., min_length=1)
438 training: TrainingConfig = Field(default_factory=_default_training)
439 export: ExportConfig = Field(default_factory=_default_export)
440 system_prompt: str | None = None
441
442 @field_validator("dlm_id")
443 @classmethod
444 def _validate_ulid(cls, v: str) -> str:
445 if not _ULID_RE.fullmatch(v):
446 raise ValueError(
447 f"dlm_id must be a 26-char Crockford base32 ULID, got {v!r}",
448 )
449 return v
450
451 @field_validator("dlm_version")
452 @classmethod
453 def _validate_version(cls, v: int) -> int:
454 # Defense in depth: the `versioned` dispatcher is
455 # the intended entry point, but direct `DlmFrontmatter.model_validate`
456 # callers (tests, tooling) need the same guard. Reject both
457 # under-1 and beyond-current values at the field level.
458 if v < 1:
459 raise ValueError(f"dlm_version must be ≥1, got {v}")
460 if v > CURRENT_SCHEMA_VERSION:
461 raise ValueError(
462 f"dlm_version {v} is newer than this CLI supports "
463 f"(CURRENT_SCHEMA_VERSION={CURRENT_SCHEMA_VERSION})."
464 )
465 return v