Python · 2914 bytes Raw Blame History
1 """`.dlm/training.yaml` per-codebase config.
2
3 Distinct from the `.dlm` frontmatter schema — different file, different
4 shape, different namespace. A codebase drops this alongside its source
5 to declare what `dlm train` should ingest when a directive descends
6 into the tree. The nearest-ancestor `.dlm/training.yaml` wins for each
7 file under its subtree, matching `.gitignore`'s resolution semantics.
8
9 See `dlm.directives.merge` for the full precedence table and
10 `docs/format/dlm-training-yaml.md` for the user-facing reference.
11 """
12
from __future__ import annotations

import math
from collections.abc import Mapping
from typing import Literal

from pydantic import BaseModel, ConfigDict, Field, field_validator
19
20
class DlmTrainingConfig(BaseModel):
    """Per-subtree training config discovered at `.dlm/training.yaml`.

    All fields optional — an empty config (just `dlm_training_version: 1`)
    is legal and means "no refinement at this level", which is useful
    as a placeholder marker while drafting.

    `exclude_defaults` controls whether `dlm.directives.defaults.DEFAULT_EXCLUDES`
    applies at this subtree. Most users want it True (secrets, VCS,
    build artifacts skipped automatically); trees that legitimately
    train on e.g. generated code can set it False to opt out.

    `metadata` flows onto every `Section` synthesized from this
    subtree via `Section.tags`. Tags do NOT affect `section_id`, so
    adjusting metadata doesn't invalidate the replay corpus.

    `weights` scales per-row training exposure by `(tag_key, tag_value)`.
    Resolution is multiplicative across tag keys and merges shallow-to-
    deep like `metadata` — deeper configs override shallower keys. A
    row with tags `{domain: auth, generated: true}` under a tree where
    the root sets `weights.domain.auth = 2.0` and a subtree sets
    `weights.generated.true = 0.5` ends up at effective weight 1.0.
    Rows with no matching tag/value get weight 1.0.

    Every weight must be a finite number ≥ 0; negative, NaN and
    infinite values are rejected at validation time.
    """

    # Unknown keys are rejected and instances are immutable after creation.
    model_config = ConfigDict(extra="forbid", frozen=True)

    # Schema version marker; 1 is the only accepted value.
    dlm_training_version: Literal[1] = 1
    # Strings declaring what to ingest / skip under this subtree.
    include: tuple[str, ...] = ()
    exclude: tuple[str, ...] = ()
    # Whether the project-wide default excludes apply at this subtree.
    exclude_defaults: bool = True
    # Tag key -> tag value, attached to sections synthesized from this subtree.
    metadata: Mapping[str, str] = Field(default_factory=dict)
    # Tag key -> tag value -> exposure multiplier.
    weights: Mapping[str, Mapping[str, float]] = Field(default_factory=dict)

    @field_validator("weights")
    @classmethod
    def _validate_weights(
        cls, value: Mapping[str, Mapping[str, float]]
    ) -> Mapping[str, Mapping[str, float]]:
        """Reject weights that are negative, NaN, or infinite.

        Args:
            value: The `weights` mapping, tag key -> tag value -> scale.

        Returns:
            `value` unchanged when every scale is a finite number ≥ 0.

        Raises:
            ValueError: If any scale is negative, NaN, or infinite. The
                message names the offending `weights[key][value]` entry.
        """
        for tag_key, inner in value.items():
            for tag_value, scale in inner.items():
                if scale < 0:
                    raise ValueError(
                        f"weights[{tag_key!r}][{tag_value!r}] must be ≥ 0, "
                        f"got {scale}. Negative weights don't have a "
                        f"well-defined meaning under row-repetition expansion."
                    )
                # `scale < 0` is False for NaN and +inf, so they need their
                # own check. Weights multiply across tag keys and 0 is legal,
                # so a single inf (0 * inf) or NaN would turn the effective
                # weight into NaN.
                if not math.isfinite(scale):
                    raise ValueError(
                        f"weights[{tag_key!r}][{tag_value!r}] must be a "
                        f"finite number, got {scale}. NaN and infinite "
                        f"weights poison the multiplicative resolution "
                        f"across tag keys."
                    )
        return value