documentlanguagemodel Public

Watch 0 Fork 0 Star 0

Python · 2914 bytes Raw Blame History

  
        1
        """`.dlm/training.yaml` per-codebase config.
      
        2
        
        3
        Distinct from the `.dlm` frontmatter schema — different file, different
      
        4
        shape, different namespace. A codebase drops this alongside its source
      
        5
        to declare what `dlm train` should ingest when a directive descends
      
        6
        into the tree. The nearest-ancestor `.dlm/training.yaml` wins for each
      
        7
        file under its subtree, matching `.gitignore`'s resolution semantics.
      
        8
        
        9
        See `dlm.directives.merge` for the full precedence table and
      
        10
        `docs/format/dlm-training-yaml.md` for the user-facing reference.
      
        11
        """
      
        12
        
        13
        from __future__ import annotations
      
        14
        
        15
        from collections.abc import Mapping
      
        16
        from typing import Literal
      
        17
        
        18
        from pydantic import BaseModel, ConfigDict, Field, field_validator
      
        19
        
        20
        
        21
        class DlmTrainingConfig(BaseModel):
            """Per-subtree training config discovered at `.dlm/training.yaml`.

            All fields optional — an empty config (just `dlm_training_version: 1`)
            is legal and means "no refinement at this level", which is useful
            as a placeholder marker while drafting.

            `exclude_defaults` controls whether `dlm.directives.defaults.DEFAULT_EXCLUDES`
            applies at this subtree. Most users want it True (secrets, VCS,
            build artifacts skipped automatically); trees that legitimately
            train on e.g. generated code can set it False to opt out.

            `metadata` flows onto every `Section` synthesized from this
            subtree via `Section.tags`. Tags do NOT affect `section_id`, so
            adjusting metadata doesn't invalidate the replay corpus.

            `weights` scales per-row training exposure by `(tag_key, tag_value)`.
            Resolution is multiplicative across tag keys and merges shallow-to-
            deep like `metadata` — deeper configs override shallower keys. A
            row with tags `{domain: auth, generated: true}` under a tree where
            the root sets `weights.domain.auth = 2.0` and a subtree sets
            `weights.generated.true = 0.5` ends up at effective weight 1.0.
            Rows with no matching tag/value get weight 1.0.

            Every `weights` scale must satisfy `scale >= 0`; negative values
            and NaN are rejected when the model is validated.
            """

            # Unknown keys are rejected and instances are immutable after creation.
            model_config = ConfigDict(extra="forbid", frozen=True)

            dlm_training_version: Literal[1] = 1
            include: tuple[str, ...] = ()
            exclude: tuple[str, ...] = ()
            exclude_defaults: bool = True
            metadata: Mapping[str, str] = Field(default_factory=dict)
            weights: Mapping[str, Mapping[str, float]] = Field(default_factory=dict)

            @field_validator("weights")
            @classmethod
            def _validate_weights(
                cls, value: Mapping[str, Mapping[str, float]]
            ) -> Mapping[str, Mapping[str, float]]:
                """Check that every weight scale is ≥ 0 and return `value` unchanged.

                Raises:
                    ValueError: On the first `weights[tag_key][tag_value]` entry
                        that is negative or NaN.
                """
                for tag_key, inner in value.items():
                    for tag_value, scale in inner.items():
                        # Written as `not (scale >= 0)` rather than `scale < 0`:
                        # every ordering comparison with NaN is False, so the
                        # `< 0` form would silently accept a NaN weight.
                        # Positive infinity still satisfies `>= 0` and passes.
                        if not (scale >= 0):
                            raise ValueError(
                                f"weights[{tag_key!r}][{tag_value!r}] must be ≥ 0, "
                                f"got {scale}. Negative weights don't have a "
                                f"well-defined meaning under row-repetition expansion."
                            )
                return value

1	"""`.dlm/training.yaml` per-codebase config.
2
3	Distinct from the `.dlm` frontmatter schema — different file, different
4	shape, different namespace. A codebase drops this alongside its source
5	to declare what `dlm train` should ingest when a directive descends
6	into the tree. The nearest-ancestor `.dlm/training.yaml` wins for each
7	file under its subtree, matching `.gitignore`'s resolution semantics.
8
9	See `dlm.directives.merge` for the full precedence table and
10	`docs/format/dlm-training-yaml.md` for the user-facing reference.
11	"""
12
13	from __future__ import annotations
14
15	from collections.abc import Mapping
16	from typing import Literal
17
18	from pydantic import BaseModel, ConfigDict, Field, field_validator
19
20
class DlmTrainingConfig(BaseModel):
    """Per-subtree training config discovered at `.dlm/training.yaml`.

    All fields optional — an empty config (just `dlm_training_version: 1`)
    is legal and means "no refinement at this level", which is useful
    as a placeholder marker while drafting.

    `exclude_defaults` controls whether `dlm.directives.defaults.DEFAULT_EXCLUDES`
    applies at this subtree. Most users want it True (secrets, VCS,
    build artifacts skipped automatically); trees that legitimately
    train on e.g. generated code can set it False to opt out.

    `metadata` flows onto every `Section` synthesized from this
    subtree via `Section.tags`. Tags do NOT affect `section_id`, so
    adjusting metadata doesn't invalidate the replay corpus.

    `weights` scales per-row training exposure by `(tag_key, tag_value)`.
    Resolution is multiplicative across tag keys and merges shallow-to-
    deep like `metadata` — deeper configs override shallower keys. A
    row with tags `{domain: auth, generated: true}` under a tree where
    the root sets `weights.domain.auth = 2.0` and a subtree sets
    `weights.generated.true = 0.5` ends up at effective weight 1.0.
    Rows with no matching tag/value get weight 1.0.

    Every `weights` scale must satisfy `scale >= 0`; negative values
    and NaN are rejected when the model is validated.
    """

    # Unknown keys are rejected and instances are immutable after creation.
    model_config = ConfigDict(extra="forbid", frozen=True)

    dlm_training_version: Literal[1] = 1
    include: tuple[str, ...] = ()
    exclude: tuple[str, ...] = ()
    exclude_defaults: bool = True
    metadata: Mapping[str, str] = Field(default_factory=dict)
    weights: Mapping[str, Mapping[str, float]] = Field(default_factory=dict)

    @field_validator("weights")
    @classmethod
    def _validate_weights(
        cls, value: Mapping[str, Mapping[str, float]]
    ) -> Mapping[str, Mapping[str, float]]:
        """Check that every weight scale is ≥ 0 and return `value` unchanged.

        Raises:
            ValueError: On the first `weights[tag_key][tag_value]` entry
                that is negative or NaN.
        """
        for tag_key, inner in value.items():
            for tag_value, scale in inner.items():
                # Written as `not (scale >= 0)` rather than `scale < 0`:
                # every ordering comparison with NaN is False, so the
                # `< 0` form would silently accept a NaN weight.
                # Positive infinity still satisfies `>= 0` and passes.
                if not (scale >= 0):
                    raise ValueError(
                        f"weights[{tag_key!r}][{tag_value!r}] must be ≥ 0, "
                        f"got {scale}. Negative weights don't have a "
                        f"well-defined meaning under row-repetition expansion."
                    )
        return value