documentlanguagemodel Public

Watch 0 Fork 0 Star 0

Python · 4717 bytes Raw Blame History

  
        1
        """Walk a source tree and collect every `.dlm/` configuration.
      
        2
        
        3
        One `DiscoveredConfig` per `.dlm/` directory found under the walk
      
        4
        root. Each config aggregates both `.dlm/training.yaml` (parsed as
      
        5
        `DlmTrainingConfig`) and `.dlm/ignore` (parsed as a tuple of
      
        6
        `IgnoreRule`). Either or both may be absent — the presence of the
      
        7
        `.dlm/` directory alone is enough to produce a record (useful when a
      
        8
        user wants a pure drive-by `.dlm/ignore` without writing YAML).
      
        9
        
        10
        Results are sorted by anchor path length ascending, so parents
      
        11
        appear before descendants. This matches the resolution order in
      
        12
        `dlm.directives.merge.effective_config_for`.
      
        13
        
        14
        Malformed YAML or broken lines in `.dlm/ignore` log + degrade — the
      
        15
        walk never fails. The CLI has no way to recover from a mid-train
      
        16
        discovery crash, so tolerance here is load-bearing.
      
        17
        """
      
        18
        
        19
        from __future__ import annotations
      
        20
        
        21
        import logging
      
        22
        from dataclasses import dataclass
      
        23
        from pathlib import Path
      
        24
        
        25
        import yaml
      
        26
        from pydantic import ValidationError
      
        27
        
        28
        from dlm.directives.ignore_parser import IgnoreRule, parse_ignore_file
      
        29
        from dlm.directives.schema import DlmTrainingConfig
      
        30
        from dlm.io.text import DlmEncodingError, read_text
      
        31
        
        32
        _LOG = logging.getLogger(__name__)
      
        33
        
        34
        _CONFIG_FILENAME = "training.yaml"
      
        35
        _IGNORE_FILENAME = "ignore"
      
        36
        
        37
        
        38
        @dataclass(frozen=True)
        class DiscoveredConfig:
            """Everything found in a single `.dlm/` directory, keyed by its anchor.

            The record is immutable. A `.dlm/` directory that holds neither a
            `training.yaml` nor an `ignore` file still yields a record (with
            `config=None` and no ignore rules), so a subtree can be marked
            explicitly without writing any YAML.
            """

            # Directory that *contains* the `.dlm/` dir (the repo root or a
            # subtree root). Relative paths in `config.include` /
            # `config.exclude` and in the ignore rules resolve against it.
            anchor: Path
            # Parsed `.dlm/training.yaml`, or None when there is no usable one.
            config: DlmTrainingConfig | None
            # Rules parsed from `.dlm/ignore`; empty when there are none.
            ignore_rules: tuple[IgnoreRule, ...]
      
        56
        
        57
        
        58
        def discover_configs(root: Path) -> tuple[DiscoveredConfig, ...]:
            """Collect one `DiscoveredConfig` for every `.dlm/` directory under `root`.

            The search is recursive and includes `root` itself, so `<root>/.dlm/`
            is picked up alongside any nested `.dlm/` directories. Entries named
            `.dlm` that are not directories are ignored.

            Args:
                root: Directory to search. Anything that is not a directory
                    yields an empty result.

            Returns:
                The discovered configs ordered by the character length of each
                anchor's POSIX path, shortest first. A directory's path is
                always shorter than that of anything below it, so an ancestor
                anchor precedes its descendants and callers can apply parent
                rules before child rules.
            """
            if not root.is_dir():
                return ()

            # Pre-sorting the candidates keeps equal-length anchors in a
            # deterministic order, because the final sort below is stable.
            candidates = (hit for hit in sorted(root.rglob(".dlm")) if hit.is_dir())
            records = [
                DiscoveredConfig(
                    anchor=dlm_dir.parent,
                    config=_load_training_yaml(dlm_dir / _CONFIG_FILENAME),
                    ignore_rules=_load_ignore(dlm_dir / _IGNORE_FILENAME),
                )
                for dlm_dir in candidates
            ]
            return tuple(sorted(records, key=lambda record: len(record.anchor.as_posix())))
      
        83
        
        84
        
        85
        def _load_training_yaml(path: Path) -> DlmTrainingConfig | None:
            """Load and validate a `.dlm/training.yaml`; any failure yields None.

            Args:
                path: Location of the candidate `training.yaml`.

            Returns:
                The validated `DlmTrainingConfig`, or None when the file is
                missing, cannot be read, is not UTF-8, is not valid YAML, has a
                non-mapping top level, or violates the schema. An empty,
                whitespace-only, or null document is validated as `{}`.

            Every failure other than a missing file logs one warning. Returning
            None instead of raising lets the caller still build a record for the
            anchor, so a neighboring `.dlm/ignore` keeps working.
            """
            if not path.is_file():
                return None

            try:
                text = read_text(path)
            except DlmEncodingError as exc:
                _LOG.warning("discovery: %s: not UTF-8 (%s); skipping config", path, exc)
                return None
            except OSError as exc:
                # The file can vanish or be unreadable (permissions, I/O error)
                # after the is_file() check; discovery must degrade, not crash.
                # NOTE(review): assumes read_text lets OSError propagate — confirm
                # against dlm.io.text.
                _LOG.warning("discovery: %s: unreadable (%s); skipping config", path, exc)
                return None

            try:
                raw = yaml.safe_load(text) if text.strip() else {}
            except yaml.YAMLError as exc:
                _LOG.warning("discovery: %s: invalid YAML (%s); skipping config", path, exc)
                return None

            # A document that parses to null (e.g. only comments) counts as empty.
            if raw is None:
                raw = {}
            if not isinstance(raw, dict):
                _LOG.warning(
                    "discovery: %s: top-level must be a mapping, got %s; skipping config",
                    path,
                    type(raw).__name__,
                )
                return None

            try:
                return DlmTrainingConfig.model_validate(raw)
            except ValidationError as exc:
                _LOG.warning("discovery: %s: schema violation (%s); skipping config", path, exc)
                return None
      
        122
        
        123
        
        124
        def _load_ignore(path: Path) -> tuple[IgnoreRule, ...]:
            """Load and parse a `.dlm/ignore`; any failure yields an empty tuple.

            Args:
                path: Location of the candidate `ignore` file.

            Returns:
                The rules produced by `parse_ignore_file` for the file's text,
                or an empty tuple when the file is missing, cannot be read, or
                is not UTF-8. Read and encoding failures each log one warning.
            """
            if not path.is_file():
                return ()

            try:
                text = read_text(path)
            except DlmEncodingError as exc:
                _LOG.warning("discovery: %s: not UTF-8 (%s); skipping ignore", path, exc)
                return ()
            except OSError as exc:
                # The file can vanish or be unreadable (permissions, I/O error)
                # after the is_file() check; discovery must degrade, not crash.
                # NOTE(review): assumes read_text lets OSError propagate — confirm
                # against dlm.io.text.
                _LOG.warning("discovery: %s: unreadable (%s); skipping ignore", path, exc)
                return ()

            return parse_ignore_file(text)

1	"""Walk a source tree and collect every `.dlm/` configuration.
2
3	One `DiscoveredConfig` per `.dlm/` directory found under the walk
4	root. Each config aggregates both `.dlm/training.yaml` (parsed as
5	`DlmTrainingConfig`) and `.dlm/ignore` (parsed as a tuple of
6	`IgnoreRule`). Either or both may be absent — the presence of the
7	`.dlm/` directory alone is enough to produce a record (useful when a
8	user wants a pure drive-by `.dlm/ignore` without writing YAML).
9
10	Results are sorted by anchor path length ascending, so parents
11	appear before descendants. This matches the resolution order in
12	`dlm.directives.merge.effective_config_for`.
13
14	Malformed YAML or broken lines in `.dlm/ignore` log + degrade — the
15	walk never fails. The CLI has no way to recover from a mid-train
16	discovery crash, so tolerance here is load-bearing.
17	"""
18
19	from __future__ import annotations
20
21	import logging
22	from dataclasses import dataclass
23	from pathlib import Path
24
25	import yaml
26	from pydantic import ValidationError
27
28	from dlm.directives.ignore_parser import IgnoreRule, parse_ignore_file
29	from dlm.directives.schema import DlmTrainingConfig
30	from dlm.io.text import DlmEncodingError, read_text
31
32	_LOG = logging.getLogger(__name__)
33
34	_CONFIG_FILENAME = "training.yaml"
35	_IGNORE_FILENAME = "ignore"
36
37
@dataclass(frozen=True)
class DiscoveredConfig:
    """Everything found in a single `.dlm/` directory, keyed by its anchor.

    The record is immutable. A `.dlm/` directory that holds neither a
    `training.yaml` nor an `ignore` file still yields a record (with
    `config=None` and no ignore rules), so a subtree can be marked
    explicitly without writing any YAML.
    """

    # Directory that *contains* the `.dlm/` dir (the repo root or a subtree
    # root). Relative paths in `config.include` / `config.exclude` and in
    # the ignore rules resolve against it.
    anchor: Path
    # Parsed `.dlm/training.yaml`, or None when there is no usable one.
    config: DlmTrainingConfig | None
    # Rules parsed from `.dlm/ignore`; empty when there are none.
    ignore_rules: tuple[IgnoreRule, ...]
56
57
def discover_configs(root: Path) -> tuple[DiscoveredConfig, ...]:
    """Collect one `DiscoveredConfig` for every `.dlm/` directory under `root`.

    The search is recursive and includes `root` itself, so `<root>/.dlm/`
    is picked up alongside any nested `.dlm/` directories. Entries named
    `.dlm` that are not directories are ignored.

    Args:
        root: Directory to search. Anything that is not a directory yields
            an empty result.

    Returns:
        The discovered configs ordered by the character length of each
        anchor's POSIX path, shortest first. A directory's path is always
        shorter than that of anything below it, so an ancestor anchor
        precedes its descendants and callers can apply parent rules
        before child rules.
    """
    if not root.is_dir():
        return ()

    # Pre-sorting the candidates keeps equal-length anchors in a
    # deterministic order, because the final sort below is stable.
    candidates = (hit for hit in sorted(root.rglob(".dlm")) if hit.is_dir())
    records = [
        DiscoveredConfig(
            anchor=dlm_dir.parent,
            config=_load_training_yaml(dlm_dir / _CONFIG_FILENAME),
            ignore_rules=_load_ignore(dlm_dir / _IGNORE_FILENAME),
        )
        for dlm_dir in candidates
    ]
    return tuple(sorted(records, key=lambda record: len(record.anchor.as_posix())))
83
84
def _load_training_yaml(path: Path) -> DlmTrainingConfig | None:
    """Load and validate a `.dlm/training.yaml`; any failure yields None.

    Args:
        path: Location of the candidate `training.yaml`.

    Returns:
        The validated `DlmTrainingConfig`, or None when the file is
        missing, cannot be read, is not UTF-8, is not valid YAML, has a
        non-mapping top level, or violates the schema. An empty,
        whitespace-only, or null document is validated as `{}`.

    Every failure other than a missing file logs one warning. Returning
    None instead of raising lets the caller still build a record for the
    anchor, so a neighboring `.dlm/ignore` keeps working.
    """
    if not path.is_file():
        return None

    try:
        text = read_text(path)
    except DlmEncodingError as exc:
        _LOG.warning("discovery: %s: not UTF-8 (%s); skipping config", path, exc)
        return None
    except OSError as exc:
        # The file can vanish or be unreadable (permissions, I/O error)
        # after the is_file() check; discovery must degrade, not crash.
        # NOTE(review): assumes read_text lets OSError propagate — confirm
        # against dlm.io.text.
        _LOG.warning("discovery: %s: unreadable (%s); skipping config", path, exc)
        return None

    try:
        raw = yaml.safe_load(text) if text.strip() else {}
    except yaml.YAMLError as exc:
        _LOG.warning("discovery: %s: invalid YAML (%s); skipping config", path, exc)
        return None

    # A document that parses to null (e.g. only comments) counts as empty.
    if raw is None:
        raw = {}
    if not isinstance(raw, dict):
        _LOG.warning(
            "discovery: %s: top-level must be a mapping, got %s; skipping config",
            path,
            type(raw).__name__,
        )
        return None

    try:
        return DlmTrainingConfig.model_validate(raw)
    except ValidationError as exc:
        _LOG.warning("discovery: %s: schema violation (%s); skipping config", path, exc)
        return None
122
123
def _load_ignore(path: Path) -> tuple[IgnoreRule, ...]:
    """Load and parse a `.dlm/ignore`; any failure yields an empty tuple.

    Args:
        path: Location of the candidate `ignore` file.

    Returns:
        The rules produced by `parse_ignore_file` for the file's text, or
        an empty tuple when the file is missing, cannot be read, or is not
        UTF-8. Read and encoding failures each log one warning.
    """
    if not path.is_file():
        return ()

    try:
        text = read_text(path)
    except DlmEncodingError as exc:
        _LOG.warning("discovery: %s: not UTF-8 (%s); skipping ignore", path, exc)
        return ()
    except OSError as exc:
        # The file can vanish or be unreadable (permissions, I/O error)
        # after the is_file() check; discovery must degrade, not crash.
        # NOTE(review): assumes read_text lets OSError propagate — confirm
        # against dlm.io.text.
        _LOG.warning("discovery: %s: unreadable (%s); skipping ignore", path, exc)
        return ()

    return parse_ignore_file(text)