| 1 |
"""Walk a source tree and collect every `.dlm/` configuration. |
| 2 |
|
| 3 |
One `DiscoveredConfig` per `.dlm/` directory found under the walk |
| 4 |
root. Each config aggregates both `.dlm/training.yaml` (parsed as |
| 5 |
`DlmTrainingConfig`) and `.dlm/ignore` (parsed as a tuple of |
| 6 |
`IgnoreRule`). Either or both may be absent — the presence of the |
| 7 |
`.dlm/` directory alone is enough to produce a record (useful when a |
| 8 |
user wants a pure drive-by `.dlm/ignore` without writing YAML). |
| 9 |
|
| 10 |
Results are sorted by anchor path length ascending, so parents |
| 11 |
appear before descendants. This matches the resolution order in |
| 12 |
`dlm.directives.merge.effective_config_for`. |
| 13 |
|
| 14 |
Malformed YAML or broken lines in `.dlm/ignore` log + degrade — the |
| 15 |
walk never fails. The CLI has no way to recover from a mid-train |
| 16 |
discovery crash, so tolerance here is load-bearing. |
| 17 |
""" |
| 18 |
|
| 19 |
from __future__ import annotations |
| 20 |
|
| 21 |
import logging |
| 22 |
from dataclasses import dataclass |
| 23 |
from pathlib import Path |
| 24 |
|
| 25 |
import yaml |
| 26 |
from pydantic import ValidationError |
| 27 |
|
| 28 |
from dlm.directives.ignore_parser import IgnoreRule, parse_ignore_file |
| 29 |
from dlm.directives.schema import DlmTrainingConfig |
| 30 |
from dlm.io.text import DlmEncodingError, read_text |
| 31 |
|
| 32 |
_LOG = logging.getLogger(__name__) |
| 33 |
|
| 34 |
_CONFIG_FILENAME = "training.yaml" |
| 35 |
_IGNORE_FILENAME = "ignore" |
| 36 |
|
| 37 |
|
| 38 |
@dataclass(frozen=True)
class DiscoveredConfig:
    """Everything discovered in the `.dlm/` directory of one anchor.

    Instances are immutable records. A `.dlm/` directory that contains
    neither a training YAML nor an ignore file is still represented (as
    a no-op record with `config=None` and no rules), so a subtree can be
    marked explicitly without writing any YAML.
    """

    # Directory that *contains* the `.dlm/` dir (the repo root or a
    # subtree root). Relative paths in `config.include` /
    # `config.exclude` and in the ignore rules resolve against it.
    anchor: Path
    # Parsed `.dlm/training.yaml`, or None when there is no usable one.
    config: DlmTrainingConfig | None
    # Parsed `.dlm/ignore` rules; an empty tuple when there are none.
    ignore_rules: tuple[IgnoreRule, ...]
| 56 |
|
| 57 |
|
| 58 |
def discover_configs(root: Path) -> tuple[DiscoveredConfig, ...]:
    """Find every `.dlm/` directory under `root` and load what it holds.

    `root` itself is included: when `<root>/.dlm/` exists, its record is
    the shallowest one. Entries named `.dlm` that are not directories
    are skipped.

    Args:
        root: Directory to search recursively.

    Returns:
        One `DiscoveredConfig` per `.dlm/` directory, ordered by anchor
        depth (number of path components) ascending, so a parent anchor
        always precedes its descendants and callers can apply parent
        rules before child rules. Anchors of equal depth keep their
        sorted path order. An empty tuple when `root` is not a
        directory.
    """
    if not root.is_dir():
        return ()

    discovered: list[DiscoveredConfig] = []
    # Sorting the walk makes the load order (and therefore the order of
    # any warnings) deterministic, and provides the tie-break for the
    # stable depth sort below.
    for dlm_dir in sorted(root.rglob(".dlm")):
        if not dlm_dir.is_dir():
            continue
        discovered.append(
            DiscoveredConfig(
                anchor=dlm_dir.parent,
                config=_load_training_yaml(dlm_dir / _CONFIG_FILENAME),
                ignore_rules=_load_ignore(dlm_dir / _IGNORE_FILENAME),
            )
        )

    # Order by component count, not by the character length of the path:
    # string length would place a deep anchor with short names ahead of
    # a shallow anchor with a long name. Both keys keep parents before
    # descendants; only this one is actually depth-ascending.
    discovered.sort(key=lambda found: len(found.anchor.parts))
    return tuple(discovered)
| 83 |
|
| 84 |
|
| 85 |
def _load_training_yaml(path: Path) -> DlmTrainingConfig | None:
    """Load and validate a `.dlm/training.yaml`, degrading to None.

    A path that is not a regular file yields None silently. Every other
    failure (unreadable file, encoding error, malformed YAML, a
    top-level value that is not a mapping, schema violation) logs one
    warning and yields None. The anchor still produces a
    `DiscoveredConfig` (just with `config=None`), so a neighboring
    `.dlm/ignore` at the same anchor keeps working.

    An empty, whitespace-only, or otherwise null document is validated
    as an empty mapping.

    Args:
        path: Location of the candidate `training.yaml`.

    Returns:
        The validated config, or None when no usable config exists.
    """
    if not path.is_file():
        return None

    try:
        text = read_text(path)
    except DlmEncodingError as exc:
        _LOG.warning("discovery: %s: not UTF-8 (%s); skipping config", path, exc)
        return None
    except OSError as exc:
        # Permission problems, or the file disappearing after the
        # is_file() check, must not abort the whole walk.
        _LOG.warning("discovery: %s: unreadable (%s); skipping config", path, exc)
        return None

    try:
        # Whitespace-only text is short-circuited instead of being handed
        # to the YAML parser.
        raw = yaml.safe_load(text) if text.strip() else {}
    except yaml.YAMLError as exc:
        _LOG.warning("discovery: %s: invalid YAML (%s); skipping config", path, exc)
        return None

    if raw is None:
        # The parser returned no document content; treat it as empty.
        raw = {}
    if not isinstance(raw, dict):
        _LOG.warning(
            "discovery: %s: top-level must be a mapping, got %s; skipping config",
            path,
            type(raw).__name__,
        )
        return None

    try:
        return DlmTrainingConfig.model_validate(raw)
    except ValidationError as exc:
        _LOG.warning("discovery: %s: schema violation (%s); skipping config", path, exc)
        return None
| 122 |
|
| 123 |
|
| 124 |
def _load_ignore(path: Path) -> tuple[IgnoreRule, ...]:
    """Load and parse a `.dlm/ignore`, degrading to no rules.

    A path that is not a regular file yields an empty tuple silently. A
    file that cannot be read or decoded logs one warning and also yields
    an empty tuple. Malformed lines are left to `parse_ignore_file`,
    which is expected to log and skip them rather than raise.

    Args:
        path: Location of the candidate ignore file.

    Returns:
        The parsed rules, or an empty tuple when none are available.
    """
    if not path.is_file():
        return ()

    try:
        text = read_text(path)
    except DlmEncodingError as exc:
        _LOG.warning("discovery: %s: not UTF-8 (%s); skipping ignore", path, exc)
        return ()
    except OSError as exc:
        # Permission problems, or the file disappearing after the
        # is_file() check, must not abort the whole walk.
        _LOG.warning("discovery: %s: unreadable (%s); skipping ignore", path, exc)
        return ()

    return parse_ignore_file(text)