Python · 7798 bytes Raw Blame History
1 """`dlm train <dir>` auto-scaffold — zero-ceremony directory training.
2
3 When the CLI arg points at a directory rather than a `.dlm` file,
4 `scaffold_train_target` resolves or creates the `.dlm` file that will
5 drive the train:
6
7 1. Look for `<dir>/.dlm/*.dlm`. Exactly one match → reuse it.
8 Multiple matches + no `--name` → refuse with the candidates listed.
9 `--name <n>` narrows to `<dir>/.dlm/<n>.dlm`.
10 2. No match → require `--base` (no silent default), mint a fresh
11 ULID, write `<dir>/.dlm/corpus.dlm` (or `<name>.dlm`) with
12 `training.sources` built from the CLI flags, and return its
13 path. The caller then proceeds as if the `.dlm` was passed
14 directly.
15
16 `--rescaffold` rewrites an existing scaffolded `.dlm` in place (same
17 ULID kept so the store stays intact). Without it, re-running with
18 frontmatter-editing flags refuses to shadow-edit the on-disk config.
19 """
20
21 from __future__ import annotations
22
23 import logging
24 from dataclasses import dataclass
25 from pathlib import Path
26 from typing import Literal
27
28 from dlm.doc.errors import DlmParseError
29 from dlm.doc.parser import parse_file
30 from dlm.doc.schema import CURRENT_SCHEMA_VERSION
31 from dlm.io.atomic import write_text as atomic_write_text
32 from dlm.io.ulid import mint_ulid
33
34 _LOG = logging.getLogger(__name__)
35
36 _SCAFFOLD_DIR = ".dlm"
37 _DEFAULT_NAME = "corpus"
38
39
40 Policy = Literal["permissive", "strict"]
41
42
class ScaffoldError(DlmParseError):
    """Raised when directory scaffolding cannot resolve or create a `.dlm`.

    Deriving from `DlmParseError` sends these failures through the same
    `file:line:col` CLI reporting as parse errors, even though a scaffold
    failure only ever carries a path — never a line or a column.
    """

    def __init__(self, message: str, *, path: Path | None = None) -> None:
        # Only `message` and `path` are forwarded; line/col are never supplied.
        super().__init__(message, path=path)
52
53
@dataclass(frozen=True)
class ScaffoldResult:
    """Outcome of resolving or creating the `.dlm` for a directory target.

    This is what the CLI command reports once scaffolding has finished.
    """

    # Path of the `.dlm` file the trainer should consume.
    dlm_path: Path
    # True when a file was written on this invocation (first run or
    # --rescaffold); False when an existing file was reused as-is.
    scaffolded: bool
    # The `dlm_id` of the file at `dlm_path`.
    dlm_id: str
63
64
def scaffold_train_target(
    target: Path,
    *,
    base: str | None,
    include: tuple[str, ...],
    exclude: tuple[str, ...],
    recursive: bool,
    name: str,
    policy: Policy,
    rescaffold: bool = False,
) -> ScaffoldResult:
    """Resolve or scaffold a `.dlm` file anchoring `target` (a directory).

    Candidates are the regular files matching `<target>/.dlm/*.dlm`. The
    *anchor* is the candidate whose stem equals `name`; failing that, when
    `name` is the default (`corpus`) and exactly one candidate exists, that
    single candidate is the anchor.

    * Without `rescaffold`: an anchor is reused untouched. No anchor, the
      default name and several candidates is refused as ambiguous.
    * Otherwise a `.dlm` is written. With `rescaffold` and an anchor, the
      anchor itself is rewritten in place; in every other case the file is
      `<target>/.dlm/<name>.dlm`. When `rescaffold` is set and the file
      being written already exists, its `dlm_id` is kept; otherwise a new
      ULID is minted.

    Args:
        target: Directory to train on.
        base: Base-model key written to the frontmatter; required whenever
            a file has to be written.
        include: Explicit include globs (may be empty).
        exclude: Exclude globs (may be empty).
        recursive: Selects the default include glob when `include` is empty.
        name: Stem of the `.dlm` file to resolve or create.
        policy: Value written as `sources_policy`.
        rescaffold: Rewrite the resolved `.dlm` instead of reusing it.

    Returns:
        A `ScaffoldResult` pointing at the `.dlm` the trainer should
        consume; `scaffolded` is True only when a file was written.

    Raises:
        ScaffoldError: `target` is missing or not a directory, the tree
            holds several `.dlm` files and the default name cannot pick
            one, or a file must be written but `base` is None.
    """
    if not target.exists():
        raise ScaffoldError(f"target does not exist: {target}", path=target)
    if not target.is_dir():
        raise ScaffoldError(f"scaffold expects a directory, got file: {target}", path=target)

    dlm_dir = target / _SCAFFOLD_DIR
    # `glob` yields directories as well as files. Only regular files can be
    # parsed as a `.dlm`, so a stray `<x>.dlm/` directory must neither count
    # as a match nor make the tree look ambiguous.
    existing = (
        sorted(p for p in dlm_dir.glob("*.dlm") if p.is_file()) if dlm_dir.is_dir() else []
    )
    named_match = next((c for c in existing if c.stem == name), None)
    name_is_default = name == _DEFAULT_NAME

    # Resolve the anchor once so resume and rescaffold agree on which file
    # "the existing .dlm" is:
    # (1) a file whose stem matches `name` always wins;
    # (2) default name + a single file on disk: assume the caller means
    #     the one `.dlm` that is there.
    anchor = named_match
    if anchor is None and name_is_default and len(existing) == 1:
        anchor = existing[0]

    # --- Resume: reuse an existing .dlm when appropriate --------------
    if not rescaffold:
        if anchor is not None:
            return ScaffoldResult(
                dlm_path=anchor,
                scaffolded=False,
                dlm_id=_dlm_id_from_file(anchor),
            )
        if name_is_default and len(existing) > 1:
            # Default name, several files, none called `corpus`: ambiguous
            # without --name, so list the exact commands that would work.
            listing = "\n".join(f" dlm train {target} --name {c.stem}" for c in existing)
            raise ScaffoldError(
                f"multiple .dlm files found under {target / _SCAFFOLD_DIR}; "
                f"pass --name to pick one:\n{listing}",
                path=target,
            )

    # --- Scaffold or rescaffold path ----------------------------------
    if base is None:
        raise ScaffoldError(
            "first-run scaffold requires --base <key>. Pick from the base "
            "registry (e.g. smollm2-135m, qwen2.5-coder-1.5b) or pass "
            "--base hf:<org>/<name> for an off-registry model.",
            path=target,
        )

    # `anchor` can only be non-None here when `rescaffold` is set (the
    # resume branch above returns otherwise). Rewriting the anchor in place
    # avoids creating a second `.dlm` with a new ULID next to the file the
    # caller has been training from.
    dlm_path = anchor if anchor is not None else dlm_dir / f"{name}.dlm"
    existing_id = _dlm_id_from_file(dlm_path) if rescaffold and dlm_path.is_file() else None

    # Keep the ULID of a rewritten file; mint one for a brand-new file.
    dlm_id = existing_id or mint_ulid()
    dlm_dir.mkdir(parents=True, exist_ok=True)
    _write_scaffold(
        dlm_path=dlm_path,
        dlm_id=dlm_id,
        base=base,
        include=include,
        exclude=exclude,
        recursive=recursive,
        policy=policy,
        target=target,
    )
    _LOG.info("scaffold: wrote %s (dlm_id=%s, base=%s)", dlm_path, dlm_id, base)
    return ScaffoldResult(dlm_path=dlm_path, scaffolded=True, dlm_id=dlm_id)
141
142
def _dlm_id_from_file(path: Path) -> str:
    """Return the `dlm_id` stored in the frontmatter of the `.dlm` at `path`.

    The file is read through `parse_file`; nothing is caught here, so any
    error it raises propagates to the caller.
    """
    return parse_file(path).frontmatter.dlm_id
147
148
def _write_scaffold(
    *,
    dlm_path: Path,
    dlm_id: str,
    base: str,
    include: tuple[str, ...],
    exclude: tuple[str, ...],
    recursive: bool,
    policy: Policy,
    target: Path,
) -> None:
    """Serialize a minimal `.dlm` frontmatter + body to `dlm_path`.

    Writes directly as YAML-frontmatter text rather than round-tripping
    through the `DlmFrontmatter` serializer — the scaffold is small,
    the frontmatter is a fixed shape, and keeping the write path
    independent of the full parser simplifies bootstrapping for
    directory-first users.

    The `path:` field is the absolute resolved `target`, not `"."`. A
    relative `"."` would anchor on `dlm_path.parent` (the `.dlm/`
    directory), which doesn't contain the user's files and is
    default-excluded by the descent protocol — the first scaffolded
    train would ingest zero content.

    Args:
        dlm_path: Destination file; written via `atomic_write_text`.
        dlm_id: Identifier written as `dlm_id`.
        base: Value written as `base_model`.
        include: Explicit include globs; when empty the default chosen by
            `_build_include_globs` is written instead.
        exclude: Exclude globs; the `exclude:` key is omitted when empty.
        recursive: Forwarded to `_build_include_globs`.
        policy: Value written as `sources_policy`.
        target: Directory whose resolved absolute path becomes the source
            `path`.
    """
    import json  # function-scope import: only this block needs it

    def quoted(value: str) -> str:
        # A JSON string literal is also a valid YAML double-quoted scalar,
        # so `"` and `\` in a path or glob are escaped instead of corrupting
        # the frontmatter. `ensure_ascii=False` keeps non-ASCII characters
        # literal, so ordinary values render exactly as `"value"`.
        return json.dumps(value, ensure_ascii=False)

    effective_include = _build_include_globs(include, recursive=recursive)
    # Indentation is significant: `path`, `include` and `exclude` are keys
    # of the single list item under `training.sources`, so they sit deeper
    # than `sources:` and the glob lists sit deeper still.
    lines: list[str] = [
        "---",
        f"dlm_id: {dlm_id}",
        f"dlm_version: {CURRENT_SCHEMA_VERSION}",
        f"base_model: {base}",
        "training:",
        f"  sources_policy: {policy}",
        "  sources:",
        f"    - path: {quoted(target.resolve().as_posix())}",
        "      include:",
        *(f"        - {quoted(pat)}" for pat in effective_include),
    ]
    if exclude:
        lines.append("      exclude:")
        lines.extend(f"        - {quoted(pat)}" for pat in exclude)
    lines.extend(
        [
            "---",
            "",
            "# Auto-scaffolded by `dlm train`. Edit the frontmatter above to refine training.",
            # Trailing empty entry so the joined text ends with a newline.
            "",
        ]
    )
    atomic_write_text(dlm_path, "\n".join(lines))
201
202
203 def _build_include_globs(include: tuple[str, ...], *, recursive: bool) -> tuple[str, ...]:
204 """Map `--include` flags + `--recursive` to frontmatter globs.
205
206 Empty `--include` + `--recursive` → `["**/*"]`: train on every
207 file the descent protocol approves.
208 Empty `--include` + `--no-recursive` → `["*"]`: top-level files
209 only.
210 Explicit `--include` globs are passed through unchanged; the
211 `--recursive` flag doesn't transform user-supplied patterns (users
212 writing `*.py` know what they want).
213 """
214 if include:
215 return include
216 return ("**/*",) if recursive else ("*",)