"""`dlm train <dir>` auto-scaffold — zero-ceremony directory training.

When the CLI arg points at a directory rather than a `.dlm` file,
`scaffold_train_target` resolves or creates the `.dlm` file that will
drive the train:

1. Look for `<dir>/.dlm/*.dlm`. Exactly one match → reuse it.
   Multiple matches + no `--name` → refuse with the candidates listed.
   `--name <n>` narrows to `<dir>/.dlm/<n>.dlm`.
2. No match → require `--base` (no silent default), mint a fresh
   ULID, write `<dir>/.dlm/corpus.dlm` (or `<name>.dlm`) with
   `training.sources` built from the CLI flags, and return its
   path. The caller then proceeds as if the `.dlm` had been passed
   directly.

`--rescaffold` rewrites an existing scaffolded `.dlm` in place (same
ULID kept so the store stays intact). Without it, re-running with
frontmatter-editing flags refuses to shadow-edit the on-disk config.
"""
| 20 |
|
| 21 |
from __future__ import annotations |
| 22 |
|
| 23 |
import logging |
| 24 |
from dataclasses import dataclass |
| 25 |
from pathlib import Path |
| 26 |
from typing import Literal |
| 27 |
|
| 28 |
from dlm.doc.errors import DlmParseError |
| 29 |
from dlm.doc.parser import parse_file |
| 30 |
from dlm.doc.schema import CURRENT_SCHEMA_VERSION |
| 31 |
from dlm.io.atomic import write_text as atomic_write_text |
| 32 |
from dlm.io.ulid import mint_ulid |
| 33 |
|
| 34 |
# Module-level logger, named after this module.
_LOG = logging.getLogger(__name__)

# Name of the subdirectory (joined onto the training target) that holds
# the scaffolded `.dlm` files.
_SCAFFOLD_DIR = ".dlm"
# File stem compared against `name` to decide whether the caller is
# using the default name.
_DEFAULT_NAME = "corpus"


# Accepted values for the `sources_policy` frontmatter key.
Policy = Literal["permissive", "strict"]
| 41 |
|
| 42 |
|
| 43 |
class ScaffoldError(DlmParseError):
    """Error raised when directory scaffolding cannot proceed.

    Derives from `DlmParseError` so the CLI reporter gives it the same
    uniform `file:line:col` treatment as parse failures, even though a
    scaffold error carries no line/column information — only an
    optional `path`.
    """

    def __init__(self, message: str, *, path: Path | None = None) -> None:
        # Only the message and the (optional) offending path are
        # forwarded to the base class.
        super().__init__(message, path=path)
| 52 |
|
| 53 |
|
| 54 |
@dataclass(frozen=True)
class ScaffoldResult:
    """Outcome of resolving or creating the `.dlm`, as reported by the CLI."""

    # Location of the `.dlm` file the trainer should consume.
    dlm_path: Path
    # True when a file was written on this invocation (first run or
    # --rescaffold); False when an existing file was reused.
    scaffolded: bool
    # The `dlm_id` associated with `dlm_path`.
    dlm_id: str
| 63 |
|
| 64 |
|
| 65 |
def scaffold_train_target(
    target: Path,
    *,
    base: str | None,
    include: tuple[str, ...],
    exclude: tuple[str, ...],
    recursive: bool,
    name: str,
    policy: Policy,
    rescaffold: bool = False,
) -> ScaffoldResult:
    """Return the `.dlm` that should drive a train rooted at directory `target`.

    An existing file under `<target>/.dlm/` is reused when possible;
    otherwise (or whenever `rescaffold` is set) `<target>/.dlm/<name>.dlm`
    is written from the supplied flags.

    Raises `ScaffoldError` when `target` is missing or is not a
    directory, when several `.dlm` files exist, `name` is the default
    and none of them carries that name, or when a file has to be
    written but `base` is None.
    """
    if not target.exists():
        raise ScaffoldError(f"target does not exist: {target}", path=target)
    if not target.is_dir():
        raise ScaffoldError(f"scaffold expects a directory, got file: {target}", path=target)

    scaffold_dir = target / _SCAFFOLD_DIR
    candidates: list[Path] = []
    if scaffold_dir.is_dir():
        candidates = sorted(scaffold_dir.glob("*.dlm"))

    if not rescaffold:
        # Resume path. Precedence:
        #   1. a file whose stem equals `name` always wins;
        #   2. with the default name, a lone `.dlm` is adopted as-is;
        #   3. with the default name and several files, the choice is
        #      ambiguous and the caller must pass --name.
        reuse: Path | None = None
        for candidate in candidates:
            if candidate.stem == name:
                reuse = candidate
                break

        if reuse is None and name == _DEFAULT_NAME:
            if len(candidates) == 1:
                reuse = candidates[0]
            elif len(candidates) > 1:
                hints = [f" dlm train {target} --name {c.stem}" for c in candidates]
                listing = "\n".join(hints)
                raise ScaffoldError(
                    f"multiple .dlm files found under {scaffold_dir}; "
                    f"pass --name to pick one:\n{listing}",
                    path=target,
                )

        if reuse is not None:
            return ScaffoldResult(
                dlm_path=reuse,
                scaffolded=False,
                dlm_id=_dlm_id_from_file(reuse),
            )

    # From here on a file will be (re)written, which needs a base model.
    if base is None:
        raise ScaffoldError(
            "first-run scaffold requires --base <key>. Pick from the base "
            "registry (e.g. smollm2-135m, qwen2.5-coder-1.5b) or pass "
            "--base hf:<org>/<name> for an off-registry model.",
            path=target,
        )

    dlm_path = scaffold_dir / f"{name}.dlm"

    # A rescaffold of an existing file keeps that file's id; every other
    # case (including an empty id read back) gets a freshly minted one.
    # NOTE(review): with --rescaffold the target is always `<name>.dlm`;
    # unlike the resume path, a lone differently-named `.dlm` is not
    # adopted when `name` is the default — confirm this is intended.
    dlm_id = ""
    if rescaffold and dlm_path.is_file():
        dlm_id = _dlm_id_from_file(dlm_path)
    if not dlm_id:
        dlm_id = mint_ulid()

    scaffold_dir.mkdir(parents=True, exist_ok=True)
    _write_scaffold(
        dlm_path=dlm_path,
        dlm_id=dlm_id,
        base=base,
        include=include,
        exclude=exclude,
        recursive=recursive,
        policy=policy,
        target=target,
    )
    _LOG.info("scaffold: wrote %s (dlm_id=%s, base=%s)", dlm_path, dlm_id, base)
    return ScaffoldResult(dlm_path=dlm_path, scaffolded=True, dlm_id=dlm_id)
| 141 |
|
| 142 |
|
| 143 |
def _dlm_id_from_file(path: Path) -> str:
    """Return the `dlm_id` stored in the frontmatter of the `.dlm` at `path`.

    The file is parsed with `parse_file`; any error that parser raises
    propagates to the caller.
    """
    frontmatter = parse_file(path).frontmatter
    return frontmatter.dlm_id
| 147 |
|
| 148 |
|
| 149 |
def _write_scaffold(
    *,
    dlm_path: Path,
    dlm_id: str,
    base: str,
    include: tuple[str, ...],
    exclude: tuple[str, ...],
    recursive: bool,
    policy: Policy,
    target: Path,
) -> None:
    """Serialize a minimal `.dlm` frontmatter + body to `dlm_path`.

    Writes directly as YAML-frontmatter text rather than round-tripping
    through the `DlmFrontmatter` serializer — the scaffold is small,
    the frontmatter is a fixed shape, and keeping the write path
    independent of the full parser simplifies bootstrapping for
    directory-first users.

    The emitted document has a single entry under `training.sources`
    whose `path`, `include` and (when non-empty) `exclude` keys are
    nested inside that entry.

    The `path:` field is the absolute resolved `target`, not `"."`. A
    relative `"."` would anchor on `dlm_path.parent` (the `.dlm/`
    directory), which doesn't contain the user's files and is
    default-excluded by the descent protocol — the first scaffolded
    train would ingest zero content.

    Args:
        dlm_path: Destination file; written via `atomic_write_text`.
        dlm_id: Identifier recorded as `dlm_id`.
        base: Value recorded as `base_model`.
        include: User-supplied include globs (may be empty).
        exclude: User-supplied exclude globs; the `exclude:` key is
            omitted entirely when this is empty.
        recursive: Selects the fallback include glob when `include`
            is empty (see `_build_include_globs`).
        policy: Value recorded as `training.sources_policy`.
        target: Directory whose resolved POSIX path becomes the
            source `path`.
    """
    effective_include = _build_include_globs(include, recursive=recursive)
    lines: list[str] = [
        "---",
        f"dlm_id: {dlm_id}",
        f"dlm_version: {CURRENT_SCHEMA_VERSION}",
        # NOTE(review): `base` is emitted as a plain (unquoted) scalar;
        # this assumes registry keys / `hf:` refs never contain
        # YAML-significant sequences such as ": " or " #" — confirm.
        f"base_model: {base}",
        "training:",
        f"  sources_policy: {policy}",
        "  sources:",
        # User-controlled strings go through `_yaml_quote` so that a
        # quote or backslash in a path/glob cannot corrupt the document.
        f"    - path: {_yaml_quote(target.resolve().as_posix())}",
        # `include`/`exclude` are indented to sit inside the sequence
        # item opened by `- path:`; at a shallower indent they would be
        # parsed as keys of `training` rather than of the source entry.
        "      include:",
    ]
    lines.extend(f"        - {_yaml_quote(pat)}" for pat in effective_include)
    if exclude:
        lines.append("      exclude:")
        lines.extend(f"        - {_yaml_quote(pat)}" for pat in exclude)
    lines.extend(
        [
            "---",
            "",
            "# Auto-scaffolded by `dlm train`. Edit the frontmatter above to refine training.",
            "",
        ]
    )
    atomic_write_text(dlm_path, "\n".join(lines))


def _yaml_quote(text: str) -> str:
    """Render `text` as a YAML double-quoted scalar with proper escaping."""
    # Function-scope import keeps this block self-contained.
    import json

    # A JSON string literal is also a valid YAML double-quoted scalar.
    # For text without quotes, backslashes or control characters the
    # result is simply `"<text>"`. `ensure_ascii=False` leaves non-ASCII
    # characters as-is instead of turning them into `\uXXXX` escapes.
    return json.dumps(text, ensure_ascii=False)
| 201 |
|
| 202 |
|
| 203 |
def _build_include_globs(include: tuple[str, ...], *, recursive: bool) -> tuple[str, ...]:
    """Translate `--include` / `--recursive` into frontmatter include globs.

    * No `--include`, recursive     → `("**/*",)`: every file the
      descent protocol approves.
    * No `--include`, non-recursive → `("*",)`: top-level files only.
    * Any explicit `--include`      → returned exactly as given;
      `--recursive` never rewrites user-supplied patterns (users
      writing `*.py` know what they want).
    """
    if not include:
        fallback = "**/*" if recursive else "*"
        return (fallback,)
    return include