"""`dlm train` auto-scaffold — zero-ceremony directory training.

When the CLI arg points at a directory rather than a `.dlm` file,
`scaffold_train_target` resolves or creates the `.dlm` file that will
drive the train:

1. Look for `<dir>/.dlm/*.dlm`. Exactly one match → reuse it.
   Multiple matches + no `--name` → refuse with the candidates listed.
   `--name <name>` narrows to `<dir>/.dlm/<name>.dlm`.
2. No match → require `--base` (no silent default), mint a fresh
   ULID, write `<dir>/.dlm/corpus.dlm` (or `<name>.dlm`) with
   `training.sources` built from the CLI flags, and return its
   path. The caller then proceeds as if the `.dlm` was passed
   directly.

`--rescaffold` rewrites an existing scaffolded `.dlm` in place (same
ULID kept so the store stays intact). Without it, re-running with
frontmatter-editing flags refuses to shadow-edit the on-disk config.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Literal
from dlm.doc.errors import DlmParseError
from dlm.doc.parser import parse_file
from dlm.doc.schema import CURRENT_SCHEMA_VERSION
from dlm.io.atomic import write_text as atomic_write_text
from dlm.io.ulid import mint_ulid
_LOG = logging.getLogger(__name__)

# Name of the directory (created under the train target) that holds
# scaffolded .dlm files.
_SCAFFOLD_DIR = ".dlm"
# File stem used when the user passes no explicit --name.
_DEFAULT_NAME = "corpus"

# Accepted values for the frontmatter `training.sources_policy` field.
Policy = Literal["permissive", "strict"]
class ScaffoldError(DlmParseError):
    """Scaffold-mode failure surfaced through the CLI reporter.

    Subclass of DlmParseError so it gets the uniform `file:line:col`
    treatment even though scaffold errors don't have line/col info.
    """

    def __init__(self, message: str, *, path: Path | None = None) -> None:
        # Forward `path` so the reporter can prefix the offending target;
        # scaffold errors never carry line/col information.
        super().__init__(message, path=path)
@dataclass(frozen=True)
class ScaffoldResult:
    """Outcome the CLI command reports after resolving or creating a `.dlm`."""

    # Path of the `.dlm` file the trainer should consume.
    dlm_path: Path
    # True when this invocation wrote a new file (first run or
    # --rescaffold); False when an existing one was reused.
    scaffolded: bool
    # ULID identifying the document (minted fresh or read from disk).
    dlm_id: str
def scaffold_train_target(
    target: Path,
    *,
    base: str | None,
    include: tuple[str, ...],
    exclude: tuple[str, ...],
    recursive: bool,
    name: str,
    policy: Policy,
    rescaffold: bool = False,
) -> ScaffoldResult:
    """Resolve or scaffold a `.dlm` file anchoring `target` (a directory).

    Args:
        target: Directory the user pointed `dlm train` at.
        base: Base-model id; required only when a new `.dlm` must be
            written (first run or --rescaffold). No silent default.
        include: `--include` globs, forwarded into the frontmatter.
        exclude: `--exclude` globs, forwarded into the frontmatter.
        recursive: Shapes the default include globs when `include` is empty.
        name: Stem of the `.dlm` file under `<target>/.dlm/`.
        policy: Value written as `training.sources_policy` when scaffolding.
        rescaffold: Rewrite an existing scaffolded `.dlm` in place,
            keeping its ULID so the store stays intact.

    Returns:
        A `ScaffoldResult` pointing at the `.dlm` the trainer should
        consume.

    Raises:
        ScaffoldError: When `target` is missing or not a directory, when
            a multi-`.dlm` tree was passed without `--name`, or when
            scaffolding is needed but `--base` is missing.
    """
    # Guard clauses: scaffold mode only makes sense for an existing dir.
    if not target.exists():
        raise ScaffoldError(f"target does not exist: {target}", path=target)
    if not target.is_dir():
        raise ScaffoldError(f"scaffold expects a directory, got file: {target}", path=target)

    dlm_dir = target / _SCAFFOLD_DIR
    existing = sorted(dlm_dir.glob("*.dlm")) if dlm_dir.is_dir() else []
    named_match = next((c for c in existing if c.stem == name), None)
    name_is_default = name == _DEFAULT_NAME

    # --- Resume: reuse an existing .dlm when appropriate --------------
    # (1) Explicit --name matches an existing file: reuse that one.
    # (2) Default --name (corpus), single existing file: reuse it as a
    #     convenience (no explicit name given; we assume the user wants
    #     the one `.dlm` that's there).
    # (3) Default --name, multiple existing files: refuse (ambiguous
    #     without --name disambiguation).
    if not rescaffold:
        if named_match is not None:
            return ScaffoldResult(
                dlm_path=named_match,
                scaffolded=False,
                dlm_id=_dlm_id_from_file(named_match),
            )
        if name_is_default and len(existing) == 1:
            return ScaffoldResult(
                dlm_path=existing[0],
                scaffolded=False,
                dlm_id=_dlm_id_from_file(existing[0]),
            )
        if name_is_default and len(existing) > 1:
            listing = "\n".join(f"  dlm train {target} --name {c.stem}" for c in existing)
            raise ScaffoldError(
                f"multiple .dlm files found under {target / _SCAFFOLD_DIR}; "
                f"pass --name to pick one:\n{listing}",
                path=target,
            )

    # --- Scaffold or rescaffold path ----------------------------------
    if base is None:
        # Fixed: the message previously read "requires --base ." /
        # "--base hf:/" with the placeholder text missing.
        raise ScaffoldError(
            "first-run scaffold requires --base <model>. Pick from the base "
            "registry (e.g. smollm2-135m, qwen2.5-coder-1.5b) or pass "
            "--base hf:<org>/<model> for an off-registry model.",
            path=target,
        )
    dlm_path = dlm_dir / f"{name}.dlm"
    # --rescaffold keeps the on-disk ULID so the store stays addressable.
    existing_id = _dlm_id_from_file(dlm_path) if rescaffold and dlm_path.is_file() else None
    dlm_id = existing_id or mint_ulid()
    dlm_dir.mkdir(parents=True, exist_ok=True)
    _write_scaffold(
        dlm_path=dlm_path,
        dlm_id=dlm_id,
        base=base,
        include=include,
        exclude=exclude,
        recursive=recursive,
        policy=policy,
        target=target,
    )
    _LOG.info("scaffold: wrote %s (dlm_id=%s, base=%s)", dlm_path, dlm_id, base)
    return ScaffoldResult(dlm_path=dlm_path, scaffolded=True, dlm_id=dlm_id)
def _dlm_id_from_file(path: Path) -> str:
    """Return the `dlm_id` recorded in an existing `.dlm`'s frontmatter."""
    return parse_file(path).frontmatter.dlm_id
def _write_scaffold(
    *,
    dlm_path: Path,
    dlm_id: str,
    base: str,
    include: tuple[str, ...],
    exclude: tuple[str, ...],
    recursive: bool,
    policy: Policy,
    target: Path,
) -> None:
    """Serialize a minimal `.dlm` frontmatter + body to `dlm_path`.

    Writes directly as YAML-frontmatter text rather than round-tripping
    through the `DlmFrontmatter` serializer — the scaffold is small, the
    frontmatter is a fixed shape, and keeping the write path independent
    of the full parser simplifies bootstrapping for directory-first users.

    The `path:` field is the absolute resolved `target`, not `"."`. A
    relative `"."` would anchor on `dlm_path.parent` (the `.dlm/`
    directory), which doesn't contain the user's files and is
    default-excluded by the descent protocol — the first scaffolded
    train would ingest zero content.

    Fixed: the nested keys under `training.sources` are now emitted with
    consistent 2-space YAML nesting so `include:`/`exclude:` and their
    pattern items align under the `- path:` sequence item; mis-aligned
    keys make the frontmatter unparseable as YAML.
    """
    effective_include = _build_include_globs(include, recursive=recursive)
    lines: list[str] = [
        "---",
        f"dlm_id: {dlm_id}",
        f"dlm_version: {CURRENT_SCHEMA_VERSION}",
        f"base_model: {base}",
        "training:",
        f"  sources_policy: {policy}",
        "  sources:",
        f'    - path: "{target.resolve().as_posix()}"',
        "      include:",
    ]
    lines.extend(f'        - "{pat}"' for pat in effective_include)
    if exclude:
        lines.append("      exclude:")
        lines.extend(f'        - "{pat}"' for pat in exclude)
    lines.extend(
        [
            "---",
            "",
            "# Auto-scaffolded by `dlm train`. Edit the frontmatter above to refine training.",
            "",
        ]
    )
    atomic_write_text(dlm_path, "\n".join(lines))
def _build_include_globs(include: tuple[str, ...], *, recursive: bool) -> tuple[str, ...]:
"""Map `--include` flags + `--recursive` to frontmatter globs.
Empty `--include` + `--recursive` → `["**/*"]`: train on every
file the descent protocol approves.
Empty `--include` + `--no-recursive` → `["*"]`: top-level files
only.
Explicit `--include` globs are passed through unchanged; the
`--recursive` flag doesn't transform user-supplied patterns (users
writing `*.py` know what they want).
"""
if include:
return include
return ("**/*",) if recursive else ("*",)