| 1 |
"""`scaffold_train_target` — dir detection, flag→frontmatter mapping, |
| 2 |
first-run vs resume, --rescaffold, --name disambiguation.""" |
| 3 |
|
| 4 |
from __future__ import annotations |
| 5 |
|
| 6 |
from pathlib import Path |
| 7 |
|
| 8 |
import pytest |
| 9 |
|
| 10 |
from dlm.cli.scaffold import ScaffoldError, scaffold_train_target |
| 11 |
from dlm.doc.parser import parse_file |
| 12 |
|
| 13 |
|
| 14 |
def _default_kwargs() -> dict[str, object]: |
| 15 |
return { |
| 16 |
"base": "smollm2-135m", |
| 17 |
"include": (), |
| 18 |
"exclude": (), |
| 19 |
"recursive": True, |
| 20 |
"name": "corpus", |
| 21 |
"policy": "strict", |
| 22 |
"rescaffold": False, |
| 23 |
} |
| 24 |
|
| 25 |
|
| 26 |
# ---- Dir detection + input validation -------------------------------------- |
| 27 |
|
| 28 |
|
| 29 |
def test_missing_target_raises(tmp_path: Path) -> None:
    """A target path that is absent on disk raises `ScaffoldError`."""
    absent_dir = tmp_path.joinpath("nope")
    with pytest.raises(ScaffoldError, match="does not exist"):
        scaffold_train_target(absent_dir, **_default_kwargs())  # type: ignore[arg-type]
| 33 |
|
| 34 |
|
| 35 |
def test_file_target_raises(tmp_path: Path) -> None:
    """Passing a regular file instead of a directory raises `ScaffoldError`."""
    plain_file = tmp_path.joinpath("file.dlm")
    plain_file.write_text("x")
    with pytest.raises(ScaffoldError, match="expects a directory"):
        scaffold_train_target(plain_file, **_default_kwargs())  # type: ignore[arg-type]
| 40 |
|
| 41 |
|
| 42 |
# ---- First-run scaffold ---------------------------------------------------- |
| 43 |
|
| 44 |
|
| 45 |
def test_first_run_requires_base(tmp_path: Path) -> None:
    """Scaffolding an untouched directory with `base=None` is rejected."""
    no_base = {**_default_kwargs(), "base": None}
    with pytest.raises(ScaffoldError, match="--base"):
        scaffold_train_target(tmp_path, **no_base)  # type: ignore[arg-type]
| 50 |
|
| 51 |
|
| 52 |
def test_first_run_writes_scaffolded_dlm(tmp_path: Path) -> None:
    """First run writes `.dlm/corpus.dlm` and reports it as scaffolded."""
    outcome = scaffold_train_target(tmp_path, **_default_kwargs())  # type: ignore[arg-type]
    expected_path = tmp_path.joinpath(".dlm", "corpus.dlm")

    assert outcome.scaffolded is True
    assert outcome.dlm_path == expected_path
    assert outcome.dlm_path.is_file()
    assert len(outcome.dlm_id) == 26

    # Round-trip through the parser to confirm the frontmatter is valid.
    frontmatter = parse_file(outcome.dlm_path).frontmatter
    assert frontmatter.dlm_id == outcome.dlm_id
    assert frontmatter.base_model == "smollm2-135m"
| 62 |
|
| 63 |
|
| 64 |
def test_scaffold_default_include_recursive(tmp_path: Path) -> None:
    """With no explicit include and `recursive=True`, include is `**/*`."""
    outcome = scaffold_train_target(tmp_path, **_default_kwargs())  # type: ignore[arg-type]
    sources = parse_file(outcome.dlm_path).frontmatter.training.sources
    assert sources is not None
    first_source = sources[0]
    assert first_source.include == ("**/*",)
| 70 |
|
| 71 |
|
| 72 |
def test_scaffold_default_include_non_recursive(tmp_path: Path) -> None:
    """With no explicit include and `recursive=False`, include is `*`."""
    flat_kwargs = {**_default_kwargs(), "recursive": False}
    outcome = scaffold_train_target(tmp_path, **flat_kwargs)  # type: ignore[arg-type]
    sources = parse_file(outcome.dlm_path).frontmatter.training.sources
    assert sources is not None
    first_source = sources[0]
    assert first_source.include == ("*",)
| 80 |
|
| 81 |
|
| 82 |
def test_scaffold_explicit_include_passed_through(tmp_path: Path) -> None:
    """Explicit include globs are written to the frontmatter unchanged."""
    globs = ("**/*.f90", "**/*.F90")
    custom_kwargs = {**_default_kwargs(), "include": globs}
    outcome = scaffold_train_target(tmp_path, **custom_kwargs)  # type: ignore[arg-type]
    sources = parse_file(outcome.dlm_path).frontmatter.training.sources
    assert sources is not None
    assert sources[0].include == ("**/*.f90", "**/*.F90")
| 90 |
|
| 91 |
|
| 92 |
def test_scaffold_exclude_passed_through(tmp_path: Path) -> None:
    """Explicit exclude globs are written to the frontmatter unchanged."""
    globs = ("tests/**", "**/__pycache__/**")
    custom_kwargs = {**_default_kwargs(), "exclude": globs}
    outcome = scaffold_train_target(tmp_path, **custom_kwargs)  # type: ignore[arg-type]
    sources = parse_file(outcome.dlm_path).frontmatter.training.sources
    assert sources is not None
    assert sources[0].exclude == ("tests/**", "**/__pycache__/**")
| 100 |
|
| 101 |
|
| 102 |
def test_scaffold_policy_persisted(tmp_path: Path) -> None:
    """The `policy` argument is stored as `training.sources_policy`."""
    permissive_kwargs = {**_default_kwargs(), "policy": "permissive"}
    outcome = scaffold_train_target(tmp_path, **permissive_kwargs)  # type: ignore[arg-type]
    training = parse_file(outcome.dlm_path).frontmatter.training
    assert training.sources_policy == "permissive"
| 108 |
|
| 109 |
|
| 110 |
# ---- Resume / reuse -------------------------------------------------------- |
| 111 |
|
| 112 |
|
| 113 |
def test_second_run_reuses_existing(tmp_path: Path) -> None:
    """A second run on the same directory reuses the existing `.dlm` file."""
    initial = scaffold_train_target(tmp_path, **_default_kwargs())  # type: ignore[arg-type]

    # base=None on the second run: no scaffold should fire, so none is needed.
    resume_kwargs = {**_default_kwargs(), "base": None}
    repeat = scaffold_train_target(tmp_path, **resume_kwargs)  # type: ignore[arg-type]

    assert repeat.scaffolded is False
    assert repeat.dlm_path == initial.dlm_path
    assert repeat.dlm_id == initial.dlm_id
| 122 |
|
| 123 |
|
| 124 |
def test_second_run_reuses_lone_existing_file_when_default_name_is_unmatched(
    tmp_path: Path,
) -> None:
    """A lone existing file is reused even when the default name differs."""
    seed_kwargs = {**_default_kwargs(), "name": "notes"}
    initial = scaffold_train_target(tmp_path, **seed_kwargs)  # type: ignore[arg-type]

    # Resume with the default name ("corpus") and no base.
    resume_kwargs = {**_default_kwargs(), "base": None}
    reused = scaffold_train_target(tmp_path, **resume_kwargs)  # type: ignore[arg-type]

    assert reused.scaffolded is False
    assert reused.dlm_path == initial.dlm_path
    assert reused.dlm_id == initial.dlm_id
| 138 |
|
| 139 |
|
| 140 |
# ---- Multi-file disambiguation --------------------------------------------- |
| 141 |
|
| 142 |
|
| 143 |
def test_multiple_dlms_refuses_without_explicit_name(tmp_path: Path) -> None:
    """Resume is refused when several `.dlm` files exist and none matches."""
    # Both files get non-default names so that the default `corpus` used on
    # the resume attempt matches neither of them.
    for stem in ("code", "docs"):
        seed_kwargs = {**_default_kwargs(), "name": stem}
        scaffold_train_target(tmp_path, **seed_kwargs)  # type: ignore[arg-type]

    # Resume with the default name ("corpus", not in {code, docs}) and no
    # base: there is no match, so the call must refuse.
    resume_kwargs = {**_default_kwargs(), "base": None}
    with pytest.raises(ScaffoldError, match="multiple .dlm files"):
        scaffold_train_target(tmp_path, **resume_kwargs)  # type: ignore[arg-type]
| 159 |
|
| 160 |
|
| 161 |
def test_multiple_dlms_name_picks_match(tmp_path: Path) -> None:
    """With several `.dlm` files, an explicit name selects the matching one."""
    # Create two separate .dlm files: the default `corpus` and `docs`.
    scaffold_train_target(tmp_path, **_default_kwargs())  # type: ignore[arg-type]
    docs_kwargs = {**_default_kwargs(), "name": "docs"}
    docs_result = scaffold_train_target(tmp_path, **docs_kwargs)  # type: ignore[arg-type]

    # Resuming with the explicit name (and no base) picks the matching file.
    resume_kwargs = {**_default_kwargs(), "base": None, "name": "docs"}
    picked = scaffold_train_target(tmp_path, **resume_kwargs)  # type: ignore[arg-type]

    assert picked.dlm_path == docs_result.dlm_path
    assert picked.scaffolded is False
| 175 |
|
| 176 |
|
| 177 |
# ---- --rescaffold ---------------------------------------------------------- |
| 178 |
|
| 179 |
|
| 180 |
def test_rescaffold_preserves_dlm_id(tmp_path: Path) -> None:
    """`rescaffold=True` rewrites the frontmatter but keeps the same `dlm_id`."""
    first = scaffold_train_target(tmp_path, **_default_kwargs())  # type: ignore[arg-type]

    kwargs = _default_kwargs()
    kwargs["base"] = "qwen2.5-0.5b"
    kwargs["include"] = ("**/*.md",)
    kwargs["rescaffold"] = True
    second = scaffold_train_target(tmp_path, **kwargs)  # type: ignore[arg-type]

    assert second.scaffolded is True
    assert second.dlm_id == first.dlm_id  # same ULID

    # The frontmatter on disk reflects the new arguments.
    parsed = parse_file(second.dlm_path)
    assert parsed.frontmatter.base_model == "qwen2.5-0.5b"
    # Guard against a missing sources list explicitly, as the other tests in
    # this module do, rather than suppressing the type checker's index error.
    sources = parsed.frontmatter.training.sources
    assert sources is not None
    assert sources[0].include == ("**/*.md",)
| 193 |
|
| 194 |
|
| 195 |
def test_rescaffold_still_needs_base(tmp_path: Path) -> None:
    """`rescaffold=True` with `base=None` is rejected even if a file exists."""
    scaffold_train_target(tmp_path, **_default_kwargs())  # type: ignore[arg-type]
    redo_kwargs = {**_default_kwargs(), "base": None, "rescaffold": True}
    with pytest.raises(ScaffoldError, match="--base"):
        scaffold_train_target(tmp_path, **redo_kwargs)  # type: ignore[arg-type]