Python · 7648 bytes Raw Blame History
1 """`scaffold_train_target` — dir detection, flag→frontmatter mapping,
2 first-run vs resume, --rescaffold, --name disambiguation."""
3
4 from __future__ import annotations
5
6 from pathlib import Path
7
8 import pytest
9
10 from dlm.cli.scaffold import ScaffoldError, scaffold_train_target
11 from dlm.doc.parser import parse_file
12
13
14 def _default_kwargs() -> dict[str, object]:
15 return {
16 "base": "smollm2-135m",
17 "include": (),
18 "exclude": (),
19 "recursive": True,
20 "name": "corpus",
21 "policy": "strict",
22 "rescaffold": False,
23 }
24
25
26 # ---- Dir detection + input validation --------------------------------------
27
28
def test_missing_target_raises(tmp_path: Path) -> None:
    """A target path that is absent on disk is rejected with `ScaffoldError`."""
    absent_dir = tmp_path.joinpath("nope")
    with pytest.raises(ScaffoldError, match="does not exist"):
        scaffold_train_target(absent_dir, **_default_kwargs())  # type: ignore[arg-type]
33
34
def test_file_target_raises(tmp_path: Path) -> None:
    """Pointing the scaffold at a regular file instead of a directory fails."""
    not_a_dir = tmp_path / "file.dlm"
    not_a_dir.write_text("x")
    with pytest.raises(ScaffoldError, match="expects a directory"):
        scaffold_train_target(not_a_dir, **_default_kwargs())  # type: ignore[arg-type]
40
41
42 # ---- First-run scaffold ----------------------------------------------------
43
44
def test_first_run_requires_base(tmp_path: Path) -> None:
    """With nothing scaffolded yet, `base=None` raises an error mentioning `--base`."""
    without_base = {**_default_kwargs(), "base": None}
    with pytest.raises(ScaffoldError, match="--base"):
        scaffold_train_target(tmp_path, **without_base)  # type: ignore[arg-type]
50
51
def test_first_run_writes_scaffolded_dlm(tmp_path: Path) -> None:
    """The first run writes `.dlm/corpus.dlm` and reports a 26-character `dlm_id`.

    The written file is parsed back to check that its frontmatter carries
    the same id and the requested base model.
    """
    outcome = scaffold_train_target(tmp_path, **_default_kwargs())  # type: ignore[arg-type]
    expected_path = tmp_path / ".dlm" / "corpus.dlm"

    assert outcome.scaffolded is True
    assert outcome.dlm_path == expected_path
    assert outcome.dlm_path.is_file()
    assert len(outcome.dlm_id) == 26

    # Round-trip through the parser: the frontmatter must be readable.
    frontmatter = parse_file(outcome.dlm_path).frontmatter
    assert frontmatter.dlm_id == outcome.dlm_id
    assert frontmatter.base_model == "smollm2-135m"
62
63
def test_scaffold_default_include_recursive(tmp_path: Path) -> None:
    """With no explicit include and `recursive=True`, the include is `**/*`."""
    scaffolded = scaffold_train_target(tmp_path, **_default_kwargs())  # type: ignore[arg-type]
    sources = parse_file(scaffolded.dlm_path).frontmatter.training.sources
    assert sources is not None
    first_source = sources[0]
    assert first_source.include == ("**/*",)
70
71
def test_scaffold_default_include_non_recursive(tmp_path: Path) -> None:
    """With no explicit include and `recursive=False`, the include is `*`."""
    flat_kwargs = {**_default_kwargs(), "recursive": False}
    scaffolded = scaffold_train_target(tmp_path, **flat_kwargs)  # type: ignore[arg-type]
    sources = parse_file(scaffolded.dlm_path).frontmatter.training.sources
    assert sources is not None
    first_source = sources[0]
    assert first_source.include == ("*",)
80
81
def test_scaffold_explicit_include_passed_through(tmp_path: Path) -> None:
    """Explicit include patterns land in the first source's `include` unchanged."""
    patterns = ("**/*.f90", "**/*.F90")
    overrides = {**_default_kwargs(), "include": patterns}
    scaffolded = scaffold_train_target(tmp_path, **overrides)  # type: ignore[arg-type]
    sources = parse_file(scaffolded.dlm_path).frontmatter.training.sources
    assert sources is not None
    assert sources[0].include == patterns
90
91
def test_scaffold_exclude_passed_through(tmp_path: Path) -> None:
    """Explicit exclude patterns land in the first source's `exclude` unchanged."""
    patterns = ("tests/**", "**/__pycache__/**")
    overrides = {**_default_kwargs(), "exclude": patterns}
    scaffolded = scaffold_train_target(tmp_path, **overrides)  # type: ignore[arg-type]
    sources = parse_file(scaffolded.dlm_path).frontmatter.training.sources
    assert sources is not None
    assert sources[0].exclude == patterns
100
101
def test_scaffold_policy_persisted(tmp_path: Path) -> None:
    """The `policy` argument is written to `training.sources_policy`."""
    permissive = {**_default_kwargs(), "policy": "permissive"}
    scaffolded = scaffold_train_target(tmp_path, **permissive)  # type: ignore[arg-type]
    training = parse_file(scaffolded.dlm_path).frontmatter.training
    assert training.sources_policy == "permissive"
108
109
110 # ---- Resume / reuse --------------------------------------------------------
111
112
def test_second_run_reuses_existing(tmp_path: Path) -> None:
    """A second run on the same directory reuses the file from the first run.

    The follow-up call passes `base=None`; it is expected to succeed
    because no new scaffold should be written.
    """
    initial = scaffold_train_target(tmp_path, **_default_kwargs())  # type: ignore[arg-type]

    resume_kwargs = {**_default_kwargs(), "base": None}
    repeat = scaffold_train_target(tmp_path, **resume_kwargs)  # type: ignore[arg-type]

    assert repeat.scaffolded is False
    assert repeat.dlm_path == initial.dlm_path
    assert repeat.dlm_id == initial.dlm_id
122
123
def test_second_run_reuses_lone_existing_file_when_default_name_is_unmatched(
    tmp_path: Path,
) -> None:
    """A single existing file is reused even when the requested name differs.

    The first run scaffolds under the name `notes`; the resume call keeps
    the default name `corpus` and passes `base=None`.
    """
    notes_kwargs = {**_default_kwargs(), "name": "notes"}
    original = scaffold_train_target(tmp_path, **notes_kwargs)  # type: ignore[arg-type]

    resume_kwargs = {**_default_kwargs(), "base": None}
    reused = scaffold_train_target(tmp_path, **resume_kwargs)  # type: ignore[arg-type]

    assert reused.scaffolded is False
    assert reused.dlm_path == original.dlm_path
    assert reused.dlm_id == original.dlm_id
138
139
140 # ---- Multi-file disambiguation ---------------------------------------------
141
142
def test_multiple_dlms_refuses_without_explicit_name(tmp_path: Path) -> None:
    """With several files present and no name match, the resume call is refused."""
    # Two files under non-default names, so the default `corpus` used on
    # the resume attempt matches neither of them.
    for stem in ("code", "docs"):
        named = {**_default_kwargs(), "name": stem}
        scaffold_train_target(tmp_path, **named)  # type: ignore[arg-type]

    # Resume with the default name (`corpus`), which is not in {code, docs}.
    resume_kwargs = {**_default_kwargs(), "base": None}
    with pytest.raises(ScaffoldError, match="multiple .dlm files"):
        scaffold_train_target(tmp_path, **resume_kwargs)  # type: ignore[arg-type]
159
160
def test_multiple_dlms_name_picks_match(tmp_path: Path) -> None:
    """With several files present, an explicit `name` selects the matching one."""
    # Create two separate files: the default-named one and `docs`.
    scaffold_train_target(tmp_path, **_default_kwargs())  # type: ignore[arg-type]
    docs_kwargs = {**_default_kwargs(), "name": "docs"}
    docs_result = scaffold_train_target(tmp_path, **docs_kwargs)  # type: ignore[arg-type]

    # Resume by explicit name, without a base.
    resume_kwargs = {**_default_kwargs(), "base": None, "name": "docs"}
    picked = scaffold_train_target(tmp_path, **resume_kwargs)  # type: ignore[arg-type]

    assert picked.dlm_path == docs_result.dlm_path
    assert picked.scaffolded is False
175
176
177 # ---- --rescaffold ----------------------------------------------------------
178
179
def test_rescaffold_preserves_dlm_id(tmp_path: Path) -> None:
    """`rescaffold=True` rewrites the frontmatter but keeps the existing `dlm_id`.

    The second call changes the base model and the include patterns; the
    result must report `scaffolded is True`, carry the first run's id, and
    the parsed file must show the new base model and include.
    """
    first = scaffold_train_target(tmp_path, **_default_kwargs())  # type: ignore[arg-type]
    kwargs = _default_kwargs()
    kwargs["base"] = "qwen2.5-0.5b"
    kwargs["include"] = ("**/*.md",)
    kwargs["rescaffold"] = True
    second = scaffold_train_target(tmp_path, **kwargs)  # type: ignore[arg-type]
    assert second.scaffolded is True
    assert second.dlm_id == first.dlm_id  # same ULID
    # New frontmatter
    parsed = parse_file(second.dlm_path)
    assert parsed.frontmatter.base_model == "qwen2.5-0.5b"
    sources = parsed.frontmatter.training.sources
    # Narrow the optional explicitly, as the other tests in this module do,
    # so a missing `sources` fails with a clear assertion instead of being
    # hidden behind a type-checker suppression.
    assert sources is not None
    assert sources[0].include == ("**/*.md",)
193
194
def test_rescaffold_still_needs_base(tmp_path: Path) -> None:
    """`rescaffold=True` with `base=None` raises an error mentioning `--base`."""
    # An existing scaffold is present before the rescaffold attempt.
    scaffold_train_target(tmp_path, **_default_kwargs())  # type: ignore[arg-type]

    overrides = {**_default_kwargs(), "base": None, "rescaffold": True}
    with pytest.raises(ScaffoldError, match="--base"):
        scaffold_train_target(tmp_path, **overrides)  # type: ignore[arg-type]