1 """Quality-of-output tests for the autogen YAML.
2
3 The audit's B8 finding was that ``style_fingerprint`` got the leading
4 sentence of a prose section as its prompt — which elicits doc
5 *continuation* (a content probe), not stylistic voice. Sprint 05
6 replaces that with a fixed list of stylistic-elicitation prompts. This
7 file pins the new contract.
8 """
9
10 from __future__ import annotations
11
12 from pathlib import Path
13
14 from dlm_sway.core.sections import Section
15 from dlm_sway.integrations.dlm.autogen import (
16 _STYLE_ELICITATION_PROMPTS,
17 build_spec_dict,
18 )
19 from dlm_sway.integrations.dlm.resolver import DlmHandle
20
21
def _handle_with_prose_first_sentence() -> DlmHandle:
    """Build a handle whose only prose section opens with a strong,
    doc-specific sentence — exactly the kind of text the retired
    heuristic would have copied into the style probe."""
    prose_section = Section(
        id="s1",
        kind="prose",
        content=(
            "The mitochondrion is the powerhouse of the cell. "
            "It generates ATP via oxidative phosphorylation. "
            "Inner-membrane folds called cristae increase surface area."
        ),
    )
    return DlmHandle(
        dlm_id="x",
        base_model="HuggingFaceTB/SmolLM2-135M-Instruct",
        adapter_path=Path("/tmp/adapter"),
        sections=(prose_section,),
        doc_text="whole document",
    )
44
45
def test_style_prompts_use_elicitation_set_not_doc_content() -> None:
    """B8: style_fingerprint prompts come from the fixed elicitation set."""
    spec = build_spec_dict(_handle_with_prose_first_sentence())
    style_entry = None
    for entry in spec["suite"]:
        if entry["kind"] == "style_fingerprint":
            style_entry = entry
            break
    assert style_entry is not None, "autogen should emit a style_fingerprint entry"
    prompts = style_entry["prompts"]
    # Every prompt must belong to the fixed elicitation set...
    assert set(prompts).issubset(set(_STYLE_ELICITATION_PROMPTS))
    # ...and none may echo the leading prose sentence.
    lowered = [p.lower() for p in prompts]
    assert all("mitochondrion" not in p for p in lowered)
    assert all("powerhouse" not in p for p in lowered)
57
58
def test_style_prompts_nonempty_even_without_prose() -> None:
    """The fixed list means the probe always has something to ask the model."""
    instruction_only = Section(id="i1", kind="instruction", content="What is X? X is Y.", probes=())
    handle = DlmHandle(
        dlm_id="x",
        base_model="b",
        adapter_path=Path("/tmp/a"),
        sections=(instruction_only,),
        doc_text=None,
    )
    suite = build_spec_dict(handle)["suite"]
    style_entry = next((e for e in suite if e["kind"] == "style_fingerprint"), None)
    assert style_entry is not None
    assert len(style_entry["prompts"]) >= 4
73
74
def test_elicitation_prompts_are_open_ended() -> None:
    """Sanity-check the constant itself: every prompt should invite
    prose, not a single-token completion."""
    for prompt in _STYLE_ELICITATION_PROMPTS:
        assert prompt.endswith(".")
        assert len(prompt) >= 30, f"prompt too short to elicit prose: {prompt!r}"
81
82
def _handle_with_many_instruction_probes(n: int) -> DlmHandle:
    """Build a handle rigged to yield at least ``n`` distinct instruction
    prompts (used to clear ``cluster_kl``'s 20-prompt floor)."""
    from dlm_sway.core.sections import SectionProbe

    probes = []
    for i in range(n):
        probes.append(SectionProbe(prompt=f"Q{i}: what is topic {i}?", gold=f"A{i}"))
    instruction = Section(id="i1", kind="instruction", content="…", probes=tuple(probes))
    prose = Section(
        id="p1",
        kind="prose",
        content="Prose sentence one. Prose sentence two. Prose sentence three.",
    )
    return DlmHandle(
        dlm_id="x",
        base_model="b",
        adapter_path=Path("/tmp/a"),
        sections=(instruction, prose),
        doc_text=None,
    )
104
105
class TestSkippedProbesRollup:
    """F07 (Audit 03) — ``_render_annotated_yaml`` prepends a
    ``# skipped: <probe> (<reason>)`` block so users can see which probes
    the autogen deliberately left out, without diffing this module's
    docstring."""

    def test_prose_only_handle_omits_instruction_heavy_probes(self) -> None:
        """A .dlm holding only PROSE sections skips adapter_revert +
        paraphrase_invariance + preference_flip + (with one section)
        section_internalization."""
        from dlm_sway.integrations.dlm.autogen import collect_skipped_probe_reasons

        prose_only = Section(
            id="s1",
            kind="prose",
            content="One paragraph of prose. Second sentence.",
        )
        handle = DlmHandle(
            dlm_id="x",
            base_model="b",
            adapter_path=Path("/tmp/a"),
            sections=(prose_only,),
            doc_text="doc",
        )
        skipped_kinds = {kind for kind, _ in collect_skipped_probe_reasons(handle)}
        for expected in (
            "adapter_revert",
            "paraphrase_invariance",
            "preference_flip",
            "section_internalization",
        ):
            assert expected in skipped_kinds
        # delta_kl must NOT be skipped — prose supplies a fallback prompt
        # pool.
        assert "delta_kl" not in skipped_kinds

    def test_instruction_only_handle_omits_prose_heavy_probes(self) -> None:
        """An instruction-only doc skips external_perplexity + leakage."""
        from dlm_sway.core.sections import SectionProbe
        from dlm_sway.integrations.dlm.autogen import collect_skipped_probe_reasons

        qa_section = Section(
            id="i1",
            kind="instruction",
            content="Q/A",
            probes=(SectionProbe(prompt="Q?", gold="A"),),
        )
        handle = DlmHandle(
            dlm_id="x",
            base_model="b",
            adapter_path=Path("/tmp/a"),
            sections=(qa_section,),
            doc_text=None,
        )
        skipped_kinds = {kind for kind, _ in collect_skipped_probe_reasons(handle)}
        assert "external_perplexity" in skipped_kinds
        assert "leakage" in skipped_kinds

    def test_rendered_yaml_carries_skipped_block(self, tmp_path: Path) -> None:
        """End-to-end: a minimal prose-only .dlm renders YAML whose
        header carries the ``# skipped:`` lines."""
        from dlm_sway.integrations.dlm.autogen import (
            _render_annotated_yaml,
            build_spec_dict,
            collect_skipped_probe_reasons,
        )

        handle = DlmHandle(
            dlm_id="x",
            base_model="b",
            adapter_path=Path("/tmp/a"),
            sections=(Section(id="s1", kind="prose", content="Short prose."),),
            doc_text="doc",
        )
        dlm_path = tmp_path / "demo.dlm"
        dlm_path.write_text("# empty")
        spec = build_spec_dict(handle, dlm_source="demo.dlm")
        rendered = _render_annotated_yaml(
            spec, handle, dlm_path, skipped=collect_skipped_probe_reasons(handle)
        )
        for marker in ("# skipped: adapter_revert", "# skipped: preference_flip"):
            assert marker in rendered
        assert "(no " in rendered  # reasons start with "no ..."

    def test_rendered_yaml_omits_skipped_block_when_all_probes_fit(self) -> None:
        """A doc populated heavily enough to trigger every probe emits
        no ``# skipped:`` lines."""
        from dlm_sway.core.sections import SectionPreference, SectionProbe
        from dlm_sway.integrations.dlm.autogen import (
            _render_annotated_yaml,
            build_spec_dict,
            collect_skipped_probe_reasons,
        )

        qa_probes = tuple(SectionProbe(prompt=f"Q{i}?", gold=f"A{i}") for i in range(25))
        handle = DlmHandle(
            dlm_id="x",
            base_model="b",
            adapter_path=Path("/tmp/a"),
            sections=(
                Section(id="i1", kind="instruction", content="Q/A", probes=qa_probes),
                Section(
                    id="p1",
                    kind="prose",
                    content="A first prose sentence. A second. A third.",
                ),
                Section(
                    id="pref1",
                    kind="preference",
                    content="pref",
                    preferences=(
                        SectionPreference(prompt="P1", chosen="good", rejected="bad"),
                    ),
                ),
            ),
            doc_text="doc",
        )
        rendered = _render_annotated_yaml(
            build_spec_dict(handle),
            handle,
            Path("/tmp/demo.dlm"),
            skipped=collect_skipped_probe_reasons(handle),
        )
        assert "# skipped:" not in rendered
226
227
class TestPortableDlmSource:
    """F09 (Audit 03) — ``_portable_dlm_source`` yields a cwd-relative
    path when the ``.dlm`` sits inside the cwd (survives CI checkout),
    and an absolute path when it lives anywhere else.
    """

    def test_cwd_relative_when_inside(self, tmp_path: Path, monkeypatch) -> None:  # type: ignore[no-untyped-def]
        from dlm_sway.integrations.dlm.autogen import _portable_dlm_source

        # Work from tmp_path; place the .dlm in a subdirectory of it.
        dlm_file = tmp_path / "src" / "demo.dlm"
        dlm_file.parent.mkdir()
        dlm_file.write_text("# empty\n")
        monkeypatch.chdir(tmp_path)
        source = _portable_dlm_source(dlm_file)
        assert source == "src/demo.dlm"
        # Not an absolute path — the whole point of F09.
        assert not Path(source).is_absolute()

    def test_absolute_when_outside(self, tmp_path: Path, monkeypatch) -> None:  # type: ignore[no-untyped-def]
        """A ``.dlm`` outside the cwd falls back to its absolute path —
        relative-ization would point at a nonexistent parent directory
        on a fresh checkout."""
        from dlm_sway.integrations.dlm.autogen import _portable_dlm_source

        # cwd lives under tmp_path; the .dlm sits in a sibling tree.
        workdir = tmp_path / "cwd"
        workdir.mkdir()
        dlm_file = tmp_path / "other" / "demo.dlm"
        dlm_file.parent.mkdir()
        dlm_file.write_text("# empty\n")
        monkeypatch.chdir(workdir)
        source = _portable_dlm_source(dlm_file)
        assert Path(source).is_absolute()
        assert source == str(dlm_file.resolve())
265
266
class TestAutogenClusterKL:
    """F07 — autogen emits ``cluster_kl`` when the prompt pool has
    enough entries to clear S16's ``min_prompts=20`` floor, and omits
    it otherwise."""

    def test_emits_cluster_kl_when_prompt_pool_is_large(self) -> None:
        spec = build_spec_dict(_handle_with_many_instruction_probes(25))
        entry = None
        for candidate in spec["suite"]:
            if candidate["kind"] == "cluster_kl":
                entry = candidate
                break
        assert entry is not None, "autogen should emit cluster_kl on large prompt pools"
        assert entry["num_clusters"] == 5
        assert entry["min_prompts"] == 20
        # Enough prompts to run, but capped at 64 so a doc with hundreds
        # of probes doesn't explode the cluster runtime.
        assert 20 <= len(entry["prompts"]) <= 64

    def test_omits_cluster_kl_on_small_prompt_pool(self) -> None:
        """Under 20 prompts → omit the entry. The probe would SKIP
        anyway; skipping emission keeps the autogen'd YAML tidy."""
        spec = build_spec_dict(_handle_with_many_instruction_probes(5))
        matches = [e for e in spec["suite"] if e["kind"] == "cluster_kl"]
        assert not matches

    def test_prompts_deduplicated(self) -> None:
        """No duplicate entries (instruction prompts + prose leading
        sentences are merged but must not repeat verbatim)."""
        spec = build_spec_dict(_handle_with_many_instruction_probes(30))
        entry = next((e for e in spec["suite"] if e["kind"] == "cluster_kl"), None)
        assert entry is not None
        prompts = entry["prompts"]
        assert len(set(prompts)) == len(prompts)