| 1 | """Quality-of-output tests for the autogen YAML. |
| 2 | |
| 3 | The audit's B8 finding was that ``style_fingerprint`` got the leading |
| 4 | sentence of a prose section as its prompt — which elicits doc |
| 5 | *continuation* (a content probe), not stylistic voice. Sprint 05 |
| 6 | replaces that with a fixed list of stylistic-elicitation prompts. This |
| 7 | file pins the new contract. |
| 8 | """ |
| 9 | |
| 10 | from __future__ import annotations |
| 11 | |
| 12 | from pathlib import Path |
| 13 | |
| 14 | from dlm_sway.core.sections import Section |
| 15 | from dlm_sway.integrations.dlm.autogen import ( |
| 16 | _STYLE_ELICITATION_PROMPTS, |
| 17 | build_spec_dict, |
| 18 | ) |
| 19 | from dlm_sway.integrations.dlm.resolver import DlmHandle |
| 20 | |
| 21 | |
def _handle_with_prose_first_sentence() -> DlmHandle:
    """Build a handle with exactly one prose section whose opening
    sentence is distinctive and document-specific.

    That opener is the kind of text the old heuristic would have leaked
    into the style probe, so tests can check that it no longer shows up.
    """
    prose_text = (
        "The mitochondrion is the powerhouse of the cell. "
        "It generates ATP via oxidative phosphorylation. "
        "Inner-membrane folds called cristae increase surface area."
    )
    prose_section = Section(id="s1", kind="prose", content=prose_text)
    return DlmHandle(
        dlm_id="x",
        base_model="HuggingFaceTB/SmolLM2-135M-Instruct",
        adapter_path=Path("/tmp/adapter"),
        sections=(prose_section,),
        doc_text="whole document",
    )
| 44 | |
| 45 | |
def test_style_prompts_use_elicitation_set_not_doc_content() -> None:
    """B8: every style_fingerprint prompt is drawn from the fixed
    elicitation set rather than from the document's own prose."""
    spec = build_spec_dict(_handle_with_prose_first_sentence())

    # Locate the first style_fingerprint entry in the generated suite.
    style_entry = None
    for candidate in spec["suite"]:
        if candidate["kind"] == "style_fingerprint":
            style_entry = candidate
            break
    assert style_entry is not None, "autogen should emit a style_fingerprint entry"

    emitted = style_entry["prompts"]
    # Each emitted prompt must be a member of the elicitation set.
    assert set(emitted).issubset(set(_STYLE_ELICITATION_PROMPTS))

    # Distinctive words from the leading prose sentence must not appear.
    lowered = [text.lower() for text in emitted]
    for keyword in ("mitochondrion", "powerhouse"):
        assert all(keyword not in text for text in lowered)
| 57 | |
| 58 | |
def test_style_prompts_nonempty_even_without_prose() -> None:
    """With a fixed prompt list, a handle that has no prose section still
    gets a style_fingerprint entry with prompts to ask the model."""
    instruction_only = Section(
        id="i1",
        kind="instruction",
        content="What is X? X is Y.",
        probes=(),
    )
    spec = build_spec_dict(
        DlmHandle(
            dlm_id="x",
            base_model="b",
            adapter_path=Path("/tmp/a"),
            sections=(instruction_only,),
            doc_text=None,
        )
    )

    # Locate the first style_fingerprint entry in the generated suite.
    style_entry = None
    for candidate in spec["suite"]:
        if candidate["kind"] == "style_fingerprint":
            style_entry = candidate
            break
    assert style_entry is not None

    prompt_count = len(style_entry["prompts"])
    assert prompt_count >= 4
| 73 | |
| 74 | |
def test_elicitation_prompts_are_open_ended() -> None:
    """A sanity check on the constant itself: each prompt invites prose,
    not a single-token completion.

    The constant is also required to be non-empty; without that guard the
    per-prompt checks below would pass vacuously on an empty collection.
    """
    assert _STYLE_ELICITATION_PROMPTS, "elicitation prompt set must not be empty"
    for prompt in _STYLE_ELICITATION_PROMPTS:
        assert len(prompt) >= 30, f"prompt too short to elicit prose: {prompt!r}"
        assert prompt.endswith("."), f"prompt should end with a period: {prompt!r}"
| 81 | |
| 82 | |
def _handle_with_many_instruction_probes(n: int) -> DlmHandle:
    """Build a handle carrying ``n`` distinct instruction probes plus one
    short prose section.

    Callers use it to get a prompt pool that is above or below
    ``cluster_kl``'s 20-prompt floor.
    """
    from dlm_sway.core.sections import SectionProbe

    instruction_section = Section(
        id="i1",
        kind="instruction",
        content="…",
        probes=tuple(
            SectionProbe(prompt=f"Q{idx}: what is topic {idx}?", gold=f"A{idx}")
            for idx in range(n)
        ),
    )
    prose_section = Section(
        id="p1",
        kind="prose",
        content="Prose sentence one. Prose sentence two. Prose sentence three.",
    )
    return DlmHandle(
        dlm_id="x",
        base_model="b",
        adapter_path=Path("/tmp/a"),
        sections=(instruction_section, prose_section),
        doc_text=None,
    )
| 104 | |
| 105 | |
class TestSkippedProbesRollup:
    """F07 (Audit 03) — ``_render_annotated_yaml`` puts a
    ``# skipped: <probe> (<reason>)`` block at the top of its output so
    users can see which probes the autogen deliberately left out without
    having to read this module's docstring."""

    def test_prose_only_handle_omits_instruction_heavy_probes(self) -> None:
        """A .dlm made only of PROSE sections skips adapter_revert,
        paraphrase_invariance, preference_flip and (with 1 section)
        section_internalization."""
        from dlm_sway.integrations.dlm.autogen import collect_skipped_probe_reasons

        lone_prose = Section(
            id="s1",
            kind="prose",
            content="One paragraph of prose. Second sentence.",
        )
        handle = DlmHandle(
            dlm_id="x",
            base_model="b",
            adapter_path=Path("/tmp/a"),
            sections=(lone_prose,),
            doc_text="doc",
        )
        skipped_kinds = {kind for kind, _reason in collect_skipped_probe_reasons(handle)}
        for expected in (
            "adapter_revert",
            "paraphrase_invariance",
            "preference_flip",
            "section_internalization",
        ):
            assert expected in skipped_kinds
        # delta_kl must stay in the suite: prose provides a fallback
        # prompt pool.
        assert "delta_kl" not in skipped_kinds

    def test_instruction_only_handle_omits_prose_heavy_probes(self) -> None:
        """A doc with only instruction sections skips external_perplexity
        and leakage."""
        from dlm_sway.core.sections import SectionProbe
        from dlm_sway.integrations.dlm.autogen import collect_skipped_probe_reasons

        lone_instruction = Section(
            id="i1",
            kind="instruction",
            content="Q/A",
            probes=(SectionProbe(prompt="Q?", gold="A"),),
        )
        handle = DlmHandle(
            dlm_id="x",
            base_model="b",
            adapter_path=Path("/tmp/a"),
            sections=(lone_instruction,),
            doc_text=None,
        )
        skipped_kinds = {kind for kind, _reason in collect_skipped_probe_reasons(handle)}
        for expected in ("external_perplexity", "leakage"):
            assert expected in skipped_kinds

    def test_rendered_yaml_carries_skipped_block(self, tmp_path: Path) -> None:
        """End-to-end: for a minimal prose-only .dlm the rendered YAML
        contains the ``# skipped:`` lines."""
        from dlm_sway.integrations.dlm.autogen import (
            _render_annotated_yaml,
            build_spec_dict,
            collect_skipped_probe_reasons,
        )

        short_prose = Section(id="s1", kind="prose", content="Short prose.")
        handle = DlmHandle(
            dlm_id="x",
            base_model="b",
            adapter_path=Path("/tmp/a"),
            sections=(short_prose,),
            doc_text="doc",
        )
        source_file = tmp_path.joinpath("demo.dlm")
        source_file.write_text("# empty")

        spec = build_spec_dict(handle, dlm_source="demo.dlm")
        skipped = collect_skipped_probe_reasons(handle)
        rendered = _render_annotated_yaml(spec, handle, source_file, skipped=skipped)

        for header_line in ("# skipped: adapter_revert", "# skipped: preference_flip"):
            assert header_line in rendered
        # Reasons start with "no ...", so the parenthesised form shows up.
        assert "(no " in rendered

    def test_rendered_yaml_omits_skipped_block_when_all_probes_fit(self) -> None:
        """A heavily-populated doc that triggers every probe produces no
        ``# skipped:`` lines."""
        from dlm_sway.core.sections import SectionPreference, SectionProbe
        from dlm_sway.integrations.dlm.autogen import (
            _render_annotated_yaml,
            build_spec_dict,
            collect_skipped_probe_reasons,
        )

        instruction_section = Section(
            id="i1",
            kind="instruction",
            content="Q/A",
            probes=tuple(SectionProbe(prompt=f"Q{idx}?", gold=f"A{idx}") for idx in range(25)),
        )
        prose_section = Section(
            id="p1",
            kind="prose",
            content="A first prose sentence. A second. A third.",
        )
        preference_section = Section(
            id="pref1",
            kind="preference",
            content="pref",
            preferences=(SectionPreference(prompt="P1", chosen="good", rejected="bad"),),
        )
        handle = DlmHandle(
            dlm_id="x",
            base_model="b",
            adapter_path=Path("/tmp/a"),
            sections=(instruction_section, prose_section, preference_section),
            doc_text="doc",
        )

        spec = build_spec_dict(handle)
        skipped = collect_skipped_probe_reasons(handle)
        rendered = _render_annotated_yaml(spec, handle, Path("/tmp/demo.dlm"), skipped=skipped)
        assert "# skipped:" not in rendered
| 226 | |
| 227 | |
class TestPortableDlmSource:
    """F09 (Audit 03) — ``_portable_dlm_source`` yields a cwd-relative
    path when the ``.dlm`` sits inside the cwd (so it survives a CI
    checkout) and an absolute path when it sits elsewhere.
    """

    def test_cwd_relative_when_inside(self, tmp_path: Path, monkeypatch) -> None:  # type: ignore[no-untyped-def]
        """A ``.dlm`` in a subdirectory of the cwd is reported relative
        to the cwd."""
        from dlm_sway.integrations.dlm.autogen import _portable_dlm_source

        # The .dlm goes into a subdirectory; tmp_path itself becomes cwd.
        nested_dir = tmp_path / "src"
        nested_dir.mkdir()
        target = nested_dir / "demo.dlm"
        target.write_text("# empty\n")
        monkeypatch.chdir(tmp_path)

        source = _portable_dlm_source(target)
        assert source == "src/demo.dlm"
        # The whole point of F09: the result is not an absolute path.
        assert not Path(source).is_absolute()

    def test_absolute_when_outside(self, tmp_path: Path, monkeypatch) -> None:  # type: ignore[no-untyped-def]
        """A ``.dlm`` outside the cwd falls back to its absolute path —
        a relative form would point at a nonexistent parent directory on
        a fresh checkout."""
        from dlm_sway.integrations.dlm.autogen import _portable_dlm_source

        # Two sibling trees under tmp_path: one is cwd, the other holds
        # the .dlm.
        working_dir = tmp_path / "cwd"
        working_dir.mkdir()
        other_tree = tmp_path / "other"
        other_tree.mkdir()
        target = other_tree / "demo.dlm"
        target.write_text("# empty\n")
        monkeypatch.chdir(working_dir)

        source = _portable_dlm_source(target)
        assert Path(source).is_absolute()
        assert source == str(target.resolve())
| 265 | |
| 266 | |
class TestAutogenClusterKL:
    """F07 — autogen emits ``cluster_kl`` only when the prompt pool is
    large enough to clear S16's ``min_prompts=20`` floor; otherwise the
    entry is left out."""

    @staticmethod
    def _cluster_kl_entry(probe_count: int):  # type: ignore[no-untyped-def]
        """Return the first ``cluster_kl`` suite entry generated for a
        handle with ``probe_count`` instruction probes, or ``None``."""
        spec = build_spec_dict(_handle_with_many_instruction_probes(probe_count))
        for candidate in spec["suite"]:
            if candidate["kind"] == "cluster_kl":
                return candidate
        return None

    def test_emits_cluster_kl_when_prompt_pool_is_large(self) -> None:
        """25 instruction probes are enough for the entry to be emitted
        with the expected parameters."""
        entry = self._cluster_kl_entry(25)
        assert entry is not None, "autogen should emit cluster_kl on large prompt pools"
        assert entry["num_clusters"] == 5
        assert entry["min_prompts"] == 20

        pool_size = len(entry["prompts"])
        assert pool_size >= 20
        # Capped at 64 so a doc with hundreds of probes doesn't explode
        # the cluster runtime.
        assert pool_size <= 64

    def test_omits_cluster_kl_on_small_prompt_pool(self) -> None:
        """Under 20 prompts → no entry. The probe would SKIP anyway, and
        not emitting it keeps the autogen'd YAML tidy."""
        assert self._cluster_kl_entry(5) is None

    def test_prompts_deduplicated(self) -> None:
        """The merged pool (instruction prompts + prose leading
        sentences) must not contain verbatim duplicates."""
        entry = self._cluster_kl_entry(30)
        assert entry is not None
        pool = entry["prompts"]
        assert len(pool) == len(set(pool))