Python · 3808 bytes Raw Blame History
1 """S11 prove-the-value: committed history fixture catches a planted regression.
2
3 The sprint's audit §F5 experiment: commit a ``sway-history/`` directory
4 with N dated result JSONs, run ``sway compare --fail-on-regression 0.10``,
5 watch it flag the planted regression in the newest run.
6
7 Fixture layout (``tests/fixtures/sway-history/``):
8 01-2026-01-15.json — baseline (overall=0.78)
9 02-2026-01-22.json — improved (overall=0.81) — adapter v2 retrained
10 03-2026-01-29.json — over-trained (overall=0.66) — adapter v3
11
12 The planted story: v3 is an over-trained adapter. ``delta_kl`` rises
13 (more divergence from base) but ``section_internalization`` and
14 ``calibration_drift`` both drop sharply — the adapter memorized one
15 section at the cost of the others, and forgot broader knowledge.
16 ``sway compare --fail-on-regression 0.10`` must exit 1 and surface the
17 two regressions.
18 """
19
20 from __future__ import annotations
21
22 from pathlib import Path
23
24 from typer.testing import CliRunner
25
26 from dlm_sway.cli.app import app
27
28 HISTORY_DIR = Path(__file__).parent.parent / "fixtures" / "sway-history"
29
30
31 def _history_paths() -> list[str]:
32 paths = sorted(HISTORY_DIR.glob("*.json"))
33 assert len(paths) == 3, f"expected 3 history JSONs, got {len(paths)}: {paths}"
34 return [str(p) for p in paths]
35
36
37 def test_history_fixture_exists() -> None:
38 """Sanity: the committed history fixture is three JSONs in order."""
39 paths = sorted(HISTORY_DIR.glob("*.json"))
40 assert [p.name for p in paths] == [
41 "01-2026-01-15.json",
42 "02-2026-01-22.json",
43 "03-2026-01-29.json",
44 ]
45
46
47 def test_compare_catches_planted_regression() -> None:
48 """The prove-the-value run: --fail-on-regression 0.10 exits 1 + flags both probes."""
49 result = CliRunner().invoke(
50 app,
51 ["compare", *_history_paths(), "--format", "json", "--fail-on-regression", "0.10"],
52 )
53 assert result.exit_code == 1, (
54 "expected exit=1 (planted regression ≥ 0.10 on run 03); "
55 f"got exit={result.exit_code}\nstdout:\n{result.stdout}"
56 )
57 # JSON payload is emitted to stdout even though the gate failed.
58 import json as _json
59
60 parsed = _json.loads(result.stdout)
61 regressed_names = [entry["probe"] for entry in parsed["latest_regressions"]]
62 # Both section_internalization and calibration_drift dropped ≥0.10.
63 assert "section_internalization" in regressed_names, regressed_names
64 assert "calibration_drift" in regressed_names, regressed_names
65 # delta_kl rose, not regressed — must not appear.
66 assert "delta_kl" not in regressed_names
67
68 # Shape contract: 4 probes × 3 runs + composite timeline present.
69 assert set(parsed["probe_names"]) == {
70 "adapter_ablation",
71 "calibration_drift",
72 "delta_kl",
73 "section_internalization",
74 }
75 assert len(parsed["composite_series"]) == 3
76 # Composite drops on the third run.
77 assert parsed["composite_series"][2] < parsed["composite_series"][1]
78
79
80 def test_compare_improving_history_exits_0() -> None:
81 """Control: the first two runs alone show an improvement — exit 0."""
82 first_two = _history_paths()[:2]
83 result = CliRunner().invoke(
84 app,
85 ["compare", *first_two, "--format", "json", "--fail-on-regression", "0.10"],
86 )
87 assert result.exit_code == 0, result.stdout
88
89
90 def test_terminal_output_surfaces_regressions() -> None:
91 """Human-facing terminal render lists the regressed probe names."""
92 result = CliRunner().invoke(
93 app,
94 ["compare", *_history_paths(), "--fail-on-regression", "0.10"],
95 )
96 assert result.exit_code == 1, result.stdout
97 assert "regressions" in result.stdout.lower()
98 assert "section_internalization" in result.stdout
99 assert "calibration_drift" in result.stdout