@@ -0,0 +1,99 @@ |
| 1 | +"""S11 prove-the-value: committed history fixture catches a planted regression. |
| 2 | + |
| 3 | +The sprint's audit §F5 experiment: commit a ``sway-history/`` directory |
| 4 | +with N dated result JSONs, run ``sway compare --fail-on-regression 0.10``, |
| 5 | +watch it flag the planted regression in the newest run. |
| 6 | + |
| 7 | +Fixture layout (``tests/fixtures/sway-history/``): |
| 8 | + 01-2026-01-15.json — baseline (overall=0.78) |
| 9 | + 02-2026-01-22.json — improved (overall=0.81) — adapter v2 retrained |
| 10 | + 03-2026-01-29.json — over-trained (overall=0.66) — adapter v3 |
| 11 | + |
| 12 | +The planted story: v3 is an over-trained adapter. ``delta_kl`` rises |
| 13 | +(more divergence from base) but ``section_internalization`` and |
| 14 | +``calibration_drift`` both drop sharply — the adapter memorized one |
| 15 | +section at the cost of the others, and forgot broader knowledge. |
| 16 | +``sway compare --fail-on-regression 0.10`` must exit 1 and surface the |
| 17 | +two regressions. |
| 18 | +""" |
| 19 | + |
| 20 | +from __future__ import annotations |
| 21 | + |
| 22 | +from pathlib import Path |
| 23 | + |
| 24 | +from typer.testing import CliRunner |
| 25 | + |
| 26 | +from dlm_sway.cli.app import app |
| 27 | + |
# Location of the committed history fixture: the ``fixtures/sway-history``
# directory that sits one level above this test file's own directory.
HISTORY_DIR = Path(__file__).parent.parent / "fixtures" / "sway-history"
| 29 | + |
| 30 | + |
def _history_paths() -> list[str]:
    """Return the history fixture JSON paths as strings, in sorted order.

    Globs ``*.json`` under ``HISTORY_DIR`` and asserts that exactly three
    files were found before handing the paths back to the caller.
    """
    found = sorted(HISTORY_DIR.glob("*.json"))
    count = len(found)
    assert count == 3, f"expected 3 history JSONs, got {count}: {found}"
    return list(map(str, found))
| 35 | + |
| 36 | + |
def test_history_fixture_exists() -> None:
    """Sanity check: the fixture directory holds exactly the three dated JSONs."""
    expected_names = [
        "01-2026-01-15.json",
        "02-2026-01-22.json",
        "03-2026-01-29.json",
    ]
    # Sort the paths themselves so the comparison is order-sensitive.
    actual_names = [entry.name for entry in sorted(HISTORY_DIR.glob("*.json"))]
    assert actual_names == expected_names
| 45 | + |
| 46 | + |
def test_compare_catches_planted_regression() -> None:
    """Prove-the-value run: the gate at 0.10 exits 1 and flags both probes.

    Invokes ``compare`` over all three history files with JSON output, then
    checks the exit code, the list of regressed probes, and the overall
    shape of the emitted payload.
    """
    import json

    argv = [
        "compare",
        *_history_paths(),
        "--format",
        "json",
        "--fail-on-regression",
        "0.10",
    ]
    outcome = CliRunner().invoke(app, argv)
    assert outcome.exit_code == 1, (
        "expected exit=1 (planted regression ≥ 0.10 on run 03); "
        f"got exit={outcome.exit_code}\nstdout:\n{outcome.stdout}"
    )

    # The JSON payload is still written to stdout when the gate fails.
    payload = json.loads(outcome.stdout)
    flagged = [entry["probe"] for entry in payload["latest_regressions"]]

    # Both of these probes dropped by at least 0.10 in the newest run.
    for probe in ("section_internalization", "calibration_drift"):
        assert probe in flagged, flagged
    # delta_kl went up rather than down, so it must not be reported.
    assert "delta_kl" not in flagged

    # Shape contract: four probes, plus a composite value per run.
    expected_probes = {
        "adapter_ablation",
        "calibration_drift",
        "delta_kl",
        "section_internalization",
    }
    assert set(payload["probe_names"]) == expected_probes

    composite = payload["composite_series"]
    assert len(composite) == 3
    # The composite score falls from the second run to the third.
    assert composite[2] < composite[1]
| 78 | + |
| 79 | + |
def test_compare_improving_history_exits_0() -> None:
    """Control case: comparing only the first two runs must exit 0."""
    # _history_paths() guarantees three entries, so the slice always has two.
    baseline, improved = _history_paths()[:2]
    argv = [
        "compare",
        baseline,
        improved,
        "--format",
        "json",
        "--fail-on-regression",
        "0.10",
    ]
    outcome = CliRunner().invoke(app, argv)
    assert outcome.exit_code == 0, outcome.stdout
| 88 | + |
| 89 | + |
def test_terminal_output_surfaces_regressions() -> None:
    """The default (non-JSON) render names each regressed probe."""
    argv = ["compare", *_history_paths(), "--fail-on-regression", "0.10"]
    outcome = CliRunner().invoke(app, argv)
    rendered = outcome.stdout

    assert outcome.exit_code == 1, rendered
    # The word is matched case-insensitively; probe names are matched exactly.
    assert "regressions" in rendered.lower()
    for probe in ("section_internalization", "calibration_drift"):
        assert probe in rendered