tenseleyflow/sway / 45360bd

Browse files

tests/unit: prove-the-value — sway compare catches planted regression in committed history

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
45360bdb47ace7c05789bc2909db52dc1d612311
Parents
309c13e
Tree
52649e3

1 changed file

Status | File | Added | Removed
A (added) | tests/unit/test_compare_prove_value.py | +99 | −0
tests/unit/test_compare_prove_value.py (added)
@@ -0,0 +1,99 @@
1
+"""S11 prove-the-value: committed history fixture catches a planted regression.
2
+
3
+The sprint's audit §F5 experiment: commit a ``sway-history/`` directory
4
+with N dated result JSONs, run ``sway compare --fail-on-regression 0.10``,
5
+watch it flag the planted regression in the newest run.
6
+
7
+Fixture layout (``tests/fixtures/sway-history/``):
8
+    01-2026-01-15.json  — baseline (overall=0.78)
9
+    02-2026-01-22.json  — improved (overall=0.81) — adapter v2 retrained
10
+    03-2026-01-29.json  — over-trained (overall=0.66) — adapter v3
11
+
12
+The planted story: v3 is an over-trained adapter. ``delta_kl`` rises
13
+(more divergence from base) but ``section_internalization`` and
14
+``calibration_drift`` both drop sharply — the adapter memorized one
15
+section at the cost of the others, and forgot broader knowledge.
16
+``sway compare --fail-on-regression 0.10`` must exit 1 and surface the
17
+two regressions.
18
+"""
19
+
20
from __future__ import annotations

import json
from pathlib import Path

from typer.testing import CliRunner

from dlm_sway.cli.app import app
# Committed fixture directory (tests/fixtures/sway-history/) holding the three
# dated result JSONs that this module's tests feed to ``sway compare``.
HISTORY_DIR = Path(__file__).parent.parent / "fixtures" / "sway-history"
def _history_paths() -> list[str]:
    """Return the three fixture JSONs, date-sorted, as CLI-ready path strings.

    Fails loudly if the committed fixture directory is incomplete, so the
    compare tests below don't chase a confusing downstream error instead.
    """
    found = sorted(HISTORY_DIR.glob("*.json"))
    assert len(found) == 3, f"expected 3 history JSONs, got {len(found)}: {found}"
    return list(map(str, found))
def test_history_fixture_exists() -> None:
    """Sanity: the committed history fixture is three JSONs in order."""
    expected = [
        "01-2026-01-15.json",
        "02-2026-01-22.json",
        "03-2026-01-29.json",
    ]
    actual = [entry.name for entry in sorted(HISTORY_DIR.glob("*.json"))]
    assert actual == expected
def test_compare_catches_planted_regression() -> None:
    """The prove-the-value run: --fail-on-regression 0.10 exits 1 + flags both probes.

    Runs ``sway compare`` over all three committed history JSONs and checks:
    the gate exits 1, the JSON payload names exactly the two planted
    regressions, and the shape contract (probe set + composite timeline) holds.
    """
    result = CliRunner().invoke(
        app,
        ["compare", *_history_paths(), "--format", "json", "--fail-on-regression", "0.10"],
    )
    assert result.exit_code == 1, (
        "expected exit=1 (planted regression ≥ 0.10 on run 03); "
        f"got exit={result.exit_code}\nstdout:\n{result.stdout}"
    )
    # JSON payload is emitted to stdout even though the gate failed.
    # (``json`` is imported at module scope, per PEP 8 — no inline import.)
    parsed = json.loads(result.stdout)
    regressed_names = [entry["probe"] for entry in parsed["latest_regressions"]]
    # Both section_internalization and calibration_drift dropped ≥0.10.
    assert "section_internalization" in regressed_names, regressed_names
    assert "calibration_drift" in regressed_names, regressed_names
    # delta_kl rose, not regressed — must not appear.
    assert "delta_kl" not in regressed_names

    # Shape contract: 4 probes × 3 runs + composite timeline present.
    assert set(parsed["probe_names"]) == {
        "adapter_ablation",
        "calibration_drift",
        "delta_kl",
        "section_internalization",
    }
    assert len(parsed["composite_series"]) == 3
    # Composite drops on the third run.
    assert parsed["composite_series"][2] < parsed["composite_series"][1]
def test_compare_improving_history_exits_0() -> None:
    """Control: the first two runs alone show an improvement — exit 0."""
    baseline_and_improved = _history_paths()[:2]
    cli_args = [
        "compare",
        *baseline_and_improved,
        "--format",
        "json",
        "--fail-on-regression",
        "0.10",
    ]
    result = CliRunner().invoke(app, cli_args)
    assert result.exit_code == 0, result.stdout
def test_terminal_output_surfaces_regressions() -> None:
    """Human-facing terminal render lists the regressed probe names."""
    cli_args = ["compare", *_history_paths(), "--fail-on-regression", "0.10"]
    result = CliRunner().invoke(app, cli_args)
    assert result.exit_code == 1, result.stdout
    rendered = result.stdout
    assert "regressions" in rendered.lower()
    assert "section_internalization" in rendered
    assert "calibration_drift" in rendered