sway Public

Watch 0 Fork 0 Star 0

Python · 3808 bytes Raw Blame History

  
        1
        """S11 prove-the-value: committed history fixture catches a planted regression.
      
        2
        
        3
        The sprint's audit §F5 experiment: commit a ``sway-history/`` directory
      
        4
        with N dated result JSONs, run ``sway compare --fail-on-regression 0.10``,
      
        5
        watch it flag the planted regression in the newest run.
      
        6
        
        7
        Fixture layout (``tests/fixtures/sway-history/``):
      
        8
            01-2026-01-15.json  — baseline (overall=0.78)
      
        9
            02-2026-01-22.json  — improved (overall=0.81) — adapter v2 retrained
      
        10
            03-2026-01-29.json  — over-trained (overall=0.66) — adapter v3
      
        11
        
        12
        The planted story: v3 is an over-trained adapter. ``delta_kl`` rises
      
        13
        (more divergence from base) but ``section_internalization`` and
      
        14
        ``calibration_drift`` both drop sharply — the adapter memorized one
      
        15
        section at the cost of the others, and forgot broader knowledge.
      
        16
        ``sway compare --fail-on-regression 0.10`` must exit 1 and surface the
      
        17
        two regressions.
      
        18
        """
      
        19
        
        20
        from __future__ import annotations
      
        21
        
        22
        from pathlib import Path
      
        23
        
        24
        from typer.testing import CliRunner
      
        25
        
        26
        from dlm_sway.cli.app import app
      
        27
        
        28
        HISTORY_DIR = Path(__file__).parent.parent / "fixtures" / "sway-history"
      
        29
        
        30
        
        31
        def _history_paths() -> list[str]:
      
        32
            paths = sorted(HISTORY_DIR.glob("*.json"))
      
        33
            assert len(paths) == 3, f"expected 3 history JSONs, got {len(paths)}: {paths}"
      
        34
            return [str(p) for p in paths]
      
        35
        
        36
        
        37
        def test_history_fixture_exists() -> None:
      
        38
            """Sanity: the committed history fixture is three JSONs in order."""
      
        39
            paths = sorted(HISTORY_DIR.glob("*.json"))
      
        40
            assert [p.name for p in paths] == [
      
        41
                "01-2026-01-15.json",
      
        42
                "02-2026-01-22.json",
      
        43
                "03-2026-01-29.json",
      
        44
            ]
      
        45
        
        46
        
        47
        def test_compare_catches_planted_regression() -> None:
      
        48
            """The prove-the-value run: --fail-on-regression 0.10 exits 1 + flags both probes."""
      
        49
            result = CliRunner().invoke(
      
        50
                app,
      
        51
                ["compare", *_history_paths(), "--format", "json", "--fail-on-regression", "0.10"],
      
        52
            )
      
        53
            assert result.exit_code == 1, (
      
        54
                "expected exit=1 (planted regression ≥ 0.10 on run 03); "
      
        55
                f"got exit={result.exit_code}\nstdout:\n{result.stdout}"
      
        56
            )
      
        57
            # JSON payload is emitted to stdout even though the gate failed.
      
        58
            import json as _json
      
        59
        
        60
            parsed = _json.loads(result.stdout)
      
        61
            regressed_names = [entry["probe"] for entry in parsed["latest_regressions"]]
      
        62
            # Both section_internalization and calibration_drift dropped ≥0.10.
      
        63
            assert "section_internalization" in regressed_names, regressed_names
      
        64
            assert "calibration_drift" in regressed_names, regressed_names
      
        65
            # delta_kl rose, not regressed — must not appear.
      
        66
            assert "delta_kl" not in regressed_names
      
        67
        
        68
            # Shape contract: 4 probes × 3 runs + composite timeline present.
      
        69
            assert set(parsed["probe_names"]) == {
      
        70
                "adapter_ablation",
      
        71
                "calibration_drift",
      
        72
                "delta_kl",
      
        73
                "section_internalization",
      
        74
            }
      
        75
            assert len(parsed["composite_series"]) == 3
      
        76
            # Composite drops on the third run.
      
        77
            assert parsed["composite_series"][2] < parsed["composite_series"][1]
      
        78
        
        79
        
        80
        def test_compare_improving_history_exits_0() -> None:
      
        81
            """Control: the first two runs alone show an improvement — exit 0."""
      
        82
            first_two = _history_paths()[:2]
      
        83
            result = CliRunner().invoke(
      
        84
                app,
      
        85
                ["compare", *first_two, "--format", "json", "--fail-on-regression", "0.10"],
      
        86
            )
      
        87
            assert result.exit_code == 0, result.stdout
      
        88
        
        89
        
        90
        def test_terminal_output_surfaces_regressions() -> None:
      
        91
            """Human-facing terminal render lists the regressed probe names."""
      
        92
            result = CliRunner().invoke(
      
        93
                app,
      
        94
                ["compare", *_history_paths(), "--fail-on-regression", "0.10"],
      
        95
            )
      
        96
            assert result.exit_code == 1, result.stdout
      
        97
            assert "regressions" in result.stdout.lower()
      
        98
            assert "section_internalization" in result.stdout
      
        99
            assert "calibration_drift" in result.stdout

1	"""S11 prove-the-value: committed history fixture catches a planted regression.
2
3	The sprint's audit §F5 experiment: commit a ``sway-history/`` directory
4	with N dated result JSONs, run ``sway compare --fail-on-regression 0.10``,
5	watch it flag the planted regression in the newest run.
6
7	Fixture layout (``tests/fixtures/sway-history/``):
8	01-2026-01-15.json — baseline (overall=0.78)
9	02-2026-01-22.json — improved (overall=0.81) — adapter v2 retrained
10	03-2026-01-29.json — over-trained (overall=0.66) — adapter v3
11
12	The planted story: v3 is an over-trained adapter. ``delta_kl`` rises
13	(more divergence from base) but ``section_internalization`` and
14	``calibration_drift`` both drop sharply — the adapter memorized one
15	section at the cost of the others, and forgot broader knowledge.
16	``sway compare --fail-on-regression 0.10`` must exit 1 and surface the
17	two regressions.
18	"""
19
20	from __future__ import annotations
21
22	from pathlib import Path
23
24	from typer.testing import CliRunner
25
26	from dlm_sway.cli.app import app
27
28	HISTORY_DIR = Path(__file__).parent.parent / "fixtures" / "sway-history"
29
30
31	def _history_paths() -> list[str]:
32	paths = sorted(HISTORY_DIR.glob("*.json"))
33	assert len(paths) == 3, f"expected 3 history JSONs, got {len(paths)}: {paths}"
34	return [str(p) for p in paths]
35
36
37	def test_history_fixture_exists() -> None:
38	"""Sanity: the committed history fixture is three JSONs in order."""
39	paths = sorted(HISTORY_DIR.glob("*.json"))
40	assert [p.name for p in paths] == [
41	"01-2026-01-15.json",
42	"02-2026-01-22.json",
43	"03-2026-01-29.json",
44	]
45
46
47	def test_compare_catches_planted_regression() -> None:
48	"""The prove-the-value run: --fail-on-regression 0.10 exits 1 + flags both probes."""
49	result = CliRunner().invoke(
50	app,
51	["compare", *_history_paths(), "--format", "json", "--fail-on-regression", "0.10"],
52	)
53	assert result.exit_code == 1, (
54	"expected exit=1 (planted regression ≥ 0.10 on run 03); "
55	f"got exit={result.exit_code}\nstdout:\n{result.stdout}"
56	)
57	# JSON payload is emitted to stdout even though the gate failed.
58	import json as _json
59
60	parsed = _json.loads(result.stdout)
61	regressed_names = [entry["probe"] for entry in parsed["latest_regressions"]]
62	# Both section_internalization and calibration_drift dropped ≥0.10.
63	assert "section_internalization" in regressed_names, regressed_names
64	assert "calibration_drift" in regressed_names, regressed_names
65	# delta_kl rose, not regressed — must not appear.
66	assert "delta_kl" not in regressed_names
67
68	# Shape contract: 4 probes × 3 runs + composite timeline present.
69	assert set(parsed["probe_names"]) == {
70	"adapter_ablation",
71	"calibration_drift",
72	"delta_kl",
73	"section_internalization",
74	}
75	assert len(parsed["composite_series"]) == 3
76	# Composite drops on the third run.
77	assert parsed["composite_series"][2] < parsed["composite_series"][1]
78
79
80	def test_compare_improving_history_exits_0() -> None:
81	"""Control: the first two runs alone show an improvement — exit 0."""
82	first_two = _history_paths()[:2]
83	result = CliRunner().invoke(
84	app,
85	["compare", *first_two, "--format", "json", "--fail-on-regression", "0.10"],
86	)
87	assert result.exit_code == 0, result.stdout
88
89
90	def test_terminal_output_surfaces_regressions() -> None:
91	"""Human-facing terminal render lists the regressed probe names."""
92	result = CliRunner().invoke(
93	app,
94	["compare", *_history_paths(), "--fail-on-regression", "0.10"],
95	)
96	assert result.exit_code == 1, result.stdout
97	assert "regressions" in result.stdout.lower()
98	assert "section_internalization" in result.stdout
99	assert "calibration_drift" in result.stdout