`45360bd`

tests/unit: prove-the-value — sway compare catches planted regression in committed history

Authored by mfwolffe <wolffemf@dukes.jmu.edu> 3 weeks ago

SHA: 45360bdb47ace7c05789bc2909db52dc1d612311
Parents: 309c13e
Tree: 52649e3

1 changed file

Status	File	+	-
A	`tests/unit/test_compare_prove_value.py`	99	0

tests/unit/test_compare_prove_value.py added

 +"""S11 prove-the-value: committed history fixture catches a planted regression.
++
 +The sprint's audit §F5 experiment: commit a ``sway-history/`` directory
 +with N dated result JSONs, run ``sway compare --fail-on-regression 0.10``,
 +watch it flag the planted regression in the newest run.
++
 +Fixture layout (``tests/fixtures/sway-history/``):
 +    01-2026-01-15.json  — baseline (overall=0.78)
 +    02-2026-01-22.json  — improved (overall=0.81) — adapter v2 retrained
 +    03-2026-01-29.json  — over-trained (overall=0.66) — adapter v3
++
 +The planted story: v3 is an over-trained adapter. ``delta_kl`` rises
 +(more divergence from base) but ``section_internalization`` and
 +``calibration_drift`` both drop sharply — the adapter memorized one
 +section at the cost of the others, and forgot broader knowledge.
 +``sway compare --fail-on-regression 0.10`` must exit 1 and surface the
 +two regressions.
 +"""
++
 +from __future__ import annotations
++
 +from pathlib import Path
++
 +from typer.testing import CliRunner
++
 +from dlm_sway.cli.app import app
++
 +HISTORY_DIR = Path(__file__).parent.parent / "fixtures" / "sway-history"
++
++
 +def _history_paths() -> list[str]:
 +    paths = sorted(HISTORY_DIR.glob("*.json"))
 +    assert len(paths) == 3, f"expected 3 history JSONs, got {len(paths)}: {paths}"
 +    return [str(p) for p in paths]
++
++
 +def test_history_fixture_exists() -> None:
 +    """Sanity: the committed history fixture is three JSONs in order."""
 +    paths = sorted(HISTORY_DIR.glob("*.json"))
 +    assert [p.name for p in paths] == [
 +        "01-2026-01-15.json",
 +        "02-2026-01-22.json",
 +        "03-2026-01-29.json",
 +    ]
++
++
 +def test_compare_catches_planted_regression() -> None:
 +    """The prove-the-value run: --fail-on-regression 0.10 exits 1 + flags both probes."""
 +    result = CliRunner().invoke(
 +        app,
 +        ["compare", *_history_paths(), "--format", "json", "--fail-on-regression", "0.10"],
 +    )
 +    assert result.exit_code == 1, (
 +        "expected exit=1 (planted regression ≥ 0.10 on run 03); "
 +        f"got exit={result.exit_code}\nstdout:\n{result.stdout}"
 +    )
 +    # JSON payload is emitted to stdout even though the gate failed.
 +    import json as _json
++
 +    parsed = _json.loads(result.stdout)
 +    regressed_names = [entry["probe"] for entry in parsed["latest_regressions"]]
 +    # Both section_internalization and calibration_drift dropped ≥0.10.
 +    assert "section_internalization" in regressed_names, regressed_names
 +    assert "calibration_drift" in regressed_names, regressed_names
 +    # delta_kl rose, not regressed — must not appear.
 +    assert "delta_kl" not in regressed_names
++
 +    # Shape contract: 4 probes × 3 runs + composite timeline present.
 +    assert set(parsed["probe_names"]) == {
 +        "adapter_ablation",
 +        "calibration_drift",
 +        "delta_kl",
 +        "section_internalization",
 +    }
 +    assert len(parsed["composite_series"]) == 3
 +    # Composite drops on the third run.
 +    assert parsed["composite_series"][2] < parsed["composite_series"][1]
++
++
 +def test_compare_improving_history_exits_0() -> None:
 +    """Control: the first two runs alone show an improvement — exit 0."""
 +    first_two = _history_paths()[:2]
 +    result = CliRunner().invoke(
 +        app,
 +        ["compare", *first_two, "--format", "json", "--fail-on-regression", "0.10"],
 +    )
 +    assert result.exit_code == 0, result.stdout
++
++
 +def test_terminal_output_surfaces_regressions() -> None:
 +    """Human-facing terminal render lists the regressed probe names."""
 +    result = CliRunner().invoke(
 +        app,
 +        ["compare", *_history_paths(), "--fail-on-regression", "0.10"],
 +    )
 +    assert result.exit_code == 1, result.stdout
 +    assert "regressions" in result.stdout.lower()
 +    assert "section_internalization" in result.stdout
 +    assert "calibration_drift" in result.stdout