| 1 | """Tests for :mod:`dlm_sway.suite.compare` (S11 / F5).""" |
| 2 | |
| 3 | from __future__ import annotations |
| 4 | |
| 5 | import json |
| 6 | import os |
| 7 | from datetime import UTC, datetime |
| 8 | from pathlib import Path |
| 9 | |
| 10 | import pytest |
| 11 | from rich.console import Console |
| 12 | |
| 13 | from dlm_sway.core.result import ( |
| 14 | ProbeResult, |
| 15 | SuiteResult, |
| 16 | SwayScore, |
| 17 | Verdict, |
| 18 | ) |
| 19 | from dlm_sway.suite.compare import ( |
| 20 | CompareMatrix, |
| 21 | build_matrix, |
| 22 | render_json, |
| 23 | render_markdown, |
| 24 | render_terminal, |
| 25 | ) |
| 26 | |
# Directory holding golden snapshot files (tests/snapshots), resolved
# relative to this test module's location.
SNAPSHOT_DIR = Path(__file__).parent.parent / "snapshots"
| 28 | |
| 29 | |
def _probe(name: str, score: float | None, verdict: Verdict = Verdict.PASS) -> ProbeResult:
    """Build a minimal :class:`ProbeResult` fixture named *name*.

    The probe kind is derived from the name's first underscore-delimited
    segment; names without an underscore fall back to ``"delta_kl"``.
    """
    kind = name.split("_")[0] if "_" in name else "delta_kl"
    return ProbeResult(
        name=name,
        kind=kind,
        verdict=verdict,
        score=score,
        raw=score,
        message=f"{name} score={score}",
    )
| 39 | |
| 40 | |
def _suite(
    probes: list[ProbeResult], *, started_minute: int, overall: float
) -> tuple[SuiteResult, SwayScore]:
    """Build a ``(SuiteResult, SwayScore)`` pair for one synthetic run.

    ``started_minute`` is an offset from noon in minutes, so 0 / 30 / 60
    map to 12:00 / 12:30 / 13:00 UTC; each run "lasts" thirty seconds.
    """
    extra_hours, minute = divmod(started_minute, 60)
    begin = datetime(2026, 1, 1, 12 + extra_hours, minute, 0, tzinfo=UTC)
    end = datetime(2026, 1, 1, 12 + extra_hours, minute, 30, tzinfo=UTC)
    score = SwayScore(
        overall=overall,
        components={"adherence": overall, "attribution": overall},
        band=SwayScore.band_for(overall),
    )
    suite = SuiteResult(
        spec_path="fixture.yaml",
        started_at=begin,
        finished_at=end,
        base_model_id="test/base",
        adapter_id="adapter-v1",
        sway_version="0.0.0",
        probes=tuple(probes),
    )
    return suite, score
| 64 | |
| 65 | |
def _three_run_history() -> list[tuple[SuiteResult, SwayScore]]:
    """Three synthetic runs with overlapping-but-not-identical probe sets:

    - run-a: {delta_kl, section_internalization}
    - run-b: {delta_kl, section_internalization, leakage} (leakage *added*)
    - run-c: {delta_kl, leakage} (section_internalization *removed*)
    """
    # Each spec is (probe name/score pairs, start offset in minutes, overall).
    specs: list[tuple[list[tuple[str, float]], int, float]] = [
        ([("delta_kl", 0.80), ("section_internalization", 0.70)], 0, 0.75),
        (
            [
                ("delta_kl", 0.82),
                ("section_internalization", 0.72),
                ("leakage", 0.90),
            ],
            30,
            0.81,
        ),
        # run-c: delta_kl dropped 0.17, leakage dropped 0.02.
        ([("delta_kl", 0.65), ("leakage", 0.88)], 60, 0.72),
    ]
    return [
        _suite(
            [_probe(name, value) for name, value in probes],
            started_minute=offset,
            overall=overall,
        )
        for probes, offset, overall in specs
    ]
| 99 | |
| 100 | |
class TestBuildMatrix:
    """Behavior of ``build_matrix`` over the synthetic three-run history."""

    def test_union_probe_names_sorted(self) -> None:
        # Probe names are the sorted union across every run.
        result = build_matrix(_three_run_history())
        assert result.probe_names == ("delta_kl", "leakage", "section_internalization")

    def test_missing_cells_are_none(self) -> None:
        result = build_matrix(_three_run_history())
        leakage = result.scores["leakage"]
        # leakage was absent in run-a and present from run-b onward.
        assert leakage[0] is None
        assert leakage[1] == pytest.approx(0.90)
        # section_internalization was removed in run-c.
        assert result.scores["section_internalization"][2] is None

    def test_deltas_skip_none_neighbors(self) -> None:
        result = build_matrix(_three_run_history())
        # leakage series is None -> 0.90 -> 0.88: a None neighbor yields a
        # None delta, adjacent real values yield their difference.
        leakage_deltas = result.deltas["leakage"]
        assert leakage_deltas[0] is None
        assert leakage_deltas[1] == pytest.approx(-0.02)
        # section_internalization series is 0.70 -> 0.72 -> None.
        si_deltas = result.deltas["section_internalization"]
        assert si_deltas[0] == pytest.approx(0.02)
        assert si_deltas[1] is None

    def test_composite_series_parallels_labels(self) -> None:
        result = build_matrix(_three_run_history())
        assert result.n_runs == 3
        assert result.composite_series == pytest.approx([0.75, 0.81, 0.72])

    def test_labels_default_to_timestamps(self) -> None:
        # Without an override, labels come from finished_at ISO strings.
        result = build_matrix(_three_run_history())
        assert result.labels[0].startswith("2026-01-01T12:00")

    def test_labels_override(self) -> None:
        history = _three_run_history()
        result = build_matrix(history, labels=["v1", "v2", "v3"])
        assert result.labels == ("v1", "v2", "v3")

    def test_labels_length_mismatch_raises(self) -> None:
        history = _three_run_history()
        with pytest.raises(ValueError, match="labels length"):
            build_matrix(history, labels=["v1", "v2"])

    def test_empty_results_raises(self) -> None:
        with pytest.raises(ValueError, match="at least one run"):
            build_matrix([])

    def test_non_finite_score_coerced_to_none(self) -> None:
        # NaN scores must not leak into the matrix as numbers.
        run = _suite([_probe("delta_kl", float("nan"))], started_minute=0, overall=0.5)
        result = build_matrix([run])
        assert result.scores["delta_kl"] == [None]
| 152 | |
| 153 | |
class TestLatestRegressions:
    """Behavior of ``CompareMatrix.latest_regressions``."""

    def test_single_run_yields_empty(self) -> None:
        # One run has no predecessor, so nothing can regress.
        run = _suite([_probe("dk", 0.5)], started_minute=0, overall=0.5)
        result = build_matrix([run])
        assert result.latest_regressions(threshold=0.1) == []

    def test_catches_newest_regression(self) -> None:
        result = build_matrix(_three_run_history())
        regressions = result.latest_regressions(threshold=0.10)
        # Only delta_kl (down 0.17) crosses 0.10; leakage fell just 0.02.
        assert [probe for probe, _ in regressions] == ["delta_kl"]
        assert regressions[0][1] == pytest.approx(-0.17)

    def test_zero_threshold_disables(self) -> None:
        result = build_matrix(_three_run_history())
        assert result.latest_regressions(threshold=0.0) == []

    def test_sorted_by_severity(self) -> None:
        before = _suite(
            [_probe("p1", 0.9), _probe("p2", 0.9), _probe("p3", 0.9)],
            started_minute=0,
            overall=0.9,
        )
        after = _suite(
            [_probe("p1", 0.6), _probe("p2", 0.4), _probe("p3", 0.75)],
            started_minute=30,
            overall=0.6,
        )
        regressions = build_matrix([before, after]).latest_regressions(threshold=0.10)
        # Largest drop first: p2 (-0.5), then p1 (-0.3), then p3 (-0.15).
        assert [probe for probe, _ in regressions] == ["p2", "p1", "p3"]
| 186 | |
| 187 | |
class TestRenderJson:
    """JSON rendering of the comparison matrix."""

    def test_round_trip(self) -> None:
        result = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        payload = json.loads(render_json(result, regression_threshold=0.10))
        assert payload["labels"] == ["v1", "v2", "v3"]
        assert payload["composite_series"] == pytest.approx([0.75, 0.81, 0.72])
        worst = payload["latest_regressions"][0]
        assert worst["probe"] == "delta_kl"
        assert worst["delta"] == pytest.approx(-0.17)
        assert payload["regression_threshold"] == pytest.approx(0.10)

    def test_no_regression_at_higher_threshold(self) -> None:
        # A threshold above every observed drop yields no regressions.
        result = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        payload = json.loads(render_json(result, regression_threshold=0.50))
        assert payload["latest_regressions"] == []
| 204 | |
| 205 | |
class TestRenderTerminal:
    """Smoke tests for the rich terminal renderer."""

    def test_renders_without_error(self) -> None:
        result = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        recorder = Console(record=True, width=160)
        render_terminal(result, console=recorder, regression_threshold=0.10)
        text = recorder.export_text()
        # Headers, probe names, composite row, and the regression footer
        # should all appear somewhere in the captured output.
        for needle in ("sway compare", "delta_kl", "composite", "regressions"):
            assert needle in text

    def test_suppresses_regression_block_when_none(self) -> None:
        # With no regressions past the threshold, the footer is omitted.
        result = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        recorder = Console(record=True, width=160)
        render_terminal(result, console=recorder, regression_threshold=0.90)
        assert "regressions" not in recorder.export_text()
| 224 | |
| 225 | |
class TestRenderMarkdownSnapshot:
    def test_markdown_snapshot(self) -> None:
        """Lock the markdown layout. Run
        ``SWAY_UPDATE_SNAPSHOTS=1 uv run pytest tests/unit/test_compare.py``
        to regenerate after an intentional format change.
        """
        result = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        rendered = render_markdown(result, regression_threshold=0.10)

        snapshot = SNAPSHOT_DIR / "compare.md"
        regenerate = os.environ.get("SWAY_UPDATE_SNAPSHOTS") == "1"
        if regenerate or not snapshot.exists():
            # (Re)write the snapshot and skip, so a freshly written
            # expectation never masquerades as a real pass.
            snapshot.parent.mkdir(parents=True, exist_ok=True)
            snapshot.write_text(rendered, encoding="utf-8")
            pytest.skip(
                "snapshot compare.md written — re-run without SWAY_UPDATE_SNAPSHOTS to verify"
            )
        assert rendered == snapshot.read_text(encoding="utf-8"), (
            "compare.md drifted from snapshot.\n"
            "To accept the new output intentionally, run:\n"
            " SWAY_UPDATE_SNAPSHOTS=1 uv run pytest tests/unit/test_compare.py\n"
            "and commit the updated file.\n"
        )
| 249 | |
| 250 | |
class TestCompareMatrixProperties:
    """Direct checks on ``CompareMatrix`` computed properties."""

    def test_n_runs_matches_labels(self) -> None:
        # n_runs is derived from the number of labels.
        bare = CompareMatrix(
            labels=("a", "b"),
            timestamps=("", ""),
            probe_names=(),
        )
        assert bare.n_runs == 2