1 """Tests for :mod:`dlm_sway.suite.compare` (S11 / F5)."""
2
3 from __future__ import annotations
4
5 import json
6 import os
7 from datetime import UTC, datetime
8 from pathlib import Path
9
10 import pytest
11 from rich.console import Console
12
13 from dlm_sway.core.result import (
14 ProbeResult,
15 SuiteResult,
16 SwayScore,
17 Verdict,
18 )
19 from dlm_sway.suite.compare import (
20 CompareMatrix,
21 build_matrix,
22 render_json,
23 render_markdown,
24 render_terminal,
25 )
26
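# Snapshot files are resolved relative to this module (tests/unit/ -> tests/snapshots/);
# TestRenderMarkdownSnapshot below reads and writes compare.md there.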
SNAPSHOT_DIR = Path(__file__).parent.parent / "snapshots"


def _probe(name: str, score: float | None, verdict: Verdict = Verdict.PASS) -> ProbeResult:
    return ProbeResult(
        name=name,
        # Fixture shorthand: kind is the prefix before the first underscore
        # ("delta_kl" -> "delta"); names without an underscore fall back to "delta_kl".
        kind=name.split("_")[0] if "_" in name else "delta_kl",
        verdict=verdict,
        score=score,
        raw=score,
        message=f"{name} score={score}",
    )


def _suite(
    probes: list[ProbeResult], *, started_minute: int, overall: float
) -> tuple[SuiteResult, SwayScore]:
    # ``started_minute`` is treated as offset-from-noon in 30-minute
    # increments (run-a=0, run-b=30, run-c=60 maps to 12:00, 12:30, 13:00).
    hours_offset, mins = divmod(started_minute, 60)
    started = datetime(2026, 1, 1, 12 + hours_offset, mins, 0, tzinfo=UTC)
    finished = datetime(2026, 1, 1, 12 + hours_offset, mins, 30, tzinfo=UTC)
    suite = SuiteResult(
        spec_path="fixture.yaml",
        started_at=started,
        finished_at=finished,
        base_model_id="test/base",
        adapter_id="adapter-v1",
        sway_version="0.0.0",
        probes=tuple(probes),
    )
    score = SwayScore(
        overall=overall,
        components={"adherence": overall, "attribution": overall},
        band=SwayScore.band_for(overall),
    )
    return suite, score


def _three_run_history() -> list[tuple[SuiteResult, SwayScore]]:
    """Three synthetic runs with overlapping-but-not-identical probe sets:

    - run-a: {delta_kl, section_internalization}
    - run-b: {delta_kl, section_internalization, leakage} (leakage *added*)
    - run-c: {delta_kl, leakage} (section_internalization *removed*)
    """
    run_a = _suite(
        [
            _probe("delta_kl", 0.80),
            _probe("section_internalization", 0.70),
        ],
        started_minute=0,
        overall=0.75,
    )
    run_b = _suite(
        [
            _probe("delta_kl", 0.82),
            _probe("section_internalization", 0.72),
            _probe("leakage", 0.90),
        ],
        started_minute=30,
        overall=0.81,
    )
    run_c = _suite(
        [
            _probe("delta_kl", 0.65),  # dropped 0.17
            _probe("leakage", 0.88),  # dropped 0.02
        ],
        started_minute=60,
        overall=0.72,
    )
    return [run_a, run_b, run_c]

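# Score grid implied by the fixture (rows = union of probe names, columns =
# run-a / run-b / run-c), which the assertions below rely on:
#
#   delta_kl                 0.80  0.82  0.65
#   leakage                  None  0.90  0.88
#   section_internalization  0.70  0.72  None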

class TestBuildMatrix:
    def test_union_probe_names_sorted(self) -> None:
        matrix = build_matrix(_three_run_history())
        assert matrix.probe_names == ("delta_kl", "leakage", "section_internalization")

    def test_missing_cells_are_none(self) -> None:
        matrix = build_matrix(_three_run_history())
        # leakage did not exist in run-a.
        assert matrix.scores["leakage"][0] is None
        assert matrix.scores["leakage"][1] == pytest.approx(0.90)
        # section_internalization did not exist in run-c.
        assert matrix.scores["section_internalization"][2] is None

    def test_deltas_skip_none_neighbors(self) -> None:
        matrix = build_matrix(_three_run_history())
        # leakage: None → 0.90 → 0.88. First delta None (None neighbor);
        # second delta ≈ -0.02.
        assert matrix.deltas["leakage"][0] is None
        assert matrix.deltas["leakage"][1] == pytest.approx(-0.02)
        # section_internalization: 0.70 → 0.72 → None. First +0.02; second None.
        assert matrix.deltas["section_internalization"][0] == pytest.approx(0.02)
        assert matrix.deltas["section_internalization"][1] is None

    def test_composite_series_parallels_labels(self) -> None:
        matrix = build_matrix(_three_run_history())
        assert matrix.n_runs == 3
        assert matrix.composite_series == pytest.approx([0.75, 0.81, 0.72])

    def test_labels_default_to_timestamps(self) -> None:
        matrix = build_matrix(_three_run_history())
        # Default labels come from finished_at ISO strings.
        assert matrix.labels[0].startswith("2026-01-01T12:00")

    def test_labels_override(self) -> None:
        runs = _three_run_history()
        matrix = build_matrix(runs, labels=["v1", "v2", "v3"])
        assert matrix.labels == ("v1", "v2", "v3")

    def test_labels_length_mismatch_raises(self) -> None:
        runs = _three_run_history()
        with pytest.raises(ValueError, match="labels length"):
            build_matrix(runs, labels=["v1", "v2"])

    def test_empty_results_raises(self) -> None:
        with pytest.raises(ValueError, match="at least one run"):
            build_matrix([])

    def test_non_finite_score_coerced_to_none(self) -> None:
        suite, score = _suite([_probe("delta_kl", float("nan"))], started_minute=0, overall=0.5)
        matrix = build_matrix([(suite, score)])
        assert matrix.scores["delta_kl"] == [None]


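# Per the tests below, latest_regressions(threshold=...) compares only the two
# newest runs: probes missing from either of those runs are skipped, drops
# smaller than the threshold are ignored (a zero threshold reports nothing),
# and surviving regressions are sorted from largest drop to smallest.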
class TestLatestRegressions:
    def test_single_run_yields_empty(self) -> None:
        suite, score = _suite([_probe("dk", 0.5)], started_minute=0, overall=0.5)
        matrix = build_matrix([(suite, score)])
        assert matrix.latest_regressions(threshold=0.1) == []

    def test_catches_newest_regression(self) -> None:
        matrix = build_matrix(_three_run_history())
        regs = matrix.latest_regressions(threshold=0.10)
        # delta_kl dropped 0.17; leakage only 0.02 (below threshold).
        assert [name for name, _ in regs] == ["delta_kl"]
        assert regs[0][1] == pytest.approx(-0.17)

    def test_zero_threshold_disables(self) -> None:
        matrix = build_matrix(_three_run_history())
        assert matrix.latest_regressions(threshold=0.0) == []

    def test_sorted_by_severity(self) -> None:
        run_a = _suite(
            [_probe("p1", 0.9), _probe("p2", 0.9), _probe("p3", 0.9)],
            started_minute=0,
            overall=0.9,
        )
        run_b = _suite(
            [_probe("p1", 0.6), _probe("p2", 0.4), _probe("p3", 0.75)],
            started_minute=30,
            overall=0.6,
        )
        matrix = build_matrix([run_a, run_b])
        regs = matrix.latest_regressions(threshold=0.10)
        # p2 dropped 0.5, p1 dropped 0.3, p3 dropped 0.15 → order p2, p1, p3
        assert [name for name, _ in regs] == ["p2", "p1", "p3"]


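# JSON payload shape exercised by the round-trip test below (values come from
# the v1/v2/v3 fixture runs):
#   {"labels": [...], "composite_series": [...],
#    "latest_regressions": [{"probe": ..., "delta": ...}, ...],
#    "regression_threshold": ...}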
class TestRenderJson:
    def test_round_trip(self) -> None:
        matrix = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        raw = render_json(matrix, regression_threshold=0.10)
        parsed = json.loads(raw)
        assert parsed["labels"] == ["v1", "v2", "v3"]
        assert parsed["composite_series"] == pytest.approx([0.75, 0.81, 0.72])
        assert parsed["latest_regressions"][0]["probe"] == "delta_kl"
        assert parsed["latest_regressions"][0]["delta"] == pytest.approx(-0.17)
        assert parsed["regression_threshold"] == pytest.approx(0.10)

    def test_no_regression_at_higher_threshold(self) -> None:
        matrix = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        raw = render_json(matrix, regression_threshold=0.50)
        parsed = json.loads(raw)
        assert parsed["latest_regressions"] == []


class TestRenderTerminal:
    def test_renders_without_error(self) -> None:
        matrix = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        console = Console(record=True, width=160)
        render_terminal(matrix, console=console, regression_threshold=0.10)
        out = console.export_text()
        # Smoke: headers + probe names + composite all appear.
        assert "sway compare" in out
        assert "delta_kl" in out
        assert "composite" in out
        assert "regressions" in out  # regression footer emitted

    def test_suppresses_regression_block_when_none(self) -> None:
        matrix = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        console = Console(record=True, width=160)
        render_terminal(matrix, console=console, regression_threshold=0.90)
        out = console.export_text()
        assert "regressions" not in out


class TestRenderMarkdownSnapshot:
    def test_markdown_snapshot(self) -> None:
        """Lock the markdown layout. Run
        ``SWAY_UPDATE_SNAPSHOTS=1 uv run pytest tests/unit/test_compare.py``
        to regenerate after an intentional format change.
        """
        matrix = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        actual = render_markdown(matrix, regression_threshold=0.10)

        path = SNAPSHOT_DIR / "compare.md"
        if os.environ.get("SWAY_UPDATE_SNAPSHOTS") == "1" or not path.exists():
            path.parent.mkdir(parents=True, exist_ok=True)
            path.write_text(actual, encoding="utf-8")
            pytest.skip(
                "snapshot compare.md written — re-run without SWAY_UPDATE_SNAPSHOTS to verify"
            )
        expected = path.read_text(encoding="utf-8")
        assert actual == expected, (
            "compare.md drifted from snapshot.\n"
            "To accept the new output intentionally, run:\n"
            "SWAY_UPDATE_SNAPSHOTS=1 uv run pytest tests/unit/test_compare.py\n"
            "and commit the updated file.\n"
        )


class TestCompareMatrixProperties:
    def test_n_runs_matches_labels(self) -> None:
        matrix = CompareMatrix(
            labels=("a", "b"),
            timestamps=("", ""),
            probe_names=(),
        )
        assert matrix.n_runs == 2