tenseleyflow/sway / 51870c6

Browse files

tests/unit: compare — build_matrix + renderers + regression gate

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
51870c61aef9f60d16619cdb2f28d18ad2850920
Parents
957d724
Tree
990f23f

1 changed file

Status | File | Additions | Deletions
A | tests/unit/test_compare.py | +258 | -0
tests/unit/test_compare.py — added
@@ -0,0 +1,258 @@
1
+"""Tests for :mod:`dlm_sway.suite.compare` (S11 / F5)."""
2
+
3
+from __future__ import annotations
4
+
5
+import json
6
+import os
7
+from datetime import UTC, datetime
8
+from pathlib import Path
9
+
10
+import pytest
11
+from rich.console import Console
12
+
13
+from dlm_sway.core.result import (
14
+    ProbeResult,
15
+    SuiteResult,
16
+    SwayScore,
17
+    Verdict,
18
+)
19
+from dlm_sway.suite.compare import (
20
+    CompareMatrix,
21
+    build_matrix,
22
+    render_json,
23
+    render_markdown,
24
+    render_terminal,
25
+)
26
+
27
# Golden snapshot files live in tests/snapshots, one level above tests/unit.
SNAPSHOT_DIR = Path(__file__).parents[1] / "snapshots"
28
+
29
+
30
def _probe(name: str, score: float | None, verdict: Verdict = Verdict.PASS) -> ProbeResult:
    """Build a minimal :class:`ProbeResult` fixture.

    The probe ``kind`` is the prefix before the first underscore; names
    without an underscore fall back to ``"delta_kl"``.
    """
    kind = name.split("_")[0] if "_" in name else "delta_kl"
    return ProbeResult(
        name=name,
        kind=kind,
        verdict=verdict,
        score=score,
        raw=score,
        message=f"{name} score={score}",
    )
39
+
40
+
41
def _suite(
    probes: list[ProbeResult], *, started_minute: int, overall: float
) -> tuple[SuiteResult, SwayScore]:
    """Wrap *probes* into a ``(SuiteResult, SwayScore)`` fixture pair.

    ``started_minute`` is an offset from noon in minutes (run-a=0,
    run-b=30, run-c=60 map to 12:00, 12:30, 13:00); every run is given
    a 30-second duration.
    """
    extra_hours, minute = divmod(started_minute, 60)
    begin = datetime(2026, 1, 1, 12 + extra_hours, minute, 0, tzinfo=UTC)
    end = datetime(2026, 1, 1, 12 + extra_hours, minute, 30, tzinfo=UTC)
    suite = SuiteResult(
        spec_path="fixture.yaml",
        started_at=begin,
        finished_at=end,
        base_model_id="test/base",
        adapter_id="adapter-v1",
        sway_version="0.0.0",
        probes=tuple(probes),
    )
    # Both components mirror the requested overall so the composite is flat.
    score = SwayScore(
        overall=overall,
        components={"adherence": overall, "attribution": overall},
        band=SwayScore.band_for(overall),
    )
    return suite, score
64
+
65
+
66
def _three_run_history() -> list[tuple[SuiteResult, SwayScore]]:
    """Three synthetic runs with overlapping-but-not-identical probe sets:

    - run-a: {delta_kl, section_internalization}
    - run-b: {delta_kl, section_internalization, leakage} (leakage *added*)
    - run-c: {delta_kl, leakage}                         (section_internalization *removed*)
    """
    first = _suite(
        [_probe("delta_kl", 0.80), _probe("section_internalization", 0.70)],
        started_minute=0,
        overall=0.75,
    )
    second = _suite(
        [
            _probe("delta_kl", 0.82),
            _probe("section_internalization", 0.72),
            _probe("leakage", 0.90),
        ],
        started_minute=30,
        overall=0.81,
    )
    # Versus run-b: delta_kl drops 0.17, leakage drops 0.02.
    third = _suite(
        [_probe("delta_kl", 0.65), _probe("leakage", 0.88)],
        started_minute=60,
        overall=0.72,
    )
    return [first, second, third]
99
+
100
+
101
class TestBuildMatrix:
    """Matrix construction from run history via :func:`build_matrix`."""

    def test_union_probe_names_sorted(self) -> None:
        m = build_matrix(_three_run_history())
        assert m.probe_names == ("delta_kl", "leakage", "section_internalization")

    def test_missing_cells_are_none(self) -> None:
        m = build_matrix(_three_run_history())
        # leakage is absent from run-a; section_internalization from run-c.
        assert m.scores["leakage"][0] is None
        assert m.scores["leakage"][1] == pytest.approx(0.90)
        assert m.scores["section_internalization"][2] is None

    def test_deltas_skip_none_neighbors(self) -> None:
        m = build_matrix(_three_run_history())
        # leakage runs None → 0.90 → 0.88: no delta across the None edge,
        # then ≈ -0.02.
        assert m.deltas["leakage"][0] is None
        assert m.deltas["leakage"][1] == pytest.approx(-0.02)
        # section_internalization runs 0.70 → 0.72 → None: +0.02, then None.
        assert m.deltas["section_internalization"][0] == pytest.approx(0.02)
        assert m.deltas["section_internalization"][1] is None

    def test_composite_series_parallels_labels(self) -> None:
        m = build_matrix(_three_run_history())
        assert m.n_runs == 3
        assert m.composite_series == pytest.approx([0.75, 0.81, 0.72])

    def test_labels_default_to_timestamps(self) -> None:
        m = build_matrix(_three_run_history())
        # Without explicit labels, finished_at ISO strings are used.
        assert m.labels[0].startswith("2026-01-01T12:00")

    def test_labels_override(self) -> None:
        history = _three_run_history()
        m = build_matrix(history, labels=["v1", "v2", "v3"])
        assert m.labels == ("v1", "v2", "v3")

    def test_labels_length_mismatch_raises(self) -> None:
        history = _three_run_history()
        with pytest.raises(ValueError, match="labels length"):
            build_matrix(history, labels=["v1", "v2"])

    def test_empty_results_raises(self) -> None:
        with pytest.raises(ValueError, match="at least one run"):
            build_matrix([])

    def test_non_finite_score_coerced_to_none(self) -> None:
        run = _suite([_probe("delta_kl", float("nan"))], started_minute=0, overall=0.5)
        m = build_matrix([run])
        assert m.scores["delta_kl"] == [None]
152
+
153
+
154
class TestLatestRegressions:
    """Regression detection via :meth:`CompareMatrix.latest_regressions`."""

    def test_single_run_yields_empty(self) -> None:
        run = _suite([_probe("dk", 0.5)], started_minute=0, overall=0.5)
        m = build_matrix([run])
        assert m.latest_regressions(threshold=0.1) == []

    def test_catches_newest_regression(self) -> None:
        m = build_matrix(_three_run_history())
        regs = m.latest_regressions(threshold=0.10)
        # Only delta_kl (-0.17) clears the threshold; leakage (-0.02) does not.
        assert [probe for probe, _ in regs] == ["delta_kl"]
        assert regs[0][1] == pytest.approx(-0.17)

    def test_zero_threshold_disables(self) -> None:
        m = build_matrix(_three_run_history())
        assert m.latest_regressions(threshold=0.0) == []

    def test_sorted_by_severity(self) -> None:
        before = _suite(
            [_probe("p1", 0.9), _probe("p2", 0.9), _probe("p3", 0.9)],
            started_minute=0,
            overall=0.9,
        )
        after = _suite(
            [_probe("p1", 0.6), _probe("p2", 0.4), _probe("p3", 0.75)],
            started_minute=30,
            overall=0.6,
        )
        regs = build_matrix([before, after]).latest_regressions(threshold=0.10)
        # Drops: p2 -0.5, p1 -0.3, p3 -0.15 → severity order p2, p1, p3.
        assert [probe for probe, _ in regs] == ["p2", "p1", "p3"]
186
+
187
+
188
class TestRenderJson:
    """Shape and content of the JSON renderer output."""

    def test_round_trip(self) -> None:
        m = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        parsed = json.loads(render_json(m, regression_threshold=0.10))
        assert parsed["labels"] == ["v1", "v2", "v3"]
        assert parsed["composite_series"] == pytest.approx([0.75, 0.81, 0.72])
        assert parsed["latest_regressions"][0]["probe"] == "delta_kl"
        assert parsed["latest_regressions"][0]["delta"] == pytest.approx(-0.17)
        assert parsed["regression_threshold"] == pytest.approx(0.10)

    def test_no_regression_at_higher_threshold(self) -> None:
        m = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        # 0.50 exceeds every drop in the fixture, so nothing is flagged.
        parsed = json.loads(render_json(m, regression_threshold=0.50))
        assert parsed["latest_regressions"] == []
204
+
205
+
206
class TestRenderTerminal:
    """Smoke tests for the rich terminal renderer."""

    def test_renders_without_error(self) -> None:
        m = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        console = Console(record=True, width=160)
        render_terminal(m, console=console, regression_threshold=0.10)
        text = console.export_text()
        # Headers, a probe name, the composite row, and the regression
        # footer must all appear in the exported output.
        for needle in ("sway compare", "delta_kl", "composite", "regressions"):
            assert needle in text

    def test_suppresses_regression_block_when_none(self) -> None:
        m = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        console = Console(record=True, width=160)
        # Threshold above every drop → the regression footer is omitted.
        render_terminal(m, console=console, regression_threshold=0.90)
        assert "regressions" not in console.export_text()
224
+
225
+
226
class TestRenderMarkdownSnapshot:
    """Golden-file test pinning the markdown renderer's layout."""

    def test_markdown_snapshot(self) -> None:
        """Lock the markdown layout. Run
        ``SWAY_UPDATE_SNAPSHOTS=1 uv run pytest tests/unit/test_compare.py``
        to regenerate after an intentional format change.
        """
        m = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        actual = render_markdown(m, regression_threshold=0.10)

        snapshot = SNAPSHOT_DIR / "compare.md"
        # Write-and-skip when regenerating or when no snapshot exists yet.
        if os.environ.get("SWAY_UPDATE_SNAPSHOTS") == "1" or not snapshot.exists():
            snapshot.parent.mkdir(parents=True, exist_ok=True)
            snapshot.write_text(actual, encoding="utf-8")
            pytest.skip(
                "snapshot compare.md written — re-run without SWAY_UPDATE_SNAPSHOTS to verify"
            )
        assert actual == snapshot.read_text(encoding="utf-8"), (
            "compare.md drifted from snapshot.\n"
            "To accept the new output intentionally, run:\n"
            "    SWAY_UPDATE_SNAPSHOTS=1 uv run pytest tests/unit/test_compare.py\n"
            "and commit the updated file.\n"
        )
249
+
250
+
251
class TestCompareMatrixProperties:
    """Direct-construction checks on :class:`CompareMatrix`."""

    def test_n_runs_matches_labels(self) -> None:
        # n_runs is derived from the labels tuple, even with no probes.
        bare = CompareMatrix(labels=("a", "b"), timestamps=("", ""), probe_names=())
        assert bare.n_runs == 2