| 1 | """Tests for :mod:`dlm_sway.suite.compare` (S11 / F5).""" |
| 2 | |
| 3 | from __future__ import annotations |
| 4 | |
| 5 | import json |
| 6 | import os |
| 7 | from datetime import UTC, datetime |
| 8 | from pathlib import Path |
| 9 | |
| 10 | import pytest |
| 11 | from rich.console import Console |
| 12 | |
| 13 | from dlm_sway.core.result import ( |
| 14 | ProbeResult, |
| 15 | SuiteResult, |
| 16 | SwayScore, |
| 17 | Verdict, |
| 18 | ) |
| 19 | from dlm_sway.suite.compare import ( |
| 20 | CompareMatrix, |
| 21 | build_matrix, |
| 22 | render_json, |
| 23 | render_markdown, |
| 24 | render_terminal, |
| 25 | ) |
| 26 | |
# Directory holding golden snapshot files (tests/snapshots), resolved
# relative to this test module's location.
SNAPSHOT_DIR = Path(__file__).parent.parent / "snapshots"
| 28 | |
| 29 | |
def _probe(name: str, score: float | None, verdict: Verdict = Verdict.PASS) -> ProbeResult:
    """Build a minimal :class:`ProbeResult` fixture named *name*.

    The probe kind is derived from the name's first underscore-delimited
    segment; names without an underscore fall back to ``"delta_kl"``.
    """
    kind = name.split("_")[0] if "_" in name else "delta_kl"
    return ProbeResult(
        name=name,
        kind=kind,
        verdict=verdict,
        score=score,
        raw=score,
        message=f"{name} score={score}",
    )
| 39 | |
| 40 | |
def _suite(
    probes: list[ProbeResult], *, started_minute: int, overall: float
) -> tuple[SuiteResult, SwayScore]:
    """Build a ``(SuiteResult, SwayScore)`` pair for one synthetic run.

    ``started_minute`` is an offset from noon in minutes, so 0 / 30 / 60
    map to 12:00 / 12:30 / 13:00 UTC; each run "lasts" thirty seconds.
    """
    extra_hours, minute = divmod(started_minute, 60)
    begin = datetime(2026, 1, 1, 12 + extra_hours, minute, 0, tzinfo=UTC)
    end = datetime(2026, 1, 1, 12 + extra_hours, minute, 30, tzinfo=UTC)
    score = SwayScore(
        overall=overall,
        components={"adherence": overall, "attribution": overall},
        band=SwayScore.band_for(overall),
    )
    suite = SuiteResult(
        spec_path="fixture.yaml",
        started_at=begin,
        finished_at=end,
        base_model_id="test/base",
        adapter_id="adapter-v1",
        sway_version="0.0.0",
        probes=tuple(probes),
    )
    return suite, score
| 64 | |
| 65 | |
def _three_run_history() -> list[tuple[SuiteResult, SwayScore]]:
    """Three synthetic runs with overlapping-but-not-identical probe sets:

    - run-a: {delta_kl, section_internalization}
    - run-b: {delta_kl, section_internalization, leakage} (leakage *added*)
    - run-c: {delta_kl, leakage} (section_internalization *removed*)
    """
    # Each spec is (probe name/score pairs, start offset in minutes, overall).
    specs: list[tuple[list[tuple[str, float]], int, float]] = [
        ([("delta_kl", 0.80), ("section_internalization", 0.70)], 0, 0.75),
        (
            [
                ("delta_kl", 0.82),
                ("section_internalization", 0.72),
                ("leakage", 0.90),
            ],
            30,
            0.81,
        ),
        # run-c: delta_kl dropped 0.17, leakage dropped 0.02.
        ([("delta_kl", 0.65), ("leakage", 0.88)], 60, 0.72),
    ]
    return [
        _suite(
            [_probe(name, value) for name, value in probes],
            started_minute=offset,
            overall=overall,
        )
        for probes, offset, overall in specs
    ]
| 99 | |
| 100 | |
class TestBuildMatrix:
    """Behavior of ``build_matrix`` over the synthetic three-run history."""

    def test_union_probe_names_sorted(self) -> None:
        # Probe names are the sorted union across every run.
        result = build_matrix(_three_run_history())
        assert result.probe_names == ("delta_kl", "leakage", "section_internalization")

    def test_missing_cells_are_none(self) -> None:
        result = build_matrix(_three_run_history())
        leakage = result.scores["leakage"]
        # leakage was absent in run-a and present from run-b onward.
        assert leakage[0] is None
        assert leakage[1] == pytest.approx(0.90)
        # section_internalization was removed in run-c.
        assert result.scores["section_internalization"][2] is None

    def test_deltas_skip_none_neighbors(self) -> None:
        result = build_matrix(_three_run_history())
        # leakage series is None -> 0.90 -> 0.88: a None neighbor yields a
        # None delta, adjacent real values yield their difference.
        leakage_deltas = result.deltas["leakage"]
        assert leakage_deltas[0] is None
        assert leakage_deltas[1] == pytest.approx(-0.02)
        # section_internalization series is 0.70 -> 0.72 -> None.
        si_deltas = result.deltas["section_internalization"]
        assert si_deltas[0] == pytest.approx(0.02)
        assert si_deltas[1] is None

    def test_composite_series_parallels_labels(self) -> None:
        result = build_matrix(_three_run_history())
        assert result.n_runs == 3
        assert result.composite_series == pytest.approx([0.75, 0.81, 0.72])

    def test_labels_default_to_timestamps(self) -> None:
        # Without an override, labels come from finished_at ISO strings.
        result = build_matrix(_three_run_history())
        assert result.labels[0].startswith("2026-01-01T12:00")

    def test_labels_override(self) -> None:
        history = _three_run_history()
        result = build_matrix(history, labels=["v1", "v2", "v3"])
        assert result.labels == ("v1", "v2", "v3")

    def test_labels_length_mismatch_raises(self) -> None:
        history = _three_run_history()
        with pytest.raises(ValueError, match="labels length"):
            build_matrix(history, labels=["v1", "v2"])

    def test_empty_results_raises(self) -> None:
        with pytest.raises(ValueError, match="at least one run"):
            build_matrix([])

    def test_non_finite_score_coerced_to_none(self) -> None:
        # NaN scores must not leak into the matrix as numbers.
        run = _suite([_probe("delta_kl", float("nan"))], started_minute=0, overall=0.5)
        result = build_matrix([run])
        assert result.scores["delta_kl"] == [None]
| 152 | |
| 153 | |
class TestLatestRegressions:
    """Behavior of ``CompareMatrix.latest_regressions``."""

    def test_single_run_yields_empty(self) -> None:
        # One run has no predecessor, so nothing can regress.
        run = _suite([_probe("dk", 0.5)], started_minute=0, overall=0.5)
        result = build_matrix([run])
        assert result.latest_regressions(threshold=0.1) == []

    def test_catches_newest_regression(self) -> None:
        result = build_matrix(_three_run_history())
        regressions = result.latest_regressions(threshold=0.10)
        # Only delta_kl (down 0.17) crosses 0.10; leakage fell just 0.02.
        assert [probe for probe, _ in regressions] == ["delta_kl"]
        assert regressions[0][1] == pytest.approx(-0.17)

    def test_zero_threshold_disables(self) -> None:
        result = build_matrix(_three_run_history())
        assert result.latest_regressions(threshold=0.0) == []

    def test_sorted_by_severity(self) -> None:
        before = _suite(
            [_probe("p1", 0.9), _probe("p2", 0.9), _probe("p3", 0.9)],
            started_minute=0,
            overall=0.9,
        )
        after = _suite(
            [_probe("p1", 0.6), _probe("p2", 0.4), _probe("p3", 0.75)],
            started_minute=30,
            overall=0.6,
        )
        regressions = build_matrix([before, after]).latest_regressions(threshold=0.10)
        # Largest drop first: p2 (-0.5), then p1 (-0.3), then p3 (-0.15).
        assert [probe for probe, _ in regressions] == ["p2", "p1", "p3"]
| 186 | |
| 187 | |
class TestRenderJson:
    """JSON rendering of the comparison matrix."""

    def test_round_trip(self) -> None:
        result = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        payload = json.loads(render_json(result, regression_threshold=0.10))
        assert payload["labels"] == ["v1", "v2", "v3"]
        assert payload["composite_series"] == pytest.approx([0.75, 0.81, 0.72])
        worst = payload["latest_regressions"][0]
        assert worst["probe"] == "delta_kl"
        assert worst["delta"] == pytest.approx(-0.17)
        assert payload["regression_threshold"] == pytest.approx(0.10)

    def test_no_regression_at_higher_threshold(self) -> None:
        # A threshold above every observed drop yields no regressions.
        result = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        payload = json.loads(render_json(result, regression_threshold=0.50))
        assert payload["latest_regressions"] == []
| 204 | |
| 205 | |
class TestRenderTerminal:
    """Smoke tests for the rich terminal renderer."""

    def test_renders_without_error(self) -> None:
        result = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        recorder = Console(record=True, width=160)
        render_terminal(result, console=recorder, regression_threshold=0.10)
        text = recorder.export_text()
        # Headers, probe names, composite row, and the regression footer
        # should all appear somewhere in the captured output.
        for needle in ("sway compare", "delta_kl", "composite", "regressions"):
            assert needle in text

    def test_suppresses_regression_block_when_none(self) -> None:
        # With no regressions past the threshold, the footer is omitted.
        result = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        recorder = Console(record=True, width=160)
        render_terminal(result, console=recorder, regression_threshold=0.90)
        assert "regressions" not in recorder.export_text()
| 224 | |
| 225 | |
class TestRenderMarkdownSnapshot:
    def test_markdown_snapshot(self) -> None:
        """Lock the markdown layout. Run
        ``SWAY_UPDATE_SNAPSHOTS=1 uv run pytest tests/unit/test_compare.py``
        to regenerate after an intentional format change.
        """
        result = build_matrix(_three_run_history(), labels=["v1", "v2", "v3"])
        rendered = render_markdown(result, regression_threshold=0.10)

        snapshot = SNAPSHOT_DIR / "compare.md"
        regenerate = os.environ.get("SWAY_UPDATE_SNAPSHOTS") == "1"
        if regenerate or not snapshot.exists():
            # (Re)write the snapshot and skip, so a freshly written
            # expectation never masquerades as a real pass.
            snapshot.parent.mkdir(parents=True, exist_ok=True)
            snapshot.write_text(rendered, encoding="utf-8")
            pytest.skip(
                "snapshot compare.md written — re-run without SWAY_UPDATE_SNAPSHOTS to verify"
            )
        assert rendered == snapshot.read_text(encoding="utf-8"), (
            "compare.md drifted from snapshot.\n"
            "To accept the new output intentionally, run:\n"
            " SWAY_UPDATE_SNAPSHOTS=1 uv run pytest tests/unit/test_compare.py\n"
            "and commit the updated file.\n"
        )
| 249 | |
| 250 | |
class TestCompareMatrixProperties:
    """Direct checks on ``CompareMatrix`` computed properties."""

    def test_n_runs_matches_labels(self) -> None:
        # n_runs is derived from the number of labels.
        bare = CompareMatrix(
            labels=("a", "b"),
            timestamps=("", ""),
            probe_names=(),
        )
        assert bare.n_runs == 2