| 1 | """Tests for :mod:`dlm_sway.suite.score` + :mod:`dlm_sway.suite.report`.""" |
| 2 | |
| 3 | from __future__ import annotations |
| 4 | |
| 5 | import json |
| 6 | from datetime import timedelta |
| 7 | from typing import Literal |
| 8 | |
| 9 | import pytest |
| 10 | |
| 11 | from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict, utcnow |
| 12 | from dlm_sway.probes.base import Probe, ProbeSpec, RunContext |
| 13 | from dlm_sway.suite import report, score |
| 14 | from dlm_sway.suite.spec import SwaySpec |
| 15 | |
| 16 | |
| 17 | class _AdherenceSpec(ProbeSpec): |
| 18 | kind: Literal["__score_adherence"] = "__score_adherence" |
| 19 | |
| 20 | |
| 21 | class _AdherenceProbe(Probe): |
| 22 | kind = "__score_adherence" |
| 23 | spec_cls = _AdherenceSpec |
| 24 | category = "adherence" |
| 25 | |
| 26 | def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: |
| 27 | raise NotImplementedError # never executed; registered for category lookup |
| 28 | |
| 29 | |
| 30 | class _AttributionSpec(ProbeSpec): |
| 31 | kind: Literal["__score_attribution"] = "__score_attribution" |
| 32 | |
| 33 | |
| 34 | class _AttributionProbe(Probe): |
| 35 | kind = "__score_attribution" |
| 36 | spec_cls = _AttributionSpec |
| 37 | category = "attribution" |
| 38 | |
| 39 | def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: |
| 40 | raise NotImplementedError |
| 41 | |
| 42 | |
| 43 | def _synth_suite(*probes: ProbeResult) -> SuiteResult: |
| 44 | started = utcnow() |
| 45 | return SuiteResult( |
| 46 | spec_path="sway.yaml", |
| 47 | started_at=started, |
| 48 | finished_at=started + timedelta(seconds=1), |
| 49 | base_model_id="base", |
| 50 | adapter_id="adapter", |
| 51 | sway_version="0.1.0.dev0", |
| 52 | probes=probes, |
| 53 | ) |
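

# A minimal reference for the per-probe weighting exercised in
# TestCompute.test_per_probe_weights_override_uniform below: a sketch of the
# assumed semantics (a plain weighted mean over (weight, score) pairs), not
# the actual implementation inside dlm_sway.suite.score.
def _ref_weighted_mean(pairs: list[tuple[float, float]]) -> float:
    total = sum(w for w, _ in pairs)
    return sum(w * s for w, s in pairs) / total


# Sanity-check the reference against the hand-computed value used below:
# (3 * 1 + 1 * 0) / 4 = 0.75.
assert _ref_weighted_mean([(3.0, 1.0), (1.0, 0.0)]) == 0.75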


class TestCompute:
    def test_single_passing_probe(self) -> None:
        suite = _synth_suite(
            ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.8)
        )
        s = score.compute(suite)
        assert s.overall == pytest.approx(0.8)
        assert s.components["adherence"] == pytest.approx(0.8)
        assert s.band == "healthy"

    def test_mixed_categories_weighted(self) -> None:
        suite = _synth_suite(
            ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.9),
            ProbeResult(name="b", kind="__score_attribution", verdict=Verdict.PASS, score=0.3),
        )
        s = score.compute(suite)
        # Active categories: adherence (0.30) + attribution (0.35). Normalized.
        expected = (0.30 * 0.9 + 0.35 * 0.3) / (0.30 + 0.35)
        assert s.overall == pytest.approx(expected)
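        # In general (assumed semantics, inferred from this test and the one
        # above): overall = sum(w_c * s_c) / sum(w_c) over the categories that
        # actually produced scored probes, so the weights of inactive
        # categories drop out instead of dragging the mean toward zero.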

    def test_errors_and_skips_excluded(self) -> None:
        suite = _synth_suite(
            ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.9),
            ProbeResult(name="b", kind="__score_adherence", verdict=Verdict.SKIP, score=None),
            ProbeResult(name="c", kind="__score_adherence", verdict=Verdict.ERROR, score=None),
        )
        s = score.compute(suite)
        assert s.components["adherence"] == pytest.approx(0.9)

    def test_per_probe_weights_override_uniform(self) -> None:
        suite = _synth_suite(
            ProbeResult(
                name="a",
                kind="__score_adherence",
                verdict=Verdict.PASS,
                score=1.0,
                evidence={"weight": 3.0},
            ),
            ProbeResult(
                name="b",
                kind="__score_adherence",
                verdict=Verdict.PASS,
                score=0.0,
                evidence={"weight": 1.0},
            ),
        )
        s = score.compute(suite)
        # Weighted mean: (3·1 + 1·0) / 4 = 0.75
        assert s.components["adherence"] == pytest.approx(0.75)

    def test_failed_probe_surfaces_in_findings(self) -> None:
        suite = _synth_suite(
            ProbeResult(
                name="bad",
                kind="__score_adherence",
                verdict=Verdict.FAIL,
                score=0.1,
                message="nope",
            )
        )
        s = score.compute(suite)
        assert any("bad" in f for f in s.findings)


class TestJsonReport:
    def test_schema_fields(self) -> None:
        suite = _synth_suite(
            ProbeResult(
                name="p1",
                kind="__score_adherence",
                verdict=Verdict.PASS,
                score=0.75,
                raw=0.12,
                z_score=3.1,
            )
        )
        s = score.compute(suite)
        out = json.loads(report.to_json(suite, s))
        assert out["schema_version"] == 1
        assert out["score"]["overall"] == pytest.approx(0.75)
        assert out["probes"][0]["verdict"] == "pass"
        assert out["probes"][0]["z_score"] == pytest.approx(3.1)
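        # Pieced together, the payload looks roughly like the sketch below;
        # fields beyond the four asserted here are assumed from the Score
        # object's attributes, not checked:
        #   {"schema_version": 1,
        #    "score": {"overall": 0.75, "components": {...}, "band": "...",
        #              "findings": [...]},
        #    "probes": [{"name": "p1", "verdict": "pass", "score": 0.75,
        #                "raw": 0.12, "z_score": 3.1}]}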


class TestJunit:
    def test_counts_populated(self) -> None:
        suite = _synth_suite(
            ProbeResult(name="p1", kind="__score_adherence", verdict=Verdict.PASS, score=1.0),
            ProbeResult(name="p2", kind="__score_adherence", verdict=Verdict.FAIL, score=0.0),
            ProbeResult(
                name="p3",
                kind="__score_adherence",
                verdict=Verdict.ERROR,
                score=None,
            ),
        )
        s = score.compute(suite)
        xml = report.to_junit(suite, s)
        assert 'tests="3"' in xml
        assert 'failures="1"' in xml
        assert 'errors="1"' in xml
        assert "<failure" in xml
        assert "<error" in xml


class TestMarkdown:
    def test_contains_probe_table(self) -> None:
        suite = _synth_suite(
            ProbeResult(name="p1", kind="__score_adherence", verdict=Verdict.PASS, score=0.8)
        )
        s = score.compute(suite)
        md = report.to_markdown(suite, s)
        assert "sway report" in md
        assert "| p1 | `__score_adherence`" in md


class TestTerminal:
    def test_renders_without_error(self) -> None:
        import io

        from rich.console import Console

        suite = _synth_suite(
            ProbeResult(
                name="p1",
                kind="__score_adherence",
                verdict=Verdict.PASS,
                score=0.8,
                raw=0.12,
                z_score=3.1,
                message="looks fine",
            ),
            ProbeResult(
                name="p2",
                kind="__score_attribution",
                verdict=Verdict.FAIL,
                score=0.1,
                message="a very long message that will be truncated — " * 5,
            ),
            ProbeResult(
                name="p3",
                kind="__score_adherence",
                verdict=Verdict.SKIP,
                score=None,
            ),
        )
        s = score.compute(suite)
        buf = io.StringIO()
        console = Console(file=buf, force_terminal=False, width=120)
        report.to_terminal(suite, s, console=console)
        out = buf.getvalue()
        assert "sway report" in out
        assert "overall:" in out
        assert "p1" in out
        assert "p2" in out
        # Top findings section kicks in because p2 failed.
        assert "top findings" in out


# Keep the SwaySpec model reachable from the tests so the import isn't
# flagged as unused (the eventual CLI path calls into both).
assert SwaySpec is not None