Python · 7076 bytes Raw Blame History
1 """Tests for :mod:`dlm_sway.suite.score` + :mod:`dlm_sway.suite.report`."""
2
3 from __future__ import annotations
4
5 import json
6 from datetime import timedelta
7 from typing import Literal
8
9 import pytest
10
11 from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict, utcnow
12 from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
13 from dlm_sway.suite import report, score
14 from dlm_sway.suite.spec import SwaySpec
15
16
class _AdherenceSpec(ProbeSpec):
    """Minimal spec paired with the stub adherence probe in this test module."""

    # Kind tag pinned to one literal value; ``_AdherenceProbe.kind`` uses the
    # same string.
    kind: Literal["__score_adherence"] = "__score_adherence"
19
20
class _AdherenceProbe(Probe):
    """Stub probe that ties the ``"__score_adherence"`` kind to ``"adherence"``.

    It is declared purely so the kind can be resolved to a category; its
    ``run`` method is never meant to be called.
    """

    category = "adherence"
    kind = "__score_adherence"
    spec_cls = _AdherenceSpec

    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
        """Always raise ``NotImplementedError``.

        The probe is never executed; it is registered for category lookup
        only.
        """
        raise NotImplementedError
28
29
class _AttributionSpec(ProbeSpec):
    """Minimal spec paired with the stub attribution probe in this test module."""

    # Kind tag pinned to one literal value; ``_AttributionProbe.kind`` uses the
    # same string.
    kind: Literal["__score_attribution"] = "__score_attribution"
32
33
class _AttributionProbe(Probe):
    """Stub probe that ties the ``"__score_attribution"`` kind to ``"attribution"``.

    Nothing in this module calls ``run``; the class only carries the
    ``kind`` / ``spec_cls`` / ``category`` attributes.
    """

    category = "attribution"
    kind = "__score_attribution"
    spec_cls = _AttributionSpec

    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
        """Always raise ``NotImplementedError``; the stub has no behaviour."""
        raise NotImplementedError
41
42
def _synth_suite(*probes: ProbeResult) -> SuiteResult:
    """Wrap *probes* in a ``SuiteResult`` filled with placeholder metadata.

    The suite is stamped as starting at ``utcnow()`` and finishing exactly
    one second later; the spec path, model ids, and version are fixed
    strings.

    Args:
        *probes: Probe results to attach, passed through as a tuple.

    Returns:
        The assembled ``SuiteResult``.
    """
    begin = utcnow()
    end = begin + timedelta(seconds=1)
    return SuiteResult(
        spec_path="sway.yaml",
        started_at=begin,
        finished_at=end,
        base_model_id="base",
        adapter_id="adapter",
        sway_version="0.1.0.dev0",
        probes=probes,
    )
54
55
class TestCompute:
    """Checks on ``score.compute`` over small synthetic suites."""

    def test_single_passing_probe(self) -> None:
        """A lone passing adherence probe at 0.8 yields 0.8 overall and a healthy band."""
        only = ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.8)
        computed = score.compute(_synth_suite(only))
        assert computed.overall == pytest.approx(0.8)
        assert computed.components["adherence"] == pytest.approx(0.8)
        assert computed.band == "healthy"

    def test_mixed_categories_weighted(self) -> None:
        """Two categories combine as a weight-normalized mean of their scores."""
        adherence = ProbeResult(
            name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.9
        )
        attribution = ProbeResult(
            name="b", kind="__score_attribution", verdict=Verdict.PASS, score=0.3
        )
        computed = score.compute(_synth_suite(adherence, attribution))
        # Active categories: adherence (0.30) + attribution (0.35). Normalized.
        weighted_sum = 0.30 * 0.9 + 0.35 * 0.3
        weight_total = 0.30 + 0.35
        assert computed.overall == pytest.approx(weighted_sum / weight_total)

    def test_errors_and_skips_excluded(self) -> None:
        """SKIP and ERROR results (score ``None``) leave the component at the passing score."""
        results = [
            ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.9),
            ProbeResult(name="b", kind="__score_adherence", verdict=Verdict.SKIP, score=None),
            ProbeResult(name="c", kind="__score_adherence", verdict=Verdict.ERROR, score=None),
        ]
        computed = score.compute(_synth_suite(*results))
        assert computed.components["adherence"] == pytest.approx(0.9)

    def test_per_probe_weights_override_uniform(self) -> None:
        """``evidence["weight"]`` values drive a weighted mean within a category."""
        heavy = ProbeResult(
            name="a",
            kind="__score_adherence",
            verdict=Verdict.PASS,
            score=1.0,
            evidence={"weight": 3.0},
        )
        light = ProbeResult(
            name="b",
            kind="__score_adherence",
            verdict=Verdict.PASS,
            score=0.0,
            evidence={"weight": 1.0},
        )
        computed = score.compute(_synth_suite(heavy, light))
        # Weighted mean: (3·1 + 1·0) / 4 = 0.75
        assert computed.components["adherence"] == pytest.approx(0.75)

    def test_failed_probe_surfaces_in_findings(self) -> None:
        """A failing probe's name shows up in at least one finding."""
        failing = ProbeResult(
            name="bad",
            kind="__score_adherence",
            verdict=Verdict.FAIL,
            score=0.1,
            message="nope",
        )
        computed = score.compute(_synth_suite(failing))
        assert any("bad" in finding for finding in computed.findings)
118
119
class TestJsonReport:
    """Checks on the JSON document produced by ``report.to_json``."""

    def test_schema_fields(self) -> None:
        """The decoded JSON carries the schema version, overall score, and per-probe fields."""
        probe = ProbeResult(
            name="p1",
            kind="__score_adherence",
            verdict=Verdict.PASS,
            score=0.75,
            raw=0.12,
            z_score=3.1,
        )
        suite = _synth_suite(probe)
        computed = score.compute(suite)
        payload = json.loads(report.to_json(suite, computed))
        assert payload["schema_version"] == 1
        assert payload["score"]["overall"] == pytest.approx(0.75)
        # Looked up only after the top-level checks, keeping the original
        # evaluation order.
        first_probe = payload["probes"][0]
        assert first_probe["verdict"] == "pass"
        assert first_probe["z_score"] == pytest.approx(3.1)
138
139
class TestJunit:
    """Checks on the JUnit XML text produced by ``report.to_junit``."""

    def test_counts_populated(self) -> None:
        """One PASS, one FAIL, and one ERROR show up in the counts and as elements."""
        passed = ProbeResult(name="p1", kind="__score_adherence", verdict=Verdict.PASS, score=1.0)
        failed = ProbeResult(name="p2", kind="__score_adherence", verdict=Verdict.FAIL, score=0.0)
        errored = ProbeResult(
            name="p3",
            kind="__score_adherence",
            verdict=Verdict.ERROR,
            score=None,
        )
        suite = _synth_suite(passed, failed, errored)
        xml_text = report.to_junit(suite, score.compute(suite))
        # Checked in this order: count attributes first, then the child tags.
        expected_fragments = (
            'tests="3"',
            'failures="1"',
            'errors="1"',
            "<failure",
            "<error",
        )
        for fragment in expected_fragments:
            assert fragment in xml_text
159
160
class TestMarkdown:
    """Checks on the Markdown text produced by ``report.to_markdown``."""

    def test_contains_probe_table(self) -> None:
        """The output has the report title and a table row for the probe."""
        probe = ProbeResult(name="p1", kind="__score_adherence", verdict=Verdict.PASS, score=0.8)
        suite = _synth_suite(probe)
        rendered = report.to_markdown(suite, score.compute(suite))
        assert "sway report" in rendered
        assert "| p1 | `__score_adherence`" in rendered
170
171
class TestTerminal:
    """Checks on the console rendering done by ``report.to_terminal``."""

    def test_renders_without_error(self) -> None:
        """Rendering a PASS/FAIL/SKIP mix into a buffer yields the expected sections."""
        import io

        from rich.console import Console

        passing = ProbeResult(
            name="p1",
            kind="__score_adherence",
            verdict=Verdict.PASS,
            score=0.8,
            raw=0.12,
            z_score=3.1,
            message="looks fine",
        )
        failing = ProbeResult(
            name="p2",
            kind="__score_attribution",
            verdict=Verdict.FAIL,
            score=0.1,
            message="a very long message that will be truncated — " * 5,
        )
        skipped = ProbeResult(
            name="p3",
            kind="__score_adherence",
            verdict=Verdict.SKIP,
            score=None,
        )
        suite = _synth_suite(passing, failing, skipped)
        computed = score.compute(suite)

        # Render into an in-memory, non-terminal console of fixed width.
        sink = io.StringIO()
        console = Console(file=sink, force_terminal=False, width=120)
        report.to_terminal(suite, computed, console=console)
        rendered = sink.getvalue()

        # The last fragment is the top findings section, which kicks in
        # because p2 failed.
        for fragment in ("sway report", "overall:", "p1", "p2", "top findings"):
            assert fragment in rendered
213
214
# Reference SwaySpec at module level so its import above is actually used and
# the model stays reachable from tests (keeps mypy happy on the eventual CLI
# path that calls into both).
assert SwaySpec is not None