| 1 | """Tests for :mod:`dlm_sway.suite.score` + :mod:`dlm_sway.suite.report`.""" |
| 2 | |
| 3 | from __future__ import annotations |
| 4 | |
| 5 | import json |
| 6 | from datetime import timedelta |
| 7 | from typing import Literal |
| 8 | |
| 9 | import pytest |
| 10 | |
| 11 | from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict, utcnow |
| 12 | from dlm_sway.probes.base import Probe, ProbeSpec, RunContext |
| 13 | from dlm_sway.suite import report, score |
| 14 | from dlm_sway.suite.spec import SwaySpec |
| 15 | |
| 16 | |
| 17 | class _AdherenceSpec(ProbeSpec): |
| 18 | kind: Literal["__score_adherence"] = "__score_adherence" |
| 19 | |
| 20 | |
| 21 | class _AdherenceProbe(Probe): |
| 22 | kind = "__score_adherence" |
| 23 | spec_cls = _AdherenceSpec |
| 24 | category = "adherence" |
| 25 | |
| 26 | def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: |
| 27 | raise NotImplementedError # never executed; registered for category lookup |
| 28 | |
| 29 | |
| 30 | class _AttributionSpec(ProbeSpec): |
| 31 | kind: Literal["__score_attribution"] = "__score_attribution" |
| 32 | |
| 33 | |
| 34 | class _AttributionProbe(Probe): |
| 35 | kind = "__score_attribution" |
| 36 | spec_cls = _AttributionSpec |
| 37 | category = "attribution" |
| 38 | |
| 39 | def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult: |
| 40 | raise NotImplementedError |
| 41 | |
| 42 | |
| 43 | def _synth_suite(*probes: ProbeResult) -> SuiteResult: |
| 44 | started = utcnow() |
| 45 | return SuiteResult( |
| 46 | spec_path="sway.yaml", |
| 47 | started_at=started, |
| 48 | finished_at=started + timedelta(seconds=1), |
| 49 | base_model_id="base", |
| 50 | adapter_id="adapter", |
| 51 | sway_version="0.1.0.dev0", |
| 52 | probes=probes, |
| 53 | ) |
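

# A minimal reference for the per-probe weighting exercised in
# TestCompute.test_per_probe_weights_override_uniform below: a sketch of the
# assumed semantics (a plain weighted mean over (weight, score) pairs), not
# the actual implementation inside dlm_sway.suite.score.
def _ref_weighted_mean(pairs: list[tuple[float, float]]) -> float:
    total = sum(w for w, _ in pairs)
    return sum(w * s for w, s in pairs) / total


# Sanity-check the reference against the hand-computed value used below:
# (3 * 1 + 1 * 0) / 4 = 0.75.
assert _ref_weighted_mean([(3.0, 1.0), (1.0, 0.0)]) == 0.75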


class TestCompute:
    def test_single_passing_probe(self) -> None:
        suite = _synth_suite(
            ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.8)
        )
        s = score.compute(suite)
        assert s.overall == pytest.approx(0.8)
        assert s.components["adherence"] == pytest.approx(0.8)
        assert s.band == "healthy"

    def test_mixed_categories_weighted(self) -> None:
        suite = _synth_suite(
            ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.9),
            ProbeResult(name="b", kind="__score_attribution", verdict=Verdict.PASS, score=0.3),
        )
        s = score.compute(suite)
        # Active categories: adherence (0.30) + attribution (0.35). Normalized.
        expected = (0.30 * 0.9 + 0.35 * 0.3) / (0.30 + 0.35)
        assert s.overall == pytest.approx(expected)
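        # In general (assumed semantics, inferred from this test and the one
        # above): overall = sum(w_c * s_c) / sum(w_c) over the categories that
        # actually produced scored probes, so the weights of inactive
        # categories drop out instead of dragging the mean toward zero.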

    def test_errors_and_skips_excluded(self) -> None:
        suite = _synth_suite(
            ProbeResult(name="a", kind="__score_adherence", verdict=Verdict.PASS, score=0.9),
            ProbeResult(name="b", kind="__score_adherence", verdict=Verdict.SKIP, score=None),
            ProbeResult(name="c", kind="__score_adherence", verdict=Verdict.ERROR, score=None),
        )
        s = score.compute(suite)
        assert s.components["adherence"] == pytest.approx(0.9)

    def test_per_probe_weights_override_uniform(self) -> None:
        suite = _synth_suite(
            ProbeResult(
                name="a",
                kind="__score_adherence",
                verdict=Verdict.PASS,
                score=1.0,
                evidence={"weight": 3.0},
            ),
            ProbeResult(
                name="b",
                kind="__score_adherence",
                verdict=Verdict.PASS,
                score=0.0,
                evidence={"weight": 1.0},
            ),
        )
        s = score.compute(suite)
        # Weighted mean: (3·1 + 1·0) / 4 = 0.75
        assert s.components["adherence"] == pytest.approx(0.75)

    def test_failed_probe_surfaces_in_findings(self) -> None:
        suite = _synth_suite(
            ProbeResult(
                name="bad",
                kind="__score_adherence",
                verdict=Verdict.FAIL,
                score=0.1,
                message="nope",
            )
        )
        s = score.compute(suite)
        assert any("bad" in f for f in s.findings)


class TestJsonReport:
    def test_schema_fields(self) -> None:
        suite = _synth_suite(
            ProbeResult(
                name="p1",
                kind="__score_adherence",
                verdict=Verdict.PASS,
                score=0.75,
                raw=0.12,
                z_score=3.1,
            )
        )
        s = score.compute(suite)
        out = json.loads(report.to_json(suite, s))
        assert out["schema_version"] == 1
        assert out["score"]["overall"] == pytest.approx(0.75)
        assert out["probes"][0]["verdict"] == "pass"
        assert out["probes"][0]["z_score"] == pytest.approx(3.1)
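        # Pieced together, the payload looks roughly like the sketch below;
        # fields beyond the four asserted here are assumed from the Score
        # object's attributes, not checked:
        #   {"schema_version": 1,
        #    "score": {"overall": 0.75, "components": {...}, "band": "...",
        #              "findings": [...]},
        #    "probes": [{"name": "p1", "verdict": "pass", "score": 0.75,
        #                "raw": 0.12, "z_score": 3.1}]}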


class TestJunit:
    def test_counts_populated(self) -> None:
        suite = _synth_suite(
            ProbeResult(name="p1", kind="__score_adherence", verdict=Verdict.PASS, score=1.0),
            ProbeResult(name="p2", kind="__score_adherence", verdict=Verdict.FAIL, score=0.0),
            ProbeResult(
                name="p3",
                kind="__score_adherence",
                verdict=Verdict.ERROR,
                score=None,
            ),
        )
        s = score.compute(suite)
        xml = report.to_junit(suite, s)
        assert 'tests="3"' in xml
        assert 'failures="1"' in xml
        assert 'errors="1"' in xml
        assert "<failure" in xml
        assert "<error" in xml


class TestMarkdown:
    def test_contains_probe_table(self) -> None:
        suite = _synth_suite(
            ProbeResult(name="p1", kind="__score_adherence", verdict=Verdict.PASS, score=0.8)
        )
        s = score.compute(suite)
        md = report.to_markdown(suite, s)
        assert "sway report" in md
        assert "| p1 | `__score_adherence`" in md


class TestTerminal:
    def test_renders_without_error(self) -> None:
        import io

        from rich.console import Console

        suite = _synth_suite(
            ProbeResult(
                name="p1",
                kind="__score_adherence",
                verdict=Verdict.PASS,
                score=0.8,
                raw=0.12,
                z_score=3.1,
                message="looks fine",
            ),
            ProbeResult(
                name="p2",
                kind="__score_attribution",
                verdict=Verdict.FAIL,
                score=0.1,
                message="a very long message that will be truncated — " * 5,
            ),
            ProbeResult(
                name="p3",
                kind="__score_adherence",
                verdict=Verdict.SKIP,
                score=None,
            ),
        )
        s = score.compute(suite)
        buf = io.StringIO()
        console = Console(file=buf, force_terminal=False, width=120)
        report.to_terminal(suite, s, console=console)
        out = buf.getvalue()
        assert "sway report" in out
        assert "overall:" in out
        assert "p1" in out
        assert "p2" in out
        # Top findings section kicks in because p2 failed.
        assert "top findings" in out


# Keep the SwaySpec model reachable from the tests so the import isn't
# flagged as unused (the eventual CLI path calls into both).
assert SwaySpec is not None