Python · 8729 bytes Raw Blame History
1 """Unit tests for `dlm.harvest.sway_reader` (Sprint 33.2)."""
2
3 from __future__ import annotations
4
5 import json
6 from pathlib import Path
7
8 import pytest
9
10 from dlm.harvest import (
11 HarvestCandidate,
12 MalformedSwayReportError,
13 NoReferenceError,
14 read_sway_report,
15 )
16
17
def _write(tmp_path: Path, payload: object) -> Path:
    """Write *payload* as JSON to ``tmp_path / "sway.json"`` and return the path.

    ``payload`` is annotated as ``object`` rather than ``dict`` because any
    JSON-serializable value is accepted and dumped verbatim with
    ``json.dumps``; the file is always written as UTF-8.
    """
    report = tmp_path / "sway.json"
    report.write_text(json.dumps(payload), encoding="utf-8")
    return report
22
23
# Failing probe whose evidence carries a prompt/reference pair plus an
# explicit confidence value.
_PROBE_FAIL_WITH_REF = {
    "name": "fortran_subroutine_semantics",
    "kind": "section_internalization",
    "verdict": "fail",
    "score": 0.22,
    "raw": 0.22,
    "z_score": -1.7,
    "evidence": {
        "prompt": "What does SUBROUTINE DGEMM compute?",
        "reference": "A double-precision general matrix multiplication.",
        "confidence": 0.9,
    },
    "message": "adapter failed semantic recall",
    "duration_s": 0.4,
}
# Failing probe whose evidence has neither a "prompt" nor a "reference" key.
_PROBE_FAIL_NO_REF = {
    "name": "docstring_recall",
    "kind": "prompt_collapse",
    "verdict": "fail",
    "score": 0.1,
    "evidence": {"per_section_scores": [0.1, 0.15]},
    "message": "probe has no Q/A pair to harvest",
    "duration_s": 0.2,
}
# Passing probe.
_PROBE_PASS = {
    "name": "calibration",
    "kind": "calibration_drift",
    "verdict": "pass",
    "score": 0.95,
    "evidence": {"delta": 0.01},
    "message": "calibration healthy",
    "duration_s": 0.1,
}
57
58
def _full_report(probes: list[dict]) -> dict:
    """Return a complete report envelope (``schema_version`` 1) around *probes*.

    Every envelope field is a fixed fixture value. ``probes`` is stored under
    the ``"probes"`` key as-is (the list is not copied). A fresh dict is
    built on each call, so callers may delete or overwrite keys on the result.
    """
    return {
        "schema_version": 1,
        "sway_version": "0.1.0.dev0",
        "base_model_id": "smollm2-135m",
        "adapter_id": "run_7",
        "started_at": "2026-04-21T00:00:00Z",
        "finished_at": "2026-04-21T00:05:00Z",
        "wall_seconds": 300.0,
        "probes": probes,
    }
70
71
class TestHappyPath:
    """Well-formed reports: candidate extraction, filtering, confidence fallbacks."""

    def test_single_failing_probe_lifts_cleanly(self, tmp_path: Path) -> None:
        """One failing probe with a prompt/reference pair yields one candidate.

        The passing probe in the same report must not contribute a candidate.
        """
        report = _write(tmp_path, _full_report([_PROBE_FAIL_WITH_REF, _PROBE_PASS]))
        candidates = read_sway_report(report)

        assert len(candidates) == 1
        c = candidates[0]
        assert isinstance(c, HarvestCandidate)
        assert c.prompt == "What does SUBROUTINE DGEMM compute?"
        assert c.reference == "A double-precision general matrix multiplication."
        assert c.confidence == pytest.approx(0.9)
        assert c.probe_name == "fortran_subroutine_semantics"
        assert c.probe_kind == "section_internalization"
        # The report-level `adapter_id` is expected on the candidate.
        assert c.source_adapter_version == "run_7"

    def test_empty_probes_list_yields_empty(self, tmp_path: Path) -> None:
        """A report with an empty `probes` list yields no candidates."""
        report = _write(tmp_path, _full_report([]))
        assert read_sway_report(report) == []

    def test_all_passing_yields_empty(self, tmp_path: Path) -> None:
        """A report containing only passing probes yields no candidates."""
        report = _write(tmp_path, _full_report([_PROBE_PASS, _PROBE_PASS]))
        assert read_sway_report(report) == []

    def test_missing_adapter_id_leaves_source_version_none(self, tmp_path: Path) -> None:
        """Without `adapter_id`, the candidate's source version is None."""
        payload = _full_report([_PROBE_FAIL_WITH_REF])
        del payload["adapter_id"]
        report = _write(tmp_path, payload)
        candidates = read_sway_report(report)
        assert len(candidates) == 1
        assert candidates[0].source_adapter_version is None

    def test_min_confidence_filters(self, tmp_path: Path) -> None:
        """`min_confidence` drops candidates below the threshold, keeps those above."""
        # Shallow copy, then rebind "evidence" so the shared fixture is untouched.
        low_conf = {**_PROBE_FAIL_WITH_REF}
        low_conf["evidence"] = {
            "prompt": "q?",
            "reference": "a.",
            "confidence": 0.5,
        }
        report = _write(tmp_path, _full_report([low_conf]))
        assert read_sway_report(report, min_confidence=0.8) == []
        assert len(read_sway_report(report, min_confidence=0.4)) == 1

    def test_missing_confidence_defaults_to_one(self, tmp_path: Path) -> None:
        """Evidence without a `confidence` key yields confidence 1.0."""
        no_conf = {**_PROBE_FAIL_WITH_REF}
        no_conf["evidence"] = {"prompt": "q?", "reference": "a."}
        report = _write(tmp_path, _full_report([no_conf]))
        candidates = read_sway_report(report)
        assert len(candidates) == 1
        assert candidates[0].confidence == 1.0

    def test_invalid_confidence_defaults_to_one(self, tmp_path: Path) -> None:
        """A non-numeric `confidence` value also yields confidence 1.0."""
        broken_conf = {**_PROBE_FAIL_WITH_REF}
        broken_conf["evidence"] = {
            "prompt": "q?",
            "reference": "a.",
            "confidence": {"not": "numeric"},
        }
        report = _write(tmp_path, _full_report([broken_conf]))
        candidates = read_sway_report(report)
        assert len(candidates) == 1
        assert candidates[0].confidence == 1.0
133
134
class TestMissingReference:
    """Failing probes with no usable reference: strict raises, lax skips."""

    def test_strict_raises(self, tmp_path: Path) -> None:
        """With `strict=True`, a failing probe lacking a reference raises."""
        report = _write(tmp_path, _full_report([_PROBE_FAIL_NO_REF]))
        with pytest.raises(NoReferenceError):
            read_sway_report(report, strict=True)

    def test_lax_skips_with_log(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
        """With `strict=False`, the reference-less probe is skipped and logged."""
        report = _write(
            tmp_path,
            _full_report([_PROBE_FAIL_NO_REF, _PROBE_FAIL_WITH_REF]),
        )
        with caplog.at_level("WARNING"):
            candidates = read_sway_report(report, strict=False)
        assert len(candidates) == 1
        assert candidates[0].probe_name == "fortran_subroutine_semantics"
        # `caplog.messages` holds the interpolated text of each captured record,
        # so the check does not depend on `LogRecord.message` having been set.
        assert any("carries no reference" in msg for msg in caplog.messages)

    def test_empty_reference_string_refused(self, tmp_path: Path) -> None:
        """A whitespace-only reference string is refused under `strict=True`."""
        empty = {**_PROBE_FAIL_WITH_REF}
        empty["evidence"] = {"prompt": "q?", "reference": " "}
        report = _write(tmp_path, _full_report([empty]))
        with pytest.raises(NoReferenceError):
            read_sway_report(report, strict=True)
158
159
class TestMalformed:
    """Unreadable, non-JSON, or structurally invalid reports and probes."""

    def test_file_missing(self, tmp_path: Path) -> None:
        """A nonexistent path raises with a "cannot read" message."""
        with pytest.raises(MalformedSwayReportError, match="cannot read"):
            read_sway_report(tmp_path / "does-not-exist.json")

    def test_not_json(self, tmp_path: Path) -> None:
        """A file that is not parseable JSON raises "not valid JSON"."""
        bad = tmp_path / "bad.json"
        bad.write_text("this is not json {", encoding="utf-8")
        with pytest.raises(MalformedSwayReportError, match="not valid JSON"):
            read_sway_report(bad)

    def test_top_level_array_rejected(self, tmp_path: Path) -> None:
        """A top-level JSON array is rejected; the report must be an object."""
        report = _write(tmp_path, [])
        with pytest.raises(MalformedSwayReportError, match="must be a JSON object"):
            read_sway_report(report)

    def test_missing_schema_version(self, tmp_path: Path) -> None:
        """A report without `schema_version` raises, naming that field."""
        report = _write(tmp_path, {"sway_version": "0.1", "probes": []})
        with pytest.raises(MalformedSwayReportError, match="schema_version"):
            read_sway_report(report)

    def test_newer_schema_refused(self, tmp_path: Path) -> None:
        """A `schema_version` of 99 is refused as newer than the reader."""
        payload = _full_report([])
        payload["schema_version"] = 99
        report = _write(tmp_path, payload)
        with pytest.raises(MalformedSwayReportError, match="newer than this reader"):
            read_sway_report(report)

    def test_supported_schema_pinned_to_current_sway_version(self) -> None:
        """Cross-repo bump-gate: `_SUPPORTED_SWAY_SCHEMA` must match sway's.

        Sway's shipping schema is v1. If a sway bump lands but we forget
        to bump this constant (or vice versa), any operator harvesting
        newer sway output hits `test_newer_schema_refused`'s refusal
        path. This test pins the constant itself so a casual edit to
        the reader can't silently drop the support floor — the golden
        fails and forces a code review.
        """
        # Imported locally: the constant is private to the reader module and
        # is not part of the `dlm.harvest` names imported at module top.
        from dlm.harvest.sway_reader import _SUPPORTED_SWAY_SCHEMA

        assert _SUPPORTED_SWAY_SCHEMA == 1, (
            "sway schema pin changed — verify `sway/src/dlm_sway/suite/report.py` "
            "still emits this version and bump the docs pointer in `dlm.lock`"
        )

    def test_missing_probes_array(self, tmp_path: Path) -> None:
        """A report without a `probes` key raises, naming the `probes` array."""
        report = _write(tmp_path, {"schema_version": 1, "sway_version": "0.1"})
        with pytest.raises(MalformedSwayReportError, match="`probes` array"):
            read_sway_report(report)

    def test_probe_evidence_not_object(self, tmp_path: Path) -> None:
        """Non-object `evidence` on a failing probe raises `NoReferenceError`.

        The call passes no `strict` argument, so this exercises the reader's
        default mode.
        """
        broken = {**_PROBE_FAIL_WITH_REF}
        broken["evidence"] = "not-an-object"
        report = _write(tmp_path, _full_report([broken]))
        with pytest.raises(NoReferenceError, match="evidence is not an object"):
            read_sway_report(report)

    def test_probe_is_not_object(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
        """A non-object probe entry is skipped with a warning, not an error."""
        report = _write(
            tmp_path,
            _full_report([_PROBE_FAIL_WITH_REF, "garbage"]),  # type: ignore[list-item]
        )
        with caplog.at_level("WARNING"):
            candidates = read_sway_report(report)
        assert len(candidates) == 1
        # `caplog.messages` holds the interpolated text of each captured record,
        # so the check does not depend on `LogRecord.message` having been set.
        assert any("not an object" in msg for msg in caplog.messages)