| 1 |
"""Unit tests for `dlm.harvest.sway_reader` (Sprint 33.2).""" |
| 2 |
|
| 3 |
from __future__ import annotations |
| 4 |
|
| 5 |
import json |
| 6 |
from pathlib import Path |
| 7 |
|
| 8 |
import pytest |
| 9 |
|
| 10 |
from dlm.harvest import ( |
| 11 |
HarvestCandidate, |
| 12 |
MalformedSwayReportError, |
| 13 |
NoReferenceError, |
| 14 |
read_sway_report, |
| 15 |
) |
| 16 |
|
| 17 |
|
| 18 |
def _write(tmp_path: Path, payload: object) -> Path:
    """Dump *payload* as JSON to ``sway.json`` inside *tmp_path*.

    Returns the path of the file that was written so callers can hand it
    straight to the reader under test.
    """
    destination = tmp_path.joinpath("sway.json")
    serialized = json.dumps(payload)
    destination.write_text(serialized, encoding="utf-8")
    return destination
| 22 |
|
| 23 |
|
| 24 |
# Failing probe whose evidence carries a prompt/reference pair together with
# an explicit confidence value.
_PROBE_FAIL_WITH_REF = {
    "name": "fortran_subroutine_semantics",
    "kind": "section_internalization",
    "verdict": "fail",
    "score": 0.22,
    "raw": 0.22,
    "z_score": -1.7,
    "evidence": {
        "prompt": "What does SUBROUTINE DGEMM compute?",
        "reference": "A double-precision general matrix multiplication.",
        "confidence": 0.9,
    },
    "message": "adapter failed semantic recall",
    "duration_s": 0.4,
}

# Failing probe whose evidence holds only per-section scores: there is no
# "prompt" and no "reference" key.
_PROBE_FAIL_NO_REF = {
    "name": "docstring_recall",
    "kind": "prompt_collapse",
    "verdict": "fail",
    "score": 0.1,
    "evidence": {
        "per_section_scores": [0.1, 0.15],
    },
    "message": "probe has no Q/A pair to harvest",
    "duration_s": 0.2,
}

# Passing probe, used where the tests need a probe with verdict "pass".
_PROBE_PASS = {
    "name": "calibration",
    "kind": "calibration_drift",
    "verdict": "pass",
    "score": 0.95,
    "evidence": {
        "delta": 0.01,
    },
    "message": "calibration healthy",
    "duration_s": 0.1,
}
| 57 |
|
| 58 |
|
| 59 |
def _full_report(probes: list[dict]) -> dict:
    """Wrap *probes* in a complete report envelope.

    The envelope carries fixed metadata (schema version 1, sway version,
    model/adapter ids, timestamps, wall time); *probes* is stored as-is
    under the ``"probes"`` key.
    """
    envelope: dict = {
        "schema_version": 1,
        "sway_version": "0.1.0.dev0",
        "base_model_id": "smollm2-135m",
        "adapter_id": "run_7",
        "started_at": "2026-04-21T00:00:00Z",
        "finished_at": "2026-04-21T00:05:00Z",
        "wall_seconds": 300.0,
    }
    # Added last so the key order matches the literal form of the report.
    envelope["probes"] = probes
    return envelope
| 70 |
|
| 71 |
|
| 72 |
class TestHappyPath:
    """Well-formed reports: which probes are lifted and how their fields map."""

    def test_single_failing_probe_lifts_cleanly(self, tmp_path: Path) -> None:
        """One failing probe with a reference plus one passing probe -> one candidate."""
        payload = _full_report([_PROBE_FAIL_WITH_REF, _PROBE_PASS])
        harvested = read_sway_report(_write(tmp_path, payload))

        assert len(harvested) == 1
        candidate = harvested[0]
        assert isinstance(candidate, HarvestCandidate)
        assert candidate.prompt == "What does SUBROUTINE DGEMM compute?"
        assert candidate.reference == "A double-precision general matrix multiplication."
        assert candidate.confidence == pytest.approx(0.9)
        assert candidate.probe_name == "fortran_subroutine_semantics"
        assert candidate.probe_kind == "section_internalization"
        assert candidate.source_adapter_version == "run_7"

    def test_empty_probes_list_yields_empty(self, tmp_path: Path) -> None:
        """A report with no probes produces no candidates."""
        report_path = _write(tmp_path, _full_report([]))
        assert read_sway_report(report_path) == []

    def test_all_passing_yields_empty(self, tmp_path: Path) -> None:
        """Passing probes are never lifted."""
        report_path = _write(tmp_path, _full_report([_PROBE_PASS, _PROBE_PASS]))
        assert read_sway_report(report_path) == []

    def test_missing_adapter_id_leaves_source_version_none(self, tmp_path: Path) -> None:
        """Without ``adapter_id`` the candidate's source version is ``None``."""
        payload = _full_report([_PROBE_FAIL_WITH_REF])
        del payload["adapter_id"]

        harvested = read_sway_report(_write(tmp_path, payload))

        assert len(harvested) == 1
        assert harvested[0].source_adapter_version is None

    def test_min_confidence_filters(self, tmp_path: Path) -> None:
        """``min_confidence`` drops candidates below the threshold and keeps those above."""
        low_conf = {
            **_PROBE_FAIL_WITH_REF,
            "evidence": {
                "prompt": "q?",
                "reference": "a.",
                "confidence": 0.5,
            },
        }
        report_path = _write(tmp_path, _full_report([low_conf]))

        assert read_sway_report(report_path, min_confidence=0.8) == []
        assert len(read_sway_report(report_path, min_confidence=0.4)) == 1

    def test_missing_confidence_defaults_to_one(self, tmp_path: Path) -> None:
        """Evidence without a ``confidence`` key yields confidence 1.0."""
        no_conf = {
            **_PROBE_FAIL_WITH_REF,
            "evidence": {"prompt": "q?", "reference": "a."},
        }

        harvested = read_sway_report(_write(tmp_path, _full_report([no_conf])))

        assert len(harvested) == 1
        assert harvested[0].confidence == 1.0

    def test_invalid_confidence_defaults_to_one(self, tmp_path: Path) -> None:
        """A non-numeric ``confidence`` value also yields confidence 1.0."""
        broken_conf = {
            **_PROBE_FAIL_WITH_REF,
            "evidence": {
                "prompt": "q?",
                "reference": "a.",
                "confidence": {"not": "numeric"},
            },
        }

        harvested = read_sway_report(_write(tmp_path, _full_report([broken_conf])))

        assert len(harvested) == 1
        assert harvested[0].confidence == 1.0
| 133 |
|
| 134 |
|
| 135 |
class TestMissingReference:
    """Failing probes that carry no usable reference text."""

    def test_strict_raises(self, tmp_path: Path) -> None:
        """In strict mode a failing probe without a reference raises."""
        report_path = _write(tmp_path, _full_report([_PROBE_FAIL_NO_REF]))
        with pytest.raises(NoReferenceError):
            read_sway_report(report_path, strict=True)

    def test_lax_skips_with_log(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
        """In lax mode the reference-less probe is skipped and a warning is logged."""
        payload = _full_report([_PROBE_FAIL_NO_REF, _PROBE_FAIL_WITH_REF])
        report_path = _write(tmp_path, payload)

        with caplog.at_level("WARNING"):
            harvested = read_sway_report(report_path, strict=False)

        assert len(harvested) == 1
        assert harvested[0].probe_name == "fortran_subroutine_semantics"
        logged = [rec.message for rec in caplog.records]
        assert any("carries no reference" in text for text in logged)

    def test_empty_reference_string_refused(self, tmp_path: Path) -> None:
        """A whitespace-only reference is refused like a missing one."""
        blank_ref = {
            **_PROBE_FAIL_WITH_REF,
            "evidence": {"prompt": "q?", "reference": " "},
        }
        report_path = _write(tmp_path, _full_report([blank_ref]))
        with pytest.raises(NoReferenceError):
            read_sway_report(report_path, strict=True)
| 158 |
|
| 159 |
|
| 160 |
class TestMalformed:
    """Unreadable, non-JSON, or structurally invalid reports."""

    def test_file_missing(self, tmp_path: Path) -> None:
        """A path that does not exist is reported as a malformed report."""
        absent = tmp_path / "does-not-exist.json"
        with pytest.raises(MalformedSwayReportError, match="cannot read"):
            read_sway_report(absent)

    def test_not_json(self, tmp_path: Path) -> None:
        """File contents that are not JSON are rejected."""
        bad = tmp_path / "bad.json"
        bad.write_text("this is not json {", encoding="utf-8")
        with pytest.raises(MalformedSwayReportError, match="not valid JSON"):
            read_sway_report(bad)

    def test_top_level_array_rejected(self, tmp_path: Path) -> None:
        """The top-level JSON value must be an object, not an array."""
        report_path = _write(tmp_path, [])
        with pytest.raises(MalformedSwayReportError, match="must be a JSON object"):
            read_sway_report(report_path)

    def test_missing_schema_version(self, tmp_path: Path) -> None:
        """A report without ``schema_version`` is rejected."""
        payload = {"sway_version": "0.1", "probes": []}
        report_path = _write(tmp_path, payload)
        with pytest.raises(MalformedSwayReportError, match="schema_version"):
            read_sway_report(report_path)

    def test_newer_schema_refused(self, tmp_path: Path) -> None:
        """A schema version beyond what the reader supports is refused."""
        payload = _full_report([])
        payload["schema_version"] = 99
        report_path = _write(tmp_path, payload)
        with pytest.raises(MalformedSwayReportError, match="newer than this reader"):
            read_sway_report(report_path)

    def test_supported_schema_pinned_to_current_sway_version(self) -> None:
        """Cross-repo bump-gate: `_SUPPORTED_SWAY_SCHEMA` must match sway's.

        Sway currently ships schema v1. If sway bumps its schema and this
        constant is not bumped with it (or the other way round), operators
        harvesting newer sway output run into the refusal exercised by
        `test_newer_schema_refused`. Pinning the constant here means a
        casual edit to the reader cannot silently move the support floor:
        this golden fails and forces a code review.
        """
        from dlm.harvest.sway_reader import _SUPPORTED_SWAY_SCHEMA

        pin_changed_hint = (
            "sway schema pin changed — verify `sway/src/dlm_sway/suite/report.py` "
            "still emits this version and bump the docs pointer in `dlm.lock`"
        )
        assert _SUPPORTED_SWAY_SCHEMA == 1, pin_changed_hint

    def test_missing_probes_array(self, tmp_path: Path) -> None:
        """A report without a ``probes`` array is rejected."""
        payload = {"schema_version": 1, "sway_version": "0.1"}
        report_path = _write(tmp_path, payload)
        with pytest.raises(MalformedSwayReportError, match="`probes` array"):
            read_sway_report(report_path)

    def test_probe_evidence_not_object(self, tmp_path: Path) -> None:
        """A failing probe whose ``evidence`` is not an object raises ``NoReferenceError``."""
        broken = {**_PROBE_FAIL_WITH_REF, "evidence": "not-an-object"}
        report_path = _write(tmp_path, _full_report([broken]))
        with pytest.raises(NoReferenceError, match="evidence is not an object"):
            read_sway_report(report_path)

    def test_probe_is_not_object(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
        """A non-object probe entry is skipped with a warning; valid probes still lift."""
        payload = _full_report([_PROBE_FAIL_WITH_REF, "garbage"])  # type: ignore[list-item]
        report_path = _write(tmp_path, payload)

        with caplog.at_level("WARNING"):
            harvested = read_sway_report(report_path)

        assert len(harvested) == 1
        logged = [rec.message for rec in caplog.records]
        assert any("not an object" in text for text in logged)