documentlanguagemodel Public

Watch 0 Fork 0 Star 0

Python · 8729 bytes Raw Blame History

  
        1
        """Unit tests for `dlm.harvest.sway_reader` (Sprint 33.2)."""
      
        2
        
        3
        from __future__ import annotations
      
        4
        
        5
        import json
      
        6
        from pathlib import Path
      
        7
        
        8
        import pytest
      
        9
        
        10
        from dlm.harvest import (
      
        11
            HarvestCandidate,
      
        12
            MalformedSwayReportError,
      
        13
            NoReferenceError,
      
        14
            read_sway_report,
      
        15
        )
      
        16
        
        17
        
        18
        def _write(tmp_path: Path, payload: object) -> Path:
            """Serialize *payload* as JSON into ``sway.json`` under *tmp_path*.

            Args:
                tmp_path: Directory that receives the report file.
                payload: Object handed to ``json.dumps``. It is not validated,
                    so tests can write deliberately malformed reports.

            Returns:
                Path of the written ``sway.json`` file.
            """
            report = tmp_path / "sway.json"
            report.write_text(json.dumps(payload), encoding="utf-8")
            return report
      
        22
        
        23
        
        24
        # Failing probe whose evidence carries a prompt/reference pair plus a
        # confidence value.
        _PROBE_FAIL_WITH_REF = {
            "name": "fortran_subroutine_semantics",
            "kind": "section_internalization",
            "verdict": "fail",
            "score": 0.22,
            "raw": 0.22,
            "z_score": -1.7,
            "evidence": {
                "prompt": "What does SUBROUTINE DGEMM compute?",
                "reference": "A double-precision general matrix multiplication.",
                "confidence": 0.9,
            },
            "message": "adapter failed semantic recall",
            "duration_s": 0.4,
        }
        # Failing probe whose evidence has neither a prompt nor a reference.
        _PROBE_FAIL_NO_REF = {
            "name": "docstring_recall",
            "kind": "prompt_collapse",
            "verdict": "fail",
            "score": 0.1,
            "evidence": {"per_section_scores": [0.1, 0.15]},
            "message": "probe has no Q/A pair to harvest",
            "duration_s": 0.2,
        }
        # Passing probe; the tests expect it to yield no candidate.
        _PROBE_PASS = {
            "name": "calibration",
            "kind": "calibration_drift",
            "verdict": "pass",
            "score": 0.95,
            "evidence": {"delta": 0.01},
            "message": "calibration healthy",
            "duration_s": 0.1,
        }
      
        57
        
        58
        
        59
        def _full_report(probes: list[dict]) -> dict:
            """Build a complete schema-version-1 report dict around *probes*.

            Args:
                probes: Probe entries stored as-is under the ``"probes"`` key.

            Returns:
                A fresh dict on every call, so a test may delete or overwrite
                keys without affecting other tests.
            """
            return {
                "schema_version": 1,
                "sway_version": "0.1.0.dev0",
                "base_model_id": "smollm2-135m",
                "adapter_id": "run_7",
                "started_at": "2026-04-21T00:00:00Z",
                "finished_at": "2026-04-21T00:05:00Z",
                "wall_seconds": 300.0,
                "probes": probes,
            }
      
        70
        
        71
        
        72
        class TestHappyPath:
            """Well-formed reports: which probes turn into harvest candidates."""

            def test_single_failing_probe_lifts_cleanly(self, tmp_path: Path) -> None:
                """A failing probe with a reference yields one fully populated candidate."""
                report = _write(tmp_path, _full_report([_PROBE_FAIL_WITH_REF, _PROBE_PASS]))
                candidates = read_sway_report(report)

                # Only the failing probe is expected; the passing one is dropped.
                assert len(candidates) == 1
                c = candidates[0]
                assert isinstance(c, HarvestCandidate)
                assert c.prompt == "What does SUBROUTINE DGEMM compute?"
                assert c.reference == "A double-precision general matrix multiplication."
                assert c.confidence == pytest.approx(0.9)
                assert c.probe_name == "fortran_subroutine_semantics"
                assert c.probe_kind == "section_internalization"
                assert c.source_adapter_version == "run_7"

            def test_empty_probes_list_yields_empty(self, tmp_path: Path) -> None:
                """A report with no probes produces no candidates."""
                report = _write(tmp_path, _full_report([]))
                assert read_sway_report(report) == []

            def test_all_passing_yields_empty(self, tmp_path: Path) -> None:
                """Passing probes are never harvested."""
                report = _write(tmp_path, _full_report([_PROBE_PASS, _PROBE_PASS]))
                assert read_sway_report(report) == []

            def test_missing_adapter_id_leaves_source_version_none(self, tmp_path: Path) -> None:
                """Without ``adapter_id`` the candidate's source version is ``None``."""
                payload = _full_report([_PROBE_FAIL_WITH_REF])
                del payload["adapter_id"]
                report = _write(tmp_path, payload)
                candidates = read_sway_report(report)
                assert len(candidates) == 1
                assert candidates[0].source_adapter_version is None

            def test_min_confidence_filters(self, tmp_path: Path) -> None:
                """Candidates below ``min_confidence`` are filtered out."""
                # Shallow copy, then replace "evidence" so the shared fixture
                # dict is never mutated.
                low_conf = {**_PROBE_FAIL_WITH_REF}
                low_conf["evidence"] = {
                    "prompt": "q?",
                    "reference": "a.",
                    "confidence": 0.5,
                }
                report = _write(tmp_path, _full_report([low_conf]))
                assert read_sway_report(report, min_confidence=0.8) == []
                assert len(read_sway_report(report, min_confidence=0.4)) == 1

            def test_missing_confidence_defaults_to_one(self, tmp_path: Path) -> None:
                """Evidence without ``confidence`` is treated as confidence 1.0."""
                no_conf = {**_PROBE_FAIL_WITH_REF}
                no_conf["evidence"] = {"prompt": "q?", "reference": "a."}
                report = _write(tmp_path, _full_report([no_conf]))
                candidates = read_sway_report(report)
                assert len(candidates) == 1
                assert candidates[0].confidence == 1.0

            def test_invalid_confidence_defaults_to_one(self, tmp_path: Path) -> None:
                """A non-numeric ``confidence`` also falls back to 1.0."""
                broken_conf = {**_PROBE_FAIL_WITH_REF}
                broken_conf["evidence"] = {
                    "prompt": "q?",
                    "reference": "a.",
                    "confidence": {"not": "numeric"},
                }
                report = _write(tmp_path, _full_report([broken_conf]))
                candidates = read_sway_report(report)
                assert len(candidates) == 1
                assert candidates[0].confidence == 1.0
      
        133
        
        134
        
        135
        class TestMissingReference:
            """Failing probes whose evidence lacks a usable reference answer."""

            def test_strict_raises(self, tmp_path: Path) -> None:
                """With ``strict=True`` a reference-less failing probe raises."""
                report = _write(tmp_path, _full_report([_PROBE_FAIL_NO_REF]))
                with pytest.raises(NoReferenceError):
                    read_sway_report(report, strict=True)

            def test_lax_skips_with_log(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
                """With ``strict=False`` the probe is skipped and a warning is logged."""
                report = _write(
                    tmp_path,
                    _full_report([_PROBE_FAIL_NO_REF, _PROBE_FAIL_WITH_REF]),
                )
                with caplog.at_level("WARNING"):
                    candidates = read_sway_report(report, strict=False)
                # The probe that does carry a reference must still come through.
                assert len(candidates) == 1
                assert candidates[0].probe_name == "fortran_subroutine_semantics"
                assert any("carries no reference" in rec.message for rec in caplog.records)

            def test_empty_reference_string_refused(self, tmp_path: Path) -> None:
                """A whitespace-only reference counts as no reference."""
                empty = {**_PROBE_FAIL_WITH_REF}
                empty["evidence"] = {"prompt": "q?", "reference": "   "}
                report = _write(tmp_path, _full_report([empty]))
                with pytest.raises(NoReferenceError):
                    read_sway_report(report, strict=True)
      
        158
        
        159
        
        160
        class TestMalformed:
            """Unreadable, non-JSON, or structurally invalid report files."""

            def test_file_missing(self, tmp_path: Path) -> None:
                """A nonexistent path raises with a "cannot read" message."""
                with pytest.raises(MalformedSwayReportError, match="cannot read"):
                    read_sway_report(tmp_path / "does-not-exist.json")

            def test_not_json(self, tmp_path: Path) -> None:
                """Content that is not JSON is reported as such."""
                bad = tmp_path / "bad.json"
                bad.write_text("this is not json {", encoding="utf-8")
                with pytest.raises(MalformedSwayReportError, match="not valid JSON"):
                    read_sway_report(bad)

            def test_top_level_array_rejected(self, tmp_path: Path) -> None:
                """The top-level JSON value must be an object, not an array."""
                report = _write(tmp_path, [])
                with pytest.raises(MalformedSwayReportError, match="must be a JSON object"):
                    read_sway_report(report)

            def test_missing_schema_version(self, tmp_path: Path) -> None:
                """A report without ``schema_version`` is rejected."""
                report = _write(tmp_path, {"sway_version": "0.1", "probes": []})
                with pytest.raises(MalformedSwayReportError, match="schema_version"):
                    read_sway_report(report)

            def test_newer_schema_refused(self, tmp_path: Path) -> None:
                """A schema version above the supported one is refused."""
                payload = _full_report([])
                payload["schema_version"] = 99
                report = _write(tmp_path, payload)
                with pytest.raises(MalformedSwayReportError, match="newer than this reader"):
                    read_sway_report(report)

            def test_supported_schema_pinned_to_current_sway_version(self) -> None:
                """Cross-repo bump-gate: `_SUPPORTED_SWAY_SCHEMA` must match sway's.

                Sway's shipping schema is v1. If a sway bump lands but we forget
                to bump this constant (or vice versa), any operator harvesting
                newer sway output hits `test_newer_schema_refused`'s refusal
                path. This test pins the constant itself so a casual edit to
                the reader can't silently drop the support floor — the golden
                fails and forces a code review.
                """
                # Imported locally: the constant is private to the reader module.
                from dlm.harvest.sway_reader import _SUPPORTED_SWAY_SCHEMA

                assert _SUPPORTED_SWAY_SCHEMA == 1, (
                    "sway schema pin changed — verify `sway/src/dlm_sway/suite/report.py` "
                    "still emits this version and bump the docs pointer in `dlm.lock`"
                )

            def test_missing_probes_array(self, tmp_path: Path) -> None:
                """A report without a ``probes`` array is rejected."""
                report = _write(tmp_path, {"schema_version": 1, "sway_version": "0.1"})
                with pytest.raises(MalformedSwayReportError, match="`probes` array"):
                    read_sway_report(report)

            def test_probe_evidence_not_object(self, tmp_path: Path) -> None:
                """Non-object evidence raises ``NoReferenceError`` when ``strict`` is not passed."""
                broken = {**_PROBE_FAIL_WITH_REF}
                broken["evidence"] = "not-an-object"
                report = _write(tmp_path, _full_report([broken]))
                with pytest.raises(NoReferenceError, match="evidence is not an object"):
                    read_sway_report(report)

            def test_probe_is_not_object(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
                """A non-object probe entry is skipped with a warning, not a failure."""
                report = _write(
                    tmp_path,
                    _full_report([_PROBE_FAIL_WITH_REF, "garbage"]),  # type: ignore[list-item]
                )
                with caplog.at_level("WARNING"):
                    candidates = read_sway_report(report)
                # The valid probe still produces its candidate.
                assert len(candidates) == 1
                assert any("not an object" in rec.message for rec in caplog.records)

1	"""Unit tests for `dlm.harvest.sway_reader` (Sprint 33.2)."""
2
3	from __future__ import annotations
4
5	import json
6	from pathlib import Path
7
8	import pytest
9
10	from dlm.harvest import (
11	HarvestCandidate,
12	MalformedSwayReportError,
13	NoReferenceError,
14	read_sway_report,
15	)
16
17
def _write(tmp_path: Path, payload: object) -> Path:
    """Serialize *payload* as JSON into ``sway.json`` under *tmp_path*.

    Args:
        tmp_path: Directory that receives the report file.
        payload: Object handed to ``json.dumps``. It is not validated,
            so tests can write deliberately malformed reports.

    Returns:
        Path of the written ``sway.json`` file.
    """
    report = tmp_path / "sway.json"
    report.write_text(json.dumps(payload), encoding="utf-8")
    return report
22
23
# Failing probe whose evidence carries a prompt/reference pair plus a
# confidence value.
_PROBE_FAIL_WITH_REF = {
    "name": "fortran_subroutine_semantics",
    "kind": "section_internalization",
    "verdict": "fail",
    "score": 0.22,
    "raw": 0.22,
    "z_score": -1.7,
    "evidence": {
        "prompt": "What does SUBROUTINE DGEMM compute?",
        "reference": "A double-precision general matrix multiplication.",
        "confidence": 0.9,
    },
    "message": "adapter failed semantic recall",
    "duration_s": 0.4,
}
# Failing probe whose evidence has neither a prompt nor a reference.
_PROBE_FAIL_NO_REF = {
    "name": "docstring_recall",
    "kind": "prompt_collapse",
    "verdict": "fail",
    "score": 0.1,
    "evidence": {"per_section_scores": [0.1, 0.15]},
    "message": "probe has no Q/A pair to harvest",
    "duration_s": 0.2,
}
# Passing probe; the tests expect it to yield no candidate.
_PROBE_PASS = {
    "name": "calibration",
    "kind": "calibration_drift",
    "verdict": "pass",
    "score": 0.95,
    "evidence": {"delta": 0.01},
    "message": "calibration healthy",
    "duration_s": 0.1,
}
57
58
def _full_report(probes: list[dict]) -> dict:
    """Build a complete schema-version-1 report dict around *probes*.

    Args:
        probes: Probe entries stored as-is under the ``"probes"`` key.

    Returns:
        A fresh dict on every call, so a test may delete or overwrite
        keys without affecting other tests.
    """
    return {
        "schema_version": 1,
        "sway_version": "0.1.0.dev0",
        "base_model_id": "smollm2-135m",
        "adapter_id": "run_7",
        "started_at": "2026-04-21T00:00:00Z",
        "finished_at": "2026-04-21T00:05:00Z",
        "wall_seconds": 300.0,
        "probes": probes,
    }
70
71
class TestHappyPath:
    """Well-formed reports: which probes turn into harvest candidates."""

    def test_single_failing_probe_lifts_cleanly(self, tmp_path: Path) -> None:
        """A failing probe with a reference yields one fully populated candidate."""
        report = _write(tmp_path, _full_report([_PROBE_FAIL_WITH_REF, _PROBE_PASS]))
        candidates = read_sway_report(report)

        # Only the failing probe is expected; the passing one is dropped.
        assert len(candidates) == 1
        c = candidates[0]
        assert isinstance(c, HarvestCandidate)
        assert c.prompt == "What does SUBROUTINE DGEMM compute?"
        assert c.reference == "A double-precision general matrix multiplication."
        assert c.confidence == pytest.approx(0.9)
        assert c.probe_name == "fortran_subroutine_semantics"
        assert c.probe_kind == "section_internalization"
        assert c.source_adapter_version == "run_7"

    def test_empty_probes_list_yields_empty(self, tmp_path: Path) -> None:
        """A report with no probes produces no candidates."""
        report = _write(tmp_path, _full_report([]))
        assert read_sway_report(report) == []

    def test_all_passing_yields_empty(self, tmp_path: Path) -> None:
        """Passing probes are never harvested."""
        report = _write(tmp_path, _full_report([_PROBE_PASS, _PROBE_PASS]))
        assert read_sway_report(report) == []

    def test_missing_adapter_id_leaves_source_version_none(self, tmp_path: Path) -> None:
        """Without ``adapter_id`` the candidate's source version is ``None``."""
        payload = _full_report([_PROBE_FAIL_WITH_REF])
        del payload["adapter_id"]
        report = _write(tmp_path, payload)
        candidates = read_sway_report(report)
        assert len(candidates) == 1
        assert candidates[0].source_adapter_version is None

    def test_min_confidence_filters(self, tmp_path: Path) -> None:
        """Candidates below ``min_confidence`` are filtered out."""
        # Shallow copy, then replace "evidence" so the shared fixture
        # dict is never mutated.
        low_conf = {**_PROBE_FAIL_WITH_REF}
        low_conf["evidence"] = {
            "prompt": "q?",
            "reference": "a.",
            "confidence": 0.5,
        }
        report = _write(tmp_path, _full_report([low_conf]))
        assert read_sway_report(report, min_confidence=0.8) == []
        assert len(read_sway_report(report, min_confidence=0.4)) == 1

    def test_missing_confidence_defaults_to_one(self, tmp_path: Path) -> None:
        """Evidence without ``confidence`` is treated as confidence 1.0."""
        no_conf = {**_PROBE_FAIL_WITH_REF}
        no_conf["evidence"] = {"prompt": "q?", "reference": "a."}
        report = _write(tmp_path, _full_report([no_conf]))
        candidates = read_sway_report(report)
        assert len(candidates) == 1
        assert candidates[0].confidence == 1.0

    def test_invalid_confidence_defaults_to_one(self, tmp_path: Path) -> None:
        """A non-numeric ``confidence`` also falls back to 1.0."""
        broken_conf = {**_PROBE_FAIL_WITH_REF}
        broken_conf["evidence"] = {
            "prompt": "q?",
            "reference": "a.",
            "confidence": {"not": "numeric"},
        }
        report = _write(tmp_path, _full_report([broken_conf]))
        candidates = read_sway_report(report)
        assert len(candidates) == 1
        assert candidates[0].confidence == 1.0
133
134
class TestMissingReference:
    """Failing probes whose evidence lacks a usable reference answer."""

    def test_strict_raises(self, tmp_path: Path) -> None:
        """With ``strict=True`` a reference-less failing probe raises."""
        report = _write(tmp_path, _full_report([_PROBE_FAIL_NO_REF]))
        with pytest.raises(NoReferenceError):
            read_sway_report(report, strict=True)

    def test_lax_skips_with_log(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
        """With ``strict=False`` the probe is skipped and a warning is logged."""
        report = _write(
            tmp_path,
            _full_report([_PROBE_FAIL_NO_REF, _PROBE_FAIL_WITH_REF]),
        )
        with caplog.at_level("WARNING"):
            candidates = read_sway_report(report, strict=False)
        # The probe that does carry a reference must still come through.
        assert len(candidates) == 1
        assert candidates[0].probe_name == "fortran_subroutine_semantics"
        assert any("carries no reference" in rec.message for rec in caplog.records)

    def test_empty_reference_string_refused(self, tmp_path: Path) -> None:
        """A whitespace-only reference counts as no reference."""
        empty = {**_PROBE_FAIL_WITH_REF}
        empty["evidence"] = {"prompt": "q?", "reference": "   "}
        report = _write(tmp_path, _full_report([empty]))
        with pytest.raises(NoReferenceError):
            read_sway_report(report, strict=True)
158
159
class TestMalformed:
    """Unreadable, non-JSON, or structurally invalid report files."""

    def test_file_missing(self, tmp_path: Path) -> None:
        """A nonexistent path raises with a "cannot read" message."""
        with pytest.raises(MalformedSwayReportError, match="cannot read"):
            read_sway_report(tmp_path / "does-not-exist.json")

    def test_not_json(self, tmp_path: Path) -> None:
        """Content that is not JSON is reported as such."""
        bad = tmp_path / "bad.json"
        bad.write_text("this is not json {", encoding="utf-8")
        with pytest.raises(MalformedSwayReportError, match="not valid JSON"):
            read_sway_report(bad)

    def test_top_level_array_rejected(self, tmp_path: Path) -> None:
        """The top-level JSON value must be an object, not an array."""
        report = _write(tmp_path, [])
        with pytest.raises(MalformedSwayReportError, match="must be a JSON object"):
            read_sway_report(report)

    def test_missing_schema_version(self, tmp_path: Path) -> None:
        """A report without ``schema_version`` is rejected."""
        report = _write(tmp_path, {"sway_version": "0.1", "probes": []})
        with pytest.raises(MalformedSwayReportError, match="schema_version"):
            read_sway_report(report)

    def test_newer_schema_refused(self, tmp_path: Path) -> None:
        """A schema version above the supported one is refused."""
        payload = _full_report([])
        payload["schema_version"] = 99
        report = _write(tmp_path, payload)
        with pytest.raises(MalformedSwayReportError, match="newer than this reader"):
            read_sway_report(report)

    def test_supported_schema_pinned_to_current_sway_version(self) -> None:
        """Cross-repo bump-gate: `_SUPPORTED_SWAY_SCHEMA` must match sway's.

        Sway's shipping schema is v1. If a sway bump lands but we forget
        to bump this constant (or vice versa), any operator harvesting
        newer sway output hits `test_newer_schema_refused`'s refusal
        path. This test pins the constant itself so a casual edit to
        the reader can't silently drop the support floor — the golden
        fails and forces a code review.
        """
        # Imported locally: the constant is private to the reader module.
        from dlm.harvest.sway_reader import _SUPPORTED_SWAY_SCHEMA

        assert _SUPPORTED_SWAY_SCHEMA == 1, (
            "sway schema pin changed — verify `sway/src/dlm_sway/suite/report.py` "
            "still emits this version and bump the docs pointer in `dlm.lock`"
        )

    def test_missing_probes_array(self, tmp_path: Path) -> None:
        """A report without a ``probes`` array is rejected."""
        report = _write(tmp_path, {"schema_version": 1, "sway_version": "0.1"})
        with pytest.raises(MalformedSwayReportError, match="`probes` array"):
            read_sway_report(report)

    def test_probe_evidence_not_object(self, tmp_path: Path) -> None:
        """Non-object evidence raises ``NoReferenceError`` when ``strict`` is not passed."""
        broken = {**_PROBE_FAIL_WITH_REF}
        broken["evidence"] = "not-an-object"
        report = _write(tmp_path, _full_report([broken]))
        with pytest.raises(NoReferenceError, match="evidence is not an object"):
            read_sway_report(report)

    def test_probe_is_not_object(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
        """A non-object probe entry is skipped with a warning, not a failure."""
        report = _write(
            tmp_path,
            _full_report([_PROBE_FAIL_WITH_REF, "garbage"]),  # type: ignore[list-item]
        )
        with caplog.at_level("WARNING"):
            candidates = read_sway_report(report)
        # The valid probe still produces its candidate.
        assert len(candidates) == 1
        assert any("not an object" in rec.message for rec in caplog.records)