tenseleyflow/documentlanguagemodel / b465948

Browse files

feat(harvest): package skeleton — errors + sway report reader

Authored by espadonne
SHA
b465948d2932d4910c5e8268edf5f7fac3465a3f
Parents
b24a3b5
Tree
8df468f

5 changed files

StatusFile+-
A src/dlm/harvest/__init__.py 35 0
A src/dlm/harvest/errors.py 31 0
A src/dlm/harvest/sway_reader.py 232 0
A tests/unit/harvest/__init__.py 0 0
A tests/unit/harvest/test_sway_reader.py 196 0
src/dlm/harvest/__init__.pyadded
@@ -0,0 +1,35 @@
1
+"""Adversarial replay harvest — pull mode.
2
+
3
+Post-training, `dlm harvest` reads a sway JSON report, extracts
4
+failing probes with known references, and writes them back as
5
+`!probe`-tagged `::instruction::` sections. The document grows to
6
+contain its own weaknesses; the next retrain picks them up via the
7
+existing probe-sampling path.
8
+
9
+Public surface:
10
+
11
+- :class:`HarvestCandidate` — one failing probe with its
12
+  reference answer, ready to be materialized as a Section.
13
+- :func:`read_sway_report` — parse a sway JSON file into
14
+  candidates.
15
+
16
+Writer + differ land in ``dlm.harvest.diff`` and ``dlm.harvest.applier``
17
+in the next sprint tick.
18
+"""
19
+
20
+from __future__ import annotations
21
+
22
+from dlm.harvest.errors import (
23
+    HarvestError,
24
+    MalformedSwayReportError,
25
+    NoReferenceError,
26
+)
27
+from dlm.harvest.sway_reader import HarvestCandidate, read_sway_report
28
+
29
+__all__ = [
30
+    "HarvestCandidate",
31
+    "HarvestError",
32
+    "MalformedSwayReportError",
33
+    "NoReferenceError",
34
+    "read_sway_report",
35
+]
src/dlm/harvest/errors.pyadded
@@ -0,0 +1,31 @@
1
+"""Typed errors for `dlm harvest`.
2
+
3
+These surface through the CLI reporter with the same
4
+``file:line:col`` treatment as other `DlmError` subclasses.
5
+"""
6
+
7
+from __future__ import annotations
8
+
9
+
10
class HarvestError(Exception):
    """Root of the exception hierarchy raised by the harvest pull path."""
12
+
13
+
14
class MalformedSwayReportError(HarvestError):
    """Signal that a sway JSON report cannot be used by the reader.

    Raised for a report that:
    - cannot be read from disk or does not parse as JSON,
    - is not a JSON object at the top level,
    - lacks a required top-level key (``schema_version``, ``probes``),
    - declares a ``schema_version`` newer than this reader supports.
    """
23
+
24
+
25
class NoReferenceError(HarvestError):
    """Signal that a failing probe carries no harvestable reference answer.

    Instead of quietly emitting an incomplete `!probe` section, harvest
    rejects such probes. Passing ``strict=False`` to
    ``read_sway_report`` turns the error into a logged warning and the
    probe is skipped.
    """
src/dlm/harvest/sway_reader.pyadded
@@ -0,0 +1,232 @@
1
+"""Parse a sway JSON report into harvest candidates.
2
+
3
+Sway emits reports with this shape (see
4
+``sway/src/dlm_sway/suite/report.py``):
5
+
6
+.. code-block:: json
7
+
8
+    {
9
+      "schema_version": 1,
10
+      "sway_version": "...",
11
+      "base_model_id": "...",
12
+      "adapter_id": "...",
13
+      "probes": [
14
+        {
15
+          "name": "...",
16
+          "kind": "...",
17
+          "verdict": "pass" | "fail" | "warn" | "skip" | "error",
18
+          "score": 0.0,
19
+          "evidence": {...},
20
+          "message": "...",
21
+          ...
22
+        }
23
+      ]
24
+    }
25
+
26
+The harvest pull path filters for ``verdict == "fail"`` and lifts
27
+out ``evidence.prompt`` + ``evidence.reference`` as the Q/A pair for
28
+the next retrain. Probes missing either field raise
29
+:class:`NoReferenceError` under strict mode (default) or are skipped
30
+with a logged warning under ``strict=False``.
31
+
32
+``evidence.confidence`` (optional, 0-1) gates candidates via the
33
+caller's ``--min-confidence``. Absent confidence is treated as 1.0
34
+— the probe itself already failed, which is our signal.
35
+"""
36
+
37
+from __future__ import annotations
38
+
39
+import json
40
+import logging
41
+from dataclasses import dataclass
42
+from pathlib import Path
43
+from typing import Any, Final
44
+
45
+from dlm.harvest.errors import MalformedSwayReportError, NoReferenceError
46
+
47
+_LOG = logging.getLogger(__name__)
48
+
49
+# Sway's JSON schema version we know how to parse. A higher version
50
+# in a report triggers a refusal with a clear pointer — sway's schema
51
+# is stable but not fixed forever.
52
+_SUPPORTED_SWAY_SCHEMA: Final[int] = 1
53
+
54
+
55
@dataclass(frozen=True)
class HarvestCandidate:
    """Immutable record of a failed probe that can become a `!probe` section.

    Attributes
    ----------
    prompt:
        Question text; used as the body of `### Q`.
    reference:
        Expected answer; used as the body of `### A`.
    confidence:
        Weight between 0 and 1 that sway attached to the probe's
        reference. Set to 1.0 when the report does not provide one.
    probe_name:
        Readable name of the probe as given in the sway spec. It feeds
        the harvest tag so a synthesized section can be traced back to
        the probe that produced it.
    probe_kind:
        Discriminator of the probe type (``section_internalization``
        and the like).
    source_adapter_version:
        Adapter revision sway was scoring at the time of the failure,
        taken from `adapter_id` when available. Purely informational;
        harvesting does not depend on it.
    """

    prompt: str
    reference: str
    confidence: float
    probe_name: str
    probe_kind: str
    source_adapter_version: str | None
87
+
88
def read_sway_report(
    path: Path | str,
    *,
    strict: bool = True,
    min_confidence: float = 0.0,
) -> list[HarvestCandidate]:
    """Parse a sway JSON report at `path` into harvest candidates.

    Only probes whose ``verdict`` is exactly ``"fail"`` are considered;
    every other verdict is ignored. Entries of ``probes`` that are not
    JSON objects are logged at WARNING level and skipped, regardless of
    `strict`.

    Parameters
    ----------
    path:
        Path to the sway JSON report.
    strict:
        If True (default), raise :class:`NoReferenceError` when a
        failing probe lacks a ``prompt`` / ``reference`` pair. If
        False, log a warning and skip the probe.
    min_confidence:
        Minimum ``evidence.confidence`` for a candidate to survive.
        Default 0.0 accepts all.

    Returns
    -------
    list[HarvestCandidate]
        One candidate per surviving failing probe, in report order.

    Raises
    ------
    MalformedSwayReportError:
        File unreadable (including not being valid UTF-8), not JSON,
        not a JSON object, missing required keys, or carries a newer
        ``schema_version`` than this reader supports.
    NoReferenceError:
        Strict mode + at least one failing probe lacks a reference.
    """
    report_path = Path(path)
    try:
        raw = report_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        # UnicodeDecodeError derives from ValueError, not OSError, so it has
        # to be named explicitly or a non-UTF-8 file escapes as an untyped
        # error instead of the documented MalformedSwayReportError.
        raise MalformedSwayReportError(f"cannot read sway report at {report_path}: {exc}") from exc

    try:
        payload = json.loads(raw)
    except json.JSONDecodeError as exc:
        raise MalformedSwayReportError(
            f"sway report at {report_path} is not valid JSON: {exc}"
        ) from exc

    if not isinstance(payload, dict):
        raise MalformedSwayReportError(
            f"sway report at {report_path} must be a JSON object; got {type(payload).__name__}"
        )

    schema_version = payload.get("schema_version")
    # bool is a subclass of int, so JSON `true` would otherwise be accepted
    # as schema version 1.
    if isinstance(schema_version, bool) or not isinstance(schema_version, int):
        raise MalformedSwayReportError(
            f"sway report at {report_path} missing integer `schema_version`"
        )
    if schema_version > _SUPPORTED_SWAY_SCHEMA:
        raise MalformedSwayReportError(
            f"sway report schema_version={schema_version} is newer than this "
            f"reader supports ({_SUPPORTED_SWAY_SCHEMA}); bump the sway pin "
            "in `dlm.lock` after verifying harvest still round-trips"
        )

    probes = payload.get("probes")
    if not isinstance(probes, list):
        raise MalformedSwayReportError(f"sway report at {report_path} missing `probes` array")

    # A non-empty string `adapter_id` is copied onto every candidate;
    # anything else leaves the field as None.
    adapter_id = payload.get("adapter_id")
    source_adapter_version: str | None = None
    if isinstance(adapter_id, str) and adapter_id:
        source_adapter_version = adapter_id

    candidates: list[HarvestCandidate] = []
    for idx, probe in enumerate(probes):
        if not isinstance(probe, dict):
            _LOG.warning(
                "sway report %s: probe index %d is not an object; skipping",
                report_path,
                idx,
            )
            continue
        if probe.get("verdict") != "fail":
            continue
        try:
            candidate = _probe_to_candidate(
                probe,
                source_adapter_version=source_adapter_version,
            )
        except NoReferenceError:
            if strict:
                raise
            _LOG.warning(
                "sway report %s: probe %r failed but carries no "
                "reference; skipping (use --strict to fail)",
                report_path,
                probe.get("name", "<unnamed>"),
            )
            continue
        if candidate.confidence < min_confidence:
            _LOG.info(
                "harvest: skipping %r (confidence=%.2f < %.2f)",
                candidate.probe_name,
                candidate.confidence,
                min_confidence,
            )
            continue
        candidates.append(candidate)

    return candidates
192
+
193
+
194
def _probe_to_candidate(
    probe: dict[str, Any],
    *,
    source_adapter_version: str | None,
) -> HarvestCandidate:
    """Lift one failing probe into a `HarvestCandidate`.

    Parameters
    ----------
    probe:
        One probe object from the report.
    source_adapter_version:
        Copied unchanged onto the returned candidate.

    Returns
    -------
    HarvestCandidate
        Prompt and reference with surrounding whitespace stripped, and
        confidence clamped to the range 0-1. A missing or unparseable
        ``evidence.confidence`` yields 1.0.

    Raises
    ------
    NoReferenceError
        The evidence is not an object, or it doesn't carry both a
        non-blank string prompt and a non-blank string reference —
        that probe cannot be round-tripped into a supervised Q/A row.
    """
    name = str(probe.get("name") or "<unnamed>")
    kind = str(probe.get("kind") or "")
    # A missing, null or otherwise falsy `evidence` is treated as empty and
    # fails the prompt check below.
    evidence = probe.get("evidence") or {}
    if not isinstance(evidence, dict):
        raise NoReferenceError(f"probe {name!r}: evidence is not an object; cannot harvest")

    prompt_raw = evidence.get("prompt")
    reference_raw = evidence.get("reference")
    if not isinstance(prompt_raw, str) or not prompt_raw.strip():
        raise NoReferenceError(f"probe {name!r}: evidence.prompt missing or non-string")
    if not isinstance(reference_raw, str) or not reference_raw.strip():
        raise NoReferenceError(f"probe {name!r}: evidence.reference missing or non-string")

    confidence_raw = evidence.get("confidence", 1.0)
    try:
        confidence = float(confidence_raw)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: JSON integers are arbitrary-size in Python, and one
        # too large for a float must not abort the whole report.
        confidence = 1.0
    # Clamp into [0, 1]. NaN also ends up as 1.0 here: comparisons with NaN
    # are always false, so min() keeps its first argument.
    confidence = max(0.0, min(1.0, confidence))

    return HarvestCandidate(
        prompt=prompt_raw.strip(),
        reference=reference_raw.strip(),
        confidence=confidence,
        probe_name=name,
        probe_kind=kind,
        source_adapter_version=source_adapter_version,
    )
tests/unit/harvest/__init__.pyadded
tests/unit/harvest/test_sway_reader.pyadded
@@ -0,0 +1,196 @@
1
+"""Unit tests for `dlm.harvest.sway_reader` (Sprint 33.2)."""
2
+
3
+from __future__ import annotations
4
+
5
+import json
6
+from pathlib import Path
7
+
8
+import pytest
9
+
10
+from dlm.harvest import (
11
+    HarvestCandidate,
12
+    MalformedSwayReportError,
13
+    NoReferenceError,
14
+    read_sway_report,
15
+)
16
+
17
+
18
+def _write(tmp_path: Path, payload: object) -> Path:
19
+    report = tmp_path / "sway.json"
20
+    report.write_text(json.dumps(payload), encoding="utf-8")
21
+    return report
22
+
23
+
24
+_PROBE_FAIL_WITH_REF = {
25
+    "name": "fortran_subroutine_semantics",
26
+    "kind": "section_internalization",
27
+    "verdict": "fail",
28
+    "score": 0.22,
29
+    "raw": 0.22,
30
+    "z_score": -1.7,
31
+    "evidence": {
32
+        "prompt": "What does SUBROUTINE DGEMM compute?",
33
+        "reference": "A double-precision general matrix multiplication.",
34
+        "confidence": 0.9,
35
+    },
36
+    "message": "adapter failed semantic recall",
37
+    "duration_s": 0.4,
38
+}
39
+_PROBE_FAIL_NO_REF = {
40
+    "name": "docstring_recall",
41
+    "kind": "prompt_collapse",
42
+    "verdict": "fail",
43
+    "score": 0.1,
44
+    "evidence": {"per_section_scores": [0.1, 0.15]},
45
+    "message": "probe has no Q/A pair to harvest",
46
+    "duration_s": 0.2,
47
+}
48
+_PROBE_PASS = {
49
+    "name": "calibration",
50
+    "kind": "calibration_drift",
51
+    "verdict": "pass",
52
+    "score": 0.95,
53
+    "evidence": {"delta": 0.01},
54
+    "message": "calibration healthy",
55
+    "duration_s": 0.1,
56
+}
57
+
58
+
59
+def _full_report(probes: list[dict]) -> dict:
60
+    return {
61
+        "schema_version": 1,
62
+        "sway_version": "0.1.0.dev0",
63
+        "base_model_id": "smollm2-135m",
64
+        "adapter_id": "run_7",
65
+        "started_at": "2026-04-21T00:00:00Z",
66
+        "finished_at": "2026-04-21T00:05:00Z",
67
+        "wall_seconds": 300.0,
68
+        "probes": probes,
69
+    }
70
+
71
+
72
class TestHappyPath:
    """Well-formed reports and the candidates they should produce."""

    def test_single_failing_probe_lifts_cleanly(self, tmp_path: Path) -> None:
        # One harvestable failure next to a passing probe: only the failure
        # comes back.
        path = _write(tmp_path, _full_report([_PROBE_FAIL_WITH_REF, _PROBE_PASS]))
        harvested = read_sway_report(path)

        assert len(harvested) == 1
        only = harvested[0]
        assert isinstance(only, HarvestCandidate)
        assert only.prompt == "What does SUBROUTINE DGEMM compute?"
        assert only.reference == "A double-precision general matrix multiplication."
        assert only.confidence == pytest.approx(0.9)
        assert only.probe_name == "fortran_subroutine_semantics"
        assert only.probe_kind == "section_internalization"
        assert only.source_adapter_version == "run_7"

    def test_empty_probes_list_yields_empty(self, tmp_path: Path) -> None:
        path = _write(tmp_path, _full_report([]))
        assert read_sway_report(path) == []

    def test_all_passing_yields_empty(self, tmp_path: Path) -> None:
        path = _write(tmp_path, _full_report([_PROBE_PASS, _PROBE_PASS]))
        assert read_sway_report(path) == []

    def test_missing_adapter_id_leaves_source_version_none(self, tmp_path: Path) -> None:
        body = _full_report([_PROBE_FAIL_WITH_REF])
        del body["adapter_id"]
        harvested = read_sway_report(_write(tmp_path, body))
        assert len(harvested) == 1
        assert harvested[0].source_adapter_version is None

    def test_min_confidence_filters(self, tmp_path: Path) -> None:
        # Same probe, but with evidence carrying a confidence of 0.5.
        low_conf = {
            **_PROBE_FAIL_WITH_REF,
            "evidence": {
                "prompt": "q?",
                "reference": "a.",
                "confidence": 0.5,
            },
        }
        path = _write(tmp_path, _full_report([low_conf]))
        assert read_sway_report(path, min_confidence=0.8) == []
        assert len(read_sway_report(path, min_confidence=0.4)) == 1

    def test_missing_confidence_defaults_to_one(self, tmp_path: Path) -> None:
        no_conf = {
            **_PROBE_FAIL_WITH_REF,
            "evidence": {"prompt": "q?", "reference": "a."},
        }
        path = _write(tmp_path, _full_report([no_conf]))
        harvested = read_sway_report(path)
        assert len(harvested) == 1
        assert harvested[0].confidence == 1.0
121
+
122
+
123
class TestMissingReference:
    """Failing probes that cannot supply a prompt/reference pair."""

    def test_strict_raises(self, tmp_path: Path) -> None:
        path = _write(tmp_path, _full_report([_PROBE_FAIL_NO_REF]))
        with pytest.raises(NoReferenceError):
            read_sway_report(path, strict=True)

    def test_lax_skips_with_log(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
        # The reference-less probe is dropped with a warning; the
        # harvestable one still comes through.
        payload = _full_report([_PROBE_FAIL_NO_REF, _PROBE_FAIL_WITH_REF])
        path = _write(tmp_path, payload)
        with caplog.at_level("WARNING"):
            harvested = read_sway_report(path, strict=False)
        assert len(harvested) == 1
        assert harvested[0].probe_name == "fortran_subroutine_semantics"
        assert any("carries no reference" in rec.message for rec in caplog.records)

    def test_empty_reference_string_refused(self, tmp_path: Path) -> None:
        # A whitespace-only reference counts as missing.
        blank = {
            **_PROBE_FAIL_WITH_REF,
            "evidence": {"prompt": "q?", "reference": "   "},
        }
        path = _write(tmp_path, _full_report([blank]))
        with pytest.raises(NoReferenceError):
            read_sway_report(path, strict=True)
146
+
147
+
148
class TestMalformed:
    """Structurally broken reports and probe entries."""

    def test_file_missing(self, tmp_path: Path) -> None:
        absent = tmp_path / "does-not-exist.json"
        with pytest.raises(MalformedSwayReportError, match="cannot read"):
            read_sway_report(absent)

    def test_not_json(self, tmp_path: Path) -> None:
        garbage = tmp_path / "bad.json"
        garbage.write_text("this is not json {", encoding="utf-8")
        with pytest.raises(MalformedSwayReportError, match="not valid JSON"):
            read_sway_report(garbage)

    def test_top_level_array_rejected(self, tmp_path: Path) -> None:
        path = _write(tmp_path, [])
        with pytest.raises(MalformedSwayReportError, match="must be a JSON object"):
            read_sway_report(path)

    def test_missing_schema_version(self, tmp_path: Path) -> None:
        path = _write(tmp_path, {"sway_version": "0.1", "probes": []})
        with pytest.raises(MalformedSwayReportError, match="schema_version"):
            read_sway_report(path)

    def test_newer_schema_refused(self, tmp_path: Path) -> None:
        body = {**_full_report([]), "schema_version": 99}
        path = _write(tmp_path, body)
        with pytest.raises(MalformedSwayReportError, match="newer than this reader"):
            read_sway_report(path)

    def test_missing_probes_array(self, tmp_path: Path) -> None:
        path = _write(tmp_path, {"schema_version": 1, "sway_version": "0.1"})
        with pytest.raises(MalformedSwayReportError, match="`probes` array"):
            read_sway_report(path)

    def test_probe_evidence_not_object(self, tmp_path: Path) -> None:
        # In default strict mode, non-dict evidence surfaces as NoReferenceError.
        broken = {**_PROBE_FAIL_WITH_REF, "evidence": "not-an-object"}
        path = _write(tmp_path, _full_report([broken]))
        with pytest.raises(NoReferenceError, match="evidence is not an object"):
            read_sway_report(path)

    def test_probe_is_not_object(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
        # A non-dict probe entry is skipped with a warning even in strict mode.
        payload = _full_report([_PROBE_FAIL_WITH_REF, "garbage"])  # type: ignore[list-item]
        path = _write(tmp_path, payload)
        with caplog.at_level("WARNING"):
            harvested = read_sway_report(path)
        assert len(harvested) == 1
        assert any("not an object" in rec.message for rec in caplog.records)