`b465948`

feat(harvest): package skeleton — errors + sway report reader

Authored by

espadonne 3 weeks ago

SHA: b465948d2932d4910c5e8268edf5f7fac3465a3f
Parents: b24a3b5
Tree: 8df468f

5 changed files

Status	File	+
A	`src/dlm/harvest/__init__.py`	35
A	`src/dlm/harvest/errors.py`	31
A	`src/dlm/harvest/sway_reader.py`	232
A	`tests/unit/harvest/__init__.py`	0
A	`tests/unit/harvest/test_sway_reader.py`	196

src/dlm/harvest/__init__.py — added

 +"""Adversarial replay harvest — pull mode.
++
 +Post-training, `dlm harvest` reads a sway JSON report, extracts
 +failing probes with known references, and writes them back as
 +`!probe`-tagged `::instruction::` sections. The document grows to
 +contain its own weaknesses; the next retrain picks them up via the
 +existing probe-sampling path.
++
 +Public surface:
++
 +- :class:`HarvestCandidate` — one failing probe with its
 +  reference answer, ready to be materialized as a Section.
 +- :func:`read_sway_report` — parse a sway JSON file into
 +  candidates.
++
 +Writer + differ land in ``dlm.harvest.diff`` and ``dlm.harvest.applier``
 +in the next sprint tick.
 +"""
++
 +from __future__ import annotations
++
 +from dlm.harvest.errors import (
 +    HarvestError,
 +    MalformedSwayReportError,
 +    NoReferenceError,
 +)
 +from dlm.harvest.sway_reader import HarvestCandidate, read_sway_report
++
 +__all__ = [
 +    "HarvestCandidate",
 +    "HarvestError",
 +    "MalformedSwayReportError",
 +    "NoReferenceError",
 +    "read_sway_report",
 +]

src/dlm/harvest/errors.py — added

 +"""Typed errors for `dlm harvest`.
++
 +These surface through the CLI reporter with the same
 +``file:line:col`` treatment as other `DlmError` subclasses.
 +"""
++
 +from __future__ import annotations
++
++
 +class HarvestError(Exception):
 +    """Base exception for the harvest pull path.
++
 +    Every harvest-specific error in this module derives from it, so
 +    callers can catch the whole family with one ``except`` clause.
 +    """
 +    # NOTE(review): the module docstring groups these with `DlmError`
 +    # subclasses, but this base derives from plain `Exception` — confirm
 +    # which is intended.
++
++
 +class MalformedSwayReportError(HarvestError):
 +    """The sway JSON report is unreadable or has an unexpected shape.
++
 +    Raised when:
 +    - The file cannot be read.
 +    - The file doesn't parse as JSON, or its top level is not an
 +      object.
 +    - Required top-level keys are missing or mistyped
 +      (``schema_version`` must be an integer, ``probes`` an array).
 +    - ``schema_version`` is newer than this reader supports.
 +    """
++
++
 +class NoReferenceError(HarvestError):
 +    """A failing probe has no reference answer we can harvest.
++
 +    Raised when a failing probe's ``evidence`` is not an object, or
 +    when its ``prompt`` / ``reference`` is missing, not a string, or
 +    blank. Harvest refuses such probes rather than silently writing
 +    incomplete `!probe` sections. Callers can downgrade this to a
 +    WARN via ``read_sway_report(..., strict=False)``.
 +    """

src/dlm/harvest/sway_reader.py — added

 +"""Parse a sway JSON report into harvest candidates.
++
 +Sway emits reports with this shape (see
 +``sway/src/dlm_sway/suite/report.py``):
++
 +.. code-block:: json
++
 +    {
 +      "schema_version": 1,
 +      "sway_version": "...",
 +      "base_model_id": "...",
 +      "adapter_id": "...",
 +      "probes": [
 +        {
 +          "name": "...",
 +          "kind": "...",
 +          "verdict": "pass" | "fail" | "warn" | "skip" | "error",
 +          "score": 0.0,
 +          "evidence": {...},
 +          "message": "...",
 +          ...
 +        }
 +      ]
 +    }
++
 +The harvest pull path filters for ``verdict == "fail"`` and lifts
 +out ``evidence.prompt`` + ``evidence.reference`` as the Q/A pair for
 +the next retrain. A failing probe without both fields raises
 +:class:`NoReferenceError` under strict mode (default); under
 +``strict=False`` it is logged at WARNING and skipped.
++
 +``evidence.confidence`` (optional, 0-1) gates candidates via the
 +caller's ``--min-confidence``. Absent confidence is treated as 1.0
 +— the probe itself already failed, which is our signal.
 +"""
++
 +from __future__ import annotations
++
 +import json
 +import logging
 +from dataclasses import dataclass
 +from pathlib import Path
 +from typing import Any, Final
++
 +from dlm.harvest.errors import MalformedSwayReportError, NoReferenceError
++
 +_LOG = logging.getLogger(__name__)
++
 +# Sway's JSON schema version we know how to parse. A higher version
 +# in a report triggers a refusal with a clear pointer — sway's schema
 +# is stable but not fixed forever.
 +_SUPPORTED_SWAY_SCHEMA: Final[int] = 1
++
++
 +@dataclass(frozen=True)
 +class HarvestCandidate:
 +    """One failing probe ready to become a `!probe`-tagged section.
++
 +    Instances are immutable (frozen dataclass).
++
 +    Attributes
 +    ----------
 +    prompt:
 +        The question text, taken from ``evidence.prompt`` with
 +        surrounding whitespace stripped. Becomes the `### Q` body.
 +    reference:
 +        The expected answer, taken from ``evidence.reference`` with
 +        surrounding whitespace stripped. Becomes the `### A` body.
 +    confidence:
 +        0-1 weight sway assigned to this probe's reference, when
 +        present; the reader clamps it into that range. Defaults to
 +        1.0 when the report doesn't carry it or the value cannot be
 +        converted to a float.
 +    probe_name:
 +        Human-readable probe name from the sway spec
 +        (``"<unnamed>"`` when the report has none). Used for the
 +        harvest tag so users can trace a synthesized section back to
 +        its probe origin.
 +    probe_kind:
 +        Probe discriminator (``section_internalization`` etc.);
 +        empty string when the report omits it.
 +    source_adapter_version:
 +        The report's top-level ``adapter_id``, copied verbatim when
 +        it is a non-empty string; ``None`` otherwise. Informational;
 +        the harvest itself doesn't need it.
 +    """
++
 +    prompt: str
 +    reference: str
 +    confidence: float
 +    probe_name: str
 +    probe_kind: str
 +    source_adapter_version: str | None
++
++
 +def read_sway_report(
 +    path: Path | str,
 +    *,
 +    strict: bool = True,
 +    min_confidence: float = 0.0,
 +) -> list[HarvestCandidate]:
 +    """Parse a sway JSON report at `path` into harvest candidates.
++
 +    Only probes whose ``verdict`` is ``"fail"`` are considered. Probe
 +    entries that are not JSON objects are logged at WARNING and
 +    skipped regardless of `strict`.
++
 +    Parameters
 +    ----------
 +    path:
 +        Path to the sway JSON report.
 +    strict:
 +        If True (default), raise :class:`NoReferenceError` when a
 +        failing probe lacks a ``prompt`` / ``reference`` pair. If
 +        False, log a warning and skip the probe.
 +    min_confidence:
 +        Minimum ``evidence.confidence`` for a candidate to survive.
 +        Default 0.0 accepts all.
++
 +    Returns
 +    -------
 +    list[HarvestCandidate]
 +        Surviving candidates, in report order.
++
 +    Raises
 +    ------
 +    MalformedSwayReportError:
 +        File unreadable (I/O failure or not valid UTF-8), not JSON,
 +        not a JSON object, missing required keys, or carries a newer
 +        ``schema_version`` than this reader supports.
 +    NoReferenceError:
 +        Strict mode + at least one failing probe lacks a reference.
 +    """
 +    report_path = Path(path)
 +    try:
 +        raw = report_path.read_text(encoding="utf-8")
 +    except (OSError, UnicodeDecodeError) as exc:
 +        # UnicodeDecodeError is a ValueError, not an OSError; without it a
 +        # binary or mis-encoded file would escape the documented contract.
 +        raise MalformedSwayReportError(f"cannot read sway report at {report_path}: {exc}") from exc
++
 +    try:
 +        payload = json.loads(raw)
 +    except json.JSONDecodeError as exc:
 +        raise MalformedSwayReportError(
 +            f"sway report at {report_path} is not valid JSON: {exc}"
 +        ) from exc
++
 +    if not isinstance(payload, dict):
 +        raise MalformedSwayReportError(
 +            f"sway report at {report_path} must be a JSON object; got {type(payload).__name__}"
 +        )
++
 +    schema_version = payload.get("schema_version")
 +    # bool is a subclass of int, so JSON `true` would otherwise pass as 1.
 +    if isinstance(schema_version, bool) or not isinstance(schema_version, int):
 +        raise MalformedSwayReportError(
 +            f"sway report at {report_path} missing integer `schema_version`"
 +        )
 +    if schema_version > _SUPPORTED_SWAY_SCHEMA:
 +        raise MalformedSwayReportError(
 +            f"sway report schema_version={schema_version} is newer than this "
 +            f"reader supports ({_SUPPORTED_SWAY_SCHEMA}); bump the sway pin "
 +            "in `dlm.lock` after verifying harvest still round-trips"
 +        )
++
 +    probes = payload.get("probes")
 +    if not isinstance(probes, list):
 +        raise MalformedSwayReportError(f"sway report at {report_path} missing `probes` array")
++
 +    # The adapter id is report-wide, so resolve it once for all candidates.
 +    adapter_id = payload.get("adapter_id")
 +    source_adapter_version: str | None = None
 +    if isinstance(adapter_id, str) and adapter_id:
 +        source_adapter_version = adapter_id
++
 +    candidates: list[HarvestCandidate] = []
 +    for idx, probe in enumerate(probes):
 +        if not isinstance(probe, dict):
 +            _LOG.warning(
 +                "sway report %s: probe index %d is not an object; skipping",
 +                report_path,
 +                idx,
 +            )
 +            continue
 +        if probe.get("verdict") != "fail":
 +            continue
 +        try:
 +            candidate = _probe_to_candidate(
 +                probe,
 +                source_adapter_version=source_adapter_version,
 +            )
 +        except NoReferenceError:
 +            if strict:
 +                raise
 +            _LOG.warning(
 +                "sway report %s: probe %r failed but carries no "
 +                "reference; skipping (use --strict to fail)",
 +                report_path,
 +                probe.get("name", "<unnamed>"),
 +            )
 +            continue
 +        if candidate.confidence < min_confidence:
 +            _LOG.info(
 +                "harvest: skipping %r (confidence=%.2f < %.2f)",
 +                candidate.probe_name,
 +                candidate.confidence,
 +                min_confidence,
 +            )
 +            continue
 +        candidates.append(candidate)
++
 +    return candidates
++
++
 +def _probe_to_candidate(
 +    probe: dict[str, Any],
 +    *,
 +    source_adapter_version: str | None,
 +) -> HarvestCandidate:
 +    """Build a `HarvestCandidate` from a single failing probe object.
++
 +    The Q/A pair comes from ``evidence.prompt`` and
 +    ``evidence.reference``; both are stripped of surrounding
 +    whitespace. ``evidence.confidence`` falls back to 1.0 when absent
 +    or not convertible to a float, and is clamped into 0-1.
++
 +    Raises :class:`NoReferenceError` when ``evidence`` is not an
 +    object, or when either field is missing, not a string, or blank —
 +    such a probe cannot become a supervised Q/A row.
 +    """
 +    label = str(probe.get("name") or "<unnamed>")
++
 +    # A missing / null / empty evidence value is treated as an empty object.
 +    details = probe.get("evidence") or {}
 +    if not isinstance(details, dict):
 +        raise NoReferenceError(f"probe {label!r}: evidence is not an object; cannot harvest")
++
 +    question = details.get("prompt")
 +    if not (isinstance(question, str) and question.strip()):
 +        raise NoReferenceError(f"probe {label!r}: evidence.prompt missing or non-string")
++
 +    answer = details.get("reference")
 +    if not (isinstance(answer, str) and answer.strip()):
 +        raise NoReferenceError(f"probe {label!r}: evidence.reference missing or non-string")
++
 +    try:
 +        weight = float(details.get("confidence", 1.0))
 +    except (TypeError, ValueError):
 +        weight = 1.0
 +    # Clamp into [0, 1].
 +    weight = max(0.0, min(1.0, weight))
++
 +    return HarvestCandidate(
 +        prompt=question.strip(),
 +        reference=answer.strip(),
 +        confidence=weight,
 +        probe_name=label,
 +        probe_kind=str(probe.get("kind") or ""),
 +        source_adapter_version=source_adapter_version,
 +    )

tests/unit/harvest/__init__.py — added (empty file)

tests/unit/harvest/test_sway_reader.py — added

 +"""Unit tests for `dlm.harvest.sway_reader` (Sprint 33.2)."""
++
 +from __future__ import annotations
++
 +import json
 +from pathlib import Path
++
 +import pytest
++
 +from dlm.harvest import (
 +    HarvestCandidate,
 +    MalformedSwayReportError,
 +    NoReferenceError,
 +    read_sway_report,
 +)
++
++
 +def _write(tmp_path: Path, payload: object) -> Path:
 +    """Dump `payload` as JSON to ``sway.json`` under `tmp_path`; return the path."""
 +    target = tmp_path.joinpath("sway.json")
 +    serialized = json.dumps(payload)
 +    target.write_text(serialized, encoding="utf-8")
 +    return target
++
++
 +# Failing probe whose evidence carries a prompt/reference pair plus a
 +# confidence — the only fixture the reader should turn into a candidate.
 +_PROBE_FAIL_WITH_REF = {
 +    "name": "fortran_subroutine_semantics",
 +    "kind": "section_internalization",
 +    "verdict": "fail",
 +    "score": 0.22,
 +    "raw": 0.22,
 +    "z_score": -1.7,
 +    "evidence": {
 +        "prompt": "What does SUBROUTINE DGEMM compute?",
 +        "reference": "A double-precision general matrix multiplication.",
 +        "confidence": 0.9,
 +    },
 +    "message": "adapter failed semantic recall",
 +    "duration_s": 0.4,
 +}
 +# Failing probe whose evidence has neither `prompt` nor `reference`.
 +_PROBE_FAIL_NO_REF = {
 +    "name": "docstring_recall",
 +    "kind": "prompt_collapse",
 +    "verdict": "fail",
 +    "score": 0.1,
 +    "evidence": {"per_section_scores": [0.1, 0.15]},
 +    "message": "probe has no Q/A pair to harvest",
 +    "duration_s": 0.2,
 +}
 +# Passing probe — filtered out by the verdict check.
 +_PROBE_PASS = {
 +    "name": "calibration",
 +    "kind": "calibration_drift",
 +    "verdict": "pass",
 +    "score": 0.95,
 +    "evidence": {"delta": 0.01},
 +    "message": "calibration healthy",
 +    "duration_s": 0.1,
 +}
++
++
 +def _full_report(probes: list[dict]) -> dict:
 +    """Wrap `probes` in a schema-version-1 report envelope with fixed metadata."""
 +    envelope: dict = dict(
 +        schema_version=1,
 +        sway_version="0.1.0.dev0",
 +        base_model_id="smollm2-135m",
 +        adapter_id="run_7",
 +        started_at="2026-04-21T00:00:00Z",
 +        finished_at="2026-04-21T00:05:00Z",
 +        wall_seconds=300.0,
 +    )
 +    envelope["probes"] = probes
 +    return envelope
++
++
 +# Well-formed reports: candidate extraction, verdict filtering, and the
 +# confidence / adapter-id defaults.
 +class TestHappyPath:
 +    # One failing + one passing probe yields exactly one fully populated candidate.
 +    def test_single_failing_probe_lifts_cleanly(self, tmp_path: Path) -> None:
 +        report = _write(tmp_path, _full_report([_PROBE_FAIL_WITH_REF, _PROBE_PASS]))
 +        candidates = read_sway_report(report)
++
 +        assert len(candidates) == 1
 +        c = candidates[0]
 +        assert isinstance(c, HarvestCandidate)
 +        assert c.prompt == "What does SUBROUTINE DGEMM compute?"
 +        assert c.reference == "A double-precision general matrix multiplication."
 +        assert c.confidence == pytest.approx(0.9)
 +        assert c.probe_name == "fortran_subroutine_semantics"
 +        assert c.probe_kind == "section_internalization"
 +        assert c.source_adapter_version == "run_7"
++
 +    # An empty `probes` array is valid and produces no candidates.
 +    def test_empty_probes_list_yields_empty(self, tmp_path: Path) -> None:
 +        report = _write(tmp_path, _full_report([]))
 +        assert read_sway_report(report) == []
++
 +    # Probes whose verdict is not "fail" are ignored.
 +    def test_all_passing_yields_empty(self, tmp_path: Path) -> None:
 +        report = _write(tmp_path, _full_report([_PROBE_PASS, _PROBE_PASS]))
 +        assert read_sway_report(report) == []
++
 +    # `adapter_id` is optional; without it the candidate carries None.
 +    def test_missing_adapter_id_leaves_source_version_none(self, tmp_path: Path) -> None:
 +        payload = _full_report([_PROBE_FAIL_WITH_REF])
 +        del payload["adapter_id"]
 +        report = _write(tmp_path, payload)
 +        candidates = read_sway_report(report)
 +        assert len(candidates) == 1
 +        assert candidates[0].source_adapter_version is None
++
 +    # A 0.5-confidence candidate is dropped at threshold 0.8 and kept at 0.4.
 +    def test_min_confidence_filters(self, tmp_path: Path) -> None:
 +        low_conf = {**_PROBE_FAIL_WITH_REF}
 +        low_conf["evidence"] = {
 +            "prompt": "q?",
 +            "reference": "a.",
 +            "confidence": 0.5,
 +        }
 +        report = _write(tmp_path, _full_report([low_conf]))
 +        assert read_sway_report(report, min_confidence=0.8) == []
 +        assert len(read_sway_report(report, min_confidence=0.4)) == 1
++
 +    # Evidence without `confidence` is treated as confidence 1.0.
 +    def test_missing_confidence_defaults_to_one(self, tmp_path: Path) -> None:
 +        no_conf = {**_PROBE_FAIL_WITH_REF}
 +        no_conf["evidence"] = {"prompt": "q?", "reference": "a."}
 +        report = _write(tmp_path, _full_report([no_conf]))
 +        candidates = read_sway_report(report)
 +        assert len(candidates) == 1
 +        assert candidates[0].confidence == 1.0
++
++
 +# Failing probes that lack a usable prompt/reference pair, in strict and
 +# non-strict mode.
 +class TestMissingReference:
 +    # Strict mode aborts the whole read on the first unharvestable failing probe.
 +    def test_strict_raises(self, tmp_path: Path) -> None:
 +        report = _write(tmp_path, _full_report([_PROBE_FAIL_NO_REF]))
 +        with pytest.raises(NoReferenceError):
 +            read_sway_report(report, strict=True)
++
 +    # Non-strict mode logs a warning, skips the bad probe, and keeps the good one.
 +    def test_lax_skips_with_log(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
 +        report = _write(
 +            tmp_path,
 +            _full_report([_PROBE_FAIL_NO_REF, _PROBE_FAIL_WITH_REF]),
 +        )
 +        with caplog.at_level("WARNING"):
 +            candidates = read_sway_report(report, strict=False)
 +        assert len(candidates) == 1
 +        assert candidates[0].probe_name == "fortran_subroutine_semantics"
 +        assert any("carries no reference" in rec.message for rec in caplog.records)
++
 +    # A whitespace-only reference counts as missing.
 +    def test_empty_reference_string_refused(self, tmp_path: Path) -> None:
 +        empty = {**_PROBE_FAIL_WITH_REF}
 +        empty["evidence"] = {"prompt": "q?", "reference": "   "}
 +        report = _write(tmp_path, _full_report([empty]))
 +        with pytest.raises(NoReferenceError):
 +            read_sway_report(report, strict=True)
++
++
 +# Unreadable or structurally invalid reports, plus malformed probe entries.
 +class TestMalformed:
 +    # A nonexistent path surfaces as MalformedSwayReportError, not a raw OSError.
 +    def test_file_missing(self, tmp_path: Path) -> None:
 +        with pytest.raises(MalformedSwayReportError, match="cannot read"):
 +            read_sway_report(tmp_path / "does-not-exist.json")
++
 +    # Text that is not JSON is rejected.
 +    def test_not_json(self, tmp_path: Path) -> None:
 +        bad = tmp_path / "bad.json"
 +        bad.write_text("this is not json {", encoding="utf-8")
 +        with pytest.raises(MalformedSwayReportError, match="not valid JSON"):
 +            read_sway_report(bad)
++
 +    # Valid JSON whose top level is an array rather than an object is rejected.
 +    def test_top_level_array_rejected(self, tmp_path: Path) -> None:
 +        report = _write(tmp_path, [])
 +        with pytest.raises(MalformedSwayReportError, match="must be a JSON object"):
 +            read_sway_report(report)
++
 +    # `schema_version` is required.
 +    def test_missing_schema_version(self, tmp_path: Path) -> None:
 +        report = _write(tmp_path, {"sway_version": "0.1", "probes": []})
 +        with pytest.raises(MalformedSwayReportError, match="schema_version"):
 +            read_sway_report(report)
++
 +    # A schema version above the supported one is refused.
 +    def test_newer_schema_refused(self, tmp_path: Path) -> None:
 +        payload = _full_report([])
 +        payload["schema_version"] = 99
 +        report = _write(tmp_path, payload)
 +        with pytest.raises(MalformedSwayReportError, match="newer than this reader"):
 +            read_sway_report(report)
++
 +    # `probes` is required.
 +    def test_missing_probes_array(self, tmp_path: Path) -> None:
 +        report = _write(tmp_path, {"schema_version": 1, "sway_version": "0.1"})
 +        with pytest.raises(MalformedSwayReportError, match="`probes` array"):
 +            read_sway_report(report)
++
 +    # Non-object evidence on a failing probe raises NoReferenceError (strict default).
 +    def test_probe_evidence_not_object(self, tmp_path: Path) -> None:
 +        broken = {**_PROBE_FAIL_WITH_REF}
 +        broken["evidence"] = "not-an-object"
 +        report = _write(tmp_path, _full_report([broken]))
 +        with pytest.raises(NoReferenceError, match="evidence is not an object"):
 +            read_sway_report(report)
++
 +    # A non-object probe entry is logged and skipped, even in strict mode.
 +    def test_probe_is_not_object(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
 +        report = _write(
 +            tmp_path,
 +            _full_report([_PROBE_FAIL_WITH_REF, "garbage"]),  # type: ignore[list-item]
 +        )
 +        with caplog.at_level("WARNING"):
 +            candidates = read_sway_report(report)
 +        assert len(candidates) == 1
 +        assert any("not an object" in rec.message for rec in caplog.records)