tenseleyflow/documentlanguagemodel / c078408

Browse files

Warn on degraded eval and lock fallbacks

Authored by espadonne
SHA
c078408783de7e56948078b619ead865aa21ef6e
Parents
4acddd6
Tree
b76de7a

8 changed files

Status | File | + | -
M src/dlm/eval/mode_split.py 20 5
M src/dlm/eval/probes.py 48 22
M src/dlm/export/runner.py 9 7
M src/dlm/train/trainer.py 7 1
M tests/unit/eval/test_mode_split.py 29 8
M tests/unit/eval/test_probes.py 12 0
M tests/unit/export/test_runner.py 25 0
M tests/unit/train/test_lock_wiring.py 27 0
src/dlm/eval/mode_split.py — modified
@@ -14,8 +14,11 @@ tests drive the grouping logic with a mock trainer.
1414
 
1515
 from __future__ import annotations
1616
 
17
+import logging
1718
 from typing import Any
1819
 
20
+_LOG = logging.getLogger(__name__)
21
+
1922
 
2023
 def compute_val_loss_by_mode(trainer: Any, val_ds: Any) -> tuple[float | None, float | None]:
2124
     """Return `(val_loss_cpt, val_loss_sft)` from a post-train eval pass.
@@ -53,22 +56,34 @@ def compute_val_loss_by_mode(trainer: Any, val_ds: Any) -> tuple[float | None, f
5356
         elif mode == "sft":
5457
             sft_idx.append(i)
5558
 
56
-    cpt_loss = _safe_eval_loss(trainer, val_ds, cpt_idx)
57
-    sft_loss = _safe_eval_loss(trainer, val_ds, sft_idx)
59
+    cpt_loss = _safe_eval_loss(trainer, val_ds, cpt_idx, mode="cpt")
60
+    sft_loss = _safe_eval_loss(trainer, val_ds, sft_idx, mode="sft")
5861
     return (cpt_loss, sft_loss)
5962
 
6063
 
61
-def _safe_eval_loss(trainer: Any, val_ds: Any, indices: list[int]) -> float | None:
64
+def _safe_eval_loss(trainer: Any, val_ds: Any, indices: list[int], *, mode: str) -> float | None:
6265
     """Run `trainer.evaluate(eval_dataset=subset)`; return eval_loss or None."""
6366
     if not indices:
6467
         return None
6568
     try:
6669
         subset = val_ds.select(indices)
67
-    except Exception:
70
+    except (AttributeError, IndexError, TypeError, ValueError) as exc:
71
+        _LOG.warning(
72
+            "val-loss split skipped %s subset selection (%d rows): %s",
73
+            mode,
74
+            len(indices),
75
+            exc,
76
+        )
6877
         return None
6978
     try:
7079
         metrics = trainer.evaluate(eval_dataset=subset)
71
-    except Exception:
80
+    except (RuntimeError, TypeError, ValueError) as exc:
81
+        _LOG.warning(
82
+            "val-loss split skipped %s evaluation (%d rows): %s",
83
+            mode,
84
+            len(indices),
85
+            exc,
86
+        )
7287
         return None
7388
     loss = metrics.get("eval_loss") if isinstance(metrics, dict) else None
7489
     if loss is None:
src/dlm/eval/probes.py — modified
@@ -21,13 +21,16 @@ the output diff between runs is meaningful.
2121
 from __future__ import annotations
2222
 
2323
 import hashlib
24
+import logging
2425
 from dataclasses import dataclass
2526
 
26
-from dlm.data.instruction_parser import parse_instruction_body
27
+from dlm.data.errors import InstructionParseError
28
+from dlm.data.instruction_parser import QAPair, parse_instruction_body
2729
 from dlm.doc.sections import Section, SectionType
2830
 
2931
 _PROBE_MARKER = "!probe"
3032
 _PROBE_HEADER = f"### Q {_PROBE_MARKER}"
33
+_LOG = logging.getLogger(__name__)
3134
 
3235
 
3336
 @dataclass(frozen=True)
@@ -47,20 +50,31 @@ def extract_probes(sections: list[Section], *, k: int = 3, seed: int = 0) -> lis
4750
     is filled from INSTRUCTION section Q/A pairs via a deterministic
4851
     sample.
4952
     """
50
-    explicit = list(_extract_explicit_probes(sections))
53
+    parsed_pairs = _parse_instruction_sections(sections)
54
+    explicit = list(_extract_explicit_probes(sections, parsed_pairs=parsed_pairs))
5155
     if len(explicit) >= k:
5256
         return explicit[:k]
5357
 
5458
     needed = k - len(explicit)
5559
     seen_prompts = {p.prompt for p in explicit}
56
-    auto = _auto_sample_probes(sections, k=needed, seed=seed, exclude=seen_prompts)
60
+    auto = _auto_sample_probes(
61
+        sections,
62
+        k=needed,
63
+        seed=seed,
64
+        exclude=seen_prompts,
65
+        parsed_pairs=parsed_pairs,
66
+    )
5767
     return [*explicit, *auto]
5868
 
5969
 
6070
 # --- internals ---------------------------------------------------------------
6171
 
6272
 
63
-def _extract_explicit_probes(sections: list[Section]) -> list[Probe]:
73
+def _extract_explicit_probes(
74
+    sections: list[Section],
75
+    *,
76
+    parsed_pairs: dict[str, list[QAPair]],
77
+) -> list[Probe]:
6478
     """Find INSTRUCTION Q/A pairs whose question starts with `!probe`.
6579
 
6680
     The `!probe` marker appears on the Q header line; the Q body is the
@@ -72,16 +86,7 @@ def _extract_explicit_probes(sections: list[Section]) -> list[Probe]:
7286
     for section in sections:
7387
         if section.type is not SectionType.INSTRUCTION:
7488
             continue
75
-        try:
76
-            pairs = parse_instruction_body(
77
-                _normalize_probe_markers(section.content),
78
-                section_id=section.section_id,
79
-            )
80
-        except Exception:
81
-            # Malformed instruction bodies are the instruction-parser's
82
-            # problem; probe extraction is best-effort and must not hide
83
-            # grammar errors by raising here.
84
-            continue
89
+        pairs = parsed_pairs.get(section.section_id, [])
8590
         for pair in pairs:
8691
             # After normalization every probe pair sits in a private
8792
             # namespace; we flag them via a sentinel prefix in the body.
@@ -127,7 +132,12 @@ def _normalize_probe_markers(body: str) -> str:
127132
 
128133
 
129134
 def _auto_sample_probes(
130
-    sections: list[Section], *, k: int, seed: int, exclude: set[str]
135
+    sections: list[Section],
136
+    *,
137
+    k: int,
138
+    seed: int,
139
+    exclude: set[str],
140
+    parsed_pairs: dict[str, list[QAPair]],
131141
 ) -> list[Probe]:
132142
     """Deterministically pick `k` questions from INSTRUCTION sections.
133143
 
@@ -148,13 +158,7 @@ def _auto_sample_probes(
148158
     for section in sections:
149159
         if section.type is not SectionType.INSTRUCTION:
150160
             continue
151
-        try:
152
-            pairs = parse_instruction_body(
153
-                _normalize_probe_markers(section.content),
154
-                section_id=section.section_id,
155
-            )
156
-        except Exception:
157
-            continue
161
+        pairs = parsed_pairs.get(section.section_id, [])
158162
         for pair in pairs:
159163
             # Skip explicit probes (their question body was prefixed
160164
             # with `!probe:` by the normalizer) — the caller handles
@@ -182,3 +186,25 @@ def _auto_sample_probes(
182186
 def _probe_sort_key(prompt: str, seed: int) -> str:
183187
     h = hashlib.sha256(f"{seed}\x00{prompt}".encode())
184188
     return h.hexdigest()
189
+
190
+
191
+def _parse_instruction_sections(sections: list[Section]) -> dict[str, list[QAPair]]:
192
+    """Parse instruction sections once so malformed blocks warn once."""
193
+    parsed: dict[str, list[QAPair]] = {}
194
+    for section in sections:
195
+        if section.type is not SectionType.INSTRUCTION:
196
+            continue
197
+        try:
198
+            parsed[section.section_id] = parse_instruction_body(
199
+                _normalize_probe_markers(section.content),
200
+                section_id=section.section_id,
201
+            )
202
+        except InstructionParseError as exc:
203
+            _LOG.warning(
204
+                "probe extraction skipped malformed instruction section %s at line %d: %s",
205
+                exc.section_id,
206
+                exc.section_line,
207
+                exc,
208
+            )
209
+            parsed[section.section_id] = []
210
+    return parsed
src/dlm/export/runner.py — modified
@@ -23,6 +23,7 @@ from pathlib import Path
2323
 from typing import TYPE_CHECKING, Any
2424
 
2525
 from dlm.export import adapter_gguf, base_gguf, merge, preflight
26
+from dlm.export.errors import ExportManifestError
2627
 from dlm.export.manifest import (
2728
     EXPORT_MANIFEST_FILENAME,
2829
     ExportManifest,
@@ -483,14 +484,15 @@ def _cached_base_matches(export_dir: Path, base_gguf_path: Path, quant: str) ->
483484
         from dlm.export.manifest import compute_sha256, load_export_manifest
484485
 
485486
         prior = load_export_manifest(export_dir)
486
-    except Exception:
487
+        if prior.quant != quant:
488
+            return False
489
+        recorded = next((a for a in prior.artifacts if a.path == base_gguf_path.name), None)
490
+        if recorded is None:
491
+            return False
492
+        return compute_sha256(base_gguf_path) == recorded.sha256
493
+    except (ExportManifestError, OSError) as exc:
494
+        _LOG.warning("export cache ignored stale manifest under %s: %s", export_dir, exc)
487495
         return False
488
-    if prior.quant != quant:
489
-        return False
490
-    recorded = next((a for a in prior.artifacts if a.path == base_gguf_path.name), None)
491
-    if recorded is None:
492
-        return False
493
-    return compute_sha256(base_gguf_path) == recorded.sha256
494496
 
495497
 
496498
 def _perform_merge_path(  # pragma: no cover
src/dlm/train/trainer.py — modified
@@ -42,6 +42,7 @@ from dlm.lock import (
4242
     DlmLock,
4343
     LockDecision,
4444
     LockMode,
45
+    LockSchemaError,
4546
     LockValidationError,
4647
     build_lock,
4748
     hardware_tier_from_backend,
@@ -1565,7 +1566,7 @@ def _validate_or_abort_lock(
15651566
     )
15661567
     try:
15671568
         prior = load_lock(store.root)
1568
-    except Exception:
1569
+    except LockSchemaError as exc:
15691570
         # Audit-05 N5: a corrupt `dlm.lock` on disk would normally kill
15701571
         # the run at load time. Under `--update-lock` the operator has
15711572
         # explicitly opted to overwrite the file; treat the parse
@@ -1575,6 +1576,11 @@ def _validate_or_abort_lock(
15751576
         # "don't touch the file").
15761577
         if lock_mode != "update":
15771578
             raise
1579
+        _LOG.warning(
1580
+            "update-lock: ignoring unreadable prior dlm.lock at %s: %s",
1581
+            store.root,
1582
+            exc,
1583
+        )
15781584
         prior = None
15791585
     decision = validate_lock(prior, candidate, mode=lock_mode)
15801586
 
tests/unit/eval/test_mode_split.py — modified
@@ -2,10 +2,12 @@
22
 
33
 from __future__ import annotations
44
 
5
-from types import SimpleNamespace
5
+import logging
66
 from typing import Any
77
 from unittest.mock import MagicMock
88
 
9
+import pytest
10
+
911
 from dlm.eval.mode_split import compute_val_loss_by_mode
1012
 
1113
 
@@ -123,15 +125,21 @@ class TestModeClassification:
123125
 
124126
 
125127
 class TestEvalFailures:
126
-    def test_evaluate_exception_yields_none(self) -> None:
128
+    def test_evaluate_exception_yields_none(
129
+        self,
130
+        caplog: pytest.LogCaptureFixture,
131
+    ) -> None:
127132
         """A stack-version skew that makes evaluate() raise shouldn't
128133
         crash training — the affected mode just stays None."""
134
+        caplog.set_level(logging.WARNING, logger="dlm.eval.mode_split")
129135
         trainer = MagicMock()
130136
         trainer.evaluate.side_effect = RuntimeError("TRL drift")
131137
         val = _FakeDataset([{"text": "a"}, {"messages": []}])
132138
         cpt, sft = compute_val_loss_by_mode(trainer, val)
133139
         assert cpt is None
134140
         assert sft is None
141
+        assert "val-loss split skipped cpt evaluation" in caplog.text
142
+        assert "val-loss split skipped sft evaluation" in caplog.text
135143
 
136144
     def test_missing_eval_loss_key_yields_none(self) -> None:
137145
         trainer = MagicMock()
@@ -141,15 +149,28 @@ class TestEvalFailures:
141149
         assert cpt is None
142150
         assert sft is None
143151
 
144
-    def test_select_failure_yields_none(self) -> None:
152
+    def test_select_failure_yields_none(
153
+        self,
154
+        caplog: pytest.LogCaptureFixture,
155
+    ) -> None:
156
+        caplog.set_level(logging.WARNING, logger="dlm.eval.mode_split")
145157
         trainer = MagicMock()
146158
         trainer.evaluate.return_value = {"eval_loss": 0.0}
147
-        # A dataset without a .select method — the helper should swallow.
148
-        bad_val = SimpleNamespace(
149
-            __len__=lambda: 1,
150
-            __iter__=lambda: iter([{"text": "a"}]),
151
-        )
159
+        # Dataset iteration works, but subset selection does not.
160
+        bad_val = _NoSelectDataset([{"text": "a"}])
152161
         cpt, sft = compute_val_loss_by_mode(trainer, bad_val)
153162
         # Both None — the helper couldn't build subsets.
154163
         assert cpt is None
155164
         assert sft is None
165
+        assert "val-loss split skipped cpt subset selection" in caplog.text
166
+
167
+
168
+class _NoSelectDataset:
169
+    def __init__(self, rows: list[dict[str, Any]]) -> None:
170
+        self._rows = rows
171
+
172
+    def __len__(self) -> int:
173
+        return len(self._rows)
174
+
175
+    def __iter__(self):  # type: ignore[no-untyped-def]
176
+        return iter(self._rows)
tests/unit/eval/test_probes.py — modified
@@ -3,6 +3,7 @@
33
 from __future__ import annotations
44
 
55
 import dataclasses
6
+import logging
67
 
78
 import pytest
89
 
@@ -73,6 +74,17 @@ class TestAutoSample:
7374
         s = Section(type=SectionType.INSTRUCTION, content=body)
7475
         assert extract_probes([s], k=0) == []
7576
 
77
+    def test_malformed_instruction_logs_warning_once(
78
+        self,
79
+        caplog: pytest.LogCaptureFixture,
80
+    ) -> None:
81
+        body = "### Q\nunterminated question"
82
+        s = Section(type=SectionType.INSTRUCTION, content=body)
83
+        caplog.set_level(logging.WARNING, logger="dlm.eval.probes")
84
+        assert extract_probes([s], k=3) == []
85
+        assert "probe extraction skipped malformed instruction section" in caplog.text
86
+        assert len(caplog.records) == 1
87
+
7688
 
7789
 class TestProbeDataclass:
7890
     def test_probe_is_frozen(self) -> None:
tests/unit/export/test_runner.py — modified
@@ -3,6 +3,7 @@
33
 from __future__ import annotations
44
 
55
 import json
6
+import logging
67
 from pathlib import Path
78
 from typing import Any
89
 
@@ -184,6 +185,30 @@ class TestCaching:
184185
         assert len(recorder2.commands) == 1
185186
         assert any("convert_lora_to_gguf.py" in str(a) for a in recorder2.commands[0])
186187
 
188
+    def test_bad_cached_manifest_logs_warning_and_rebuilds(
189
+        self,
190
+        tmp_path: Path,
191
+        monkeypatch: pytest.MonkeyPatch,
192
+        caplog: pytest.LogCaptureFixture,
193
+    ) -> None:
194
+        from dlm.export.errors import ExportManifestError
195
+        from dlm.export.runner import _cached_base_matches
196
+
197
+        export_dir = tmp_path / "exports" / "Q4_K_M"
198
+        export_dir.mkdir(parents=True)
199
+        base_gguf = export_dir / "base.Q4_K_M.gguf"
200
+        base_gguf.write_bytes(b"cached bytes")
201
+        (export_dir / "export_manifest.json").write_text("{}", encoding="utf-8")
202
+
203
+        def _raise(_export_dir: Path) -> object:
204
+            raise ExportManifestError("bad manifest")
205
+
206
+        monkeypatch.setattr("dlm.export.manifest.load_export_manifest", _raise)
207
+        caplog.set_level(logging.WARNING, logger="dlm.export.runner")
208
+
209
+        assert _cached_base_matches(export_dir, base_gguf, "Q4_K_M") is False
210
+        assert "export cache ignored stale manifest" in caplog.text
211
+
187212
 
188213
 class TestMergeGate:
189214
     def test_qlora_merge_without_dequantize_raises(self, tmp_path: Path) -> None:
tests/unit/train/test_lock_wiring.py — modified
@@ -2,6 +2,7 @@
22
 
33
 from __future__ import annotations
44
 
5
+import logging
56
 from pathlib import Path
67
 from types import SimpleNamespace
78
 from typing import Any
@@ -180,3 +181,29 @@ class TestUpdateModeOverrides:
180181
         updated = load_lock(store.root)
181182
         assert updated is not None
182183
         assert updated.base_model_revision == spec.revision
184
+
185
+    def test_update_mode_warns_and_recovers_from_broken_lock(
186
+        self,
187
+        tmp_path: Path,
188
+        caplog: pytest.LogCaptureFixture,
189
+    ) -> None:
190
+        store = _bootstrap_store(tmp_path)
191
+        parsed = _parsed(tmp_path)
192
+        spec = BASE_MODELS["smollm2-135m"]
193
+        store_lock = store.root / "dlm.lock"
194
+        store_lock.write_text("{not json", encoding="utf-8")
195
+
196
+        caplog.set_level(logging.WARNING, logger="dlm.train.trainer")
197
+        run(
198
+            store,
199
+            parsed,
200
+            spec,
201
+            _plan(),
202
+            trainer_factory=_mock_trainer_factory,
203
+            lock_mode="update",
204
+        )
205
+
206
+        updated = load_lock(store.root)
207
+        assert updated is not None
208
+        assert updated.base_model_revision == spec.revision
209
+        assert "update-lock: ignoring unreadable prior dlm.lock" in caplog.text