@@ -0,0 +1,157 @@
| 1 | +"""InferencePlan resolver — audit F05 cross-hardware coverage.""" |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
+import dataclasses
+import json
+from pathlib import Path
+from types import SimpleNamespace
+
+import pytest
+
+from dlm.hardware.backend import Backend
+from dlm.inference.plan import InferencePlan, resolve_inference
+
+
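+# Duck-typed stand-in for the hardware capabilities object passed to
+# resolve_inference; only the fields the resolver reads are populated.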
+def _caps(
+    *,
+    backend: Backend,
+    supports_bf16: bool = False,
+    has_bitsandbytes: bool = False,
+    has_flash_attention: bool = False,
+) -> object:
+    return SimpleNamespace(
+        backend=backend,
+        supports_bf16=supports_bf16,
+        has_bitsandbytes=has_bitsandbytes,
+        has_flash_attention=has_flash_attention,
+    )
+
+
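+# A non-null "bitsandbytes" pin marks the adapter as a 4-bit QLoRA checkpoint;
+# bnb=None means a plain (non-quantized) LoRA adapter.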
+def _write_pinned(adapter_dir: Path, *, bnb: str | None) -> None:
+    adapter_dir.mkdir(parents=True, exist_ok=True)
+    (adapter_dir / "pinned_versions.json").write_text(
+        json.dumps({"torch": "2.4.0", "bitsandbytes": bnb})
+    )
+
+
+class TestQLoRAOnCUDAWithBnb:
+    def test_loads_4bit_native(self, tmp_path: Path) -> None:
+        _write_pinned(tmp_path, bnb="0.43.1")
+        plan = resolve_inference(
+            tmp_path, _caps(backend=Backend.CUDA, supports_bf16=True, has_bitsandbytes=True)
+        )
+        assert plan.backend == Backend.CUDA
+        assert plan.precision == "bf16"
+        assert plan.dequantize_on_load is False
+        assert "4-bit" in plan.reason
+
+
+class TestQLoRAOnCUDAWithoutBnb:
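+    # QLoRA checkpoint on CUDA but bitsandbytes is not installed: the resolver
+    # should dequantize on load rather than fail.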
+    def test_dequantizes(self, tmp_path: Path) -> None:
+        _write_pinned(tmp_path, bnb="0.43.1")
+        plan = resolve_inference(tmp_path, _caps(backend=Backend.CUDA, has_bitsandbytes=False))
+        assert plan.dequantize_on_load is True
+        assert plan.precision == "fp16"
+        assert "bitsandbytes not installed" in plan.reason
+
+
+class TestQLoRAOnMPS:
+    """Audit F05 canonical case — CUDA-trained QLoRA resumed on Apple Silicon."""
+
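+    # bitsandbytes ships no MPS kernels, so the 4-bit base weights cannot be
+    # loaded natively and are dequantized to fp16 instead.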
+    def test_dequantizes_to_fp16(self, tmp_path: Path) -> None:
+        _write_pinned(tmp_path, bnb="0.43.1")
+        plan = resolve_inference(tmp_path, _caps(backend=Backend.MPS))
+        assert plan.backend == Backend.MPS
+        assert plan.precision == "fp16"
+        assert plan.dequantize_on_load is True
+        assert plan.attn_implementation == "sdpa"
+        assert "F05" in plan.reason
+
+
+class TestLoRANonCUDA:
+    def test_mps_plain_lora(self, tmp_path: Path) -> None:
+        _write_pinned(tmp_path, bnb=None)
+        plan = resolve_inference(tmp_path, _caps(backend=Backend.MPS))
+        assert plan.precision == "fp16"
+        assert plan.dequantize_on_load is False
+
+    def test_cpu_plain_lora(self, tmp_path: Path) -> None:
+        _write_pinned(tmp_path, bnb=None)
+        plan = resolve_inference(tmp_path, _caps(backend=Backend.CPU))
+        assert plan.dequantize_on_load is False
+
+
+class TestLoRAOnCUDA:
+    def test_bf16_when_supported(self, tmp_path: Path) -> None:
+        _write_pinned(tmp_path, bnb=None)
+        plan = resolve_inference(tmp_path, _caps(backend=Backend.CUDA, supports_bf16=True))
+        assert plan.precision == "bf16"
+        assert plan.dequantize_on_load is False
+
+    def test_fp16_when_bf16_unsupported(self, tmp_path: Path) -> None:
+        _write_pinned(tmp_path, bnb=None)
+        plan = resolve_inference(tmp_path, _caps(backend=Backend.CUDA, supports_bf16=False))
+        assert plan.precision == "fp16"
+
+
+class TestAttnImplPick:
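+    # flash_attention_2 is picked only when the capability flag reports it;
+    # sdpa is the portable default.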
+    def test_flash_attn_when_available(self, tmp_path: Path) -> None:
+        _write_pinned(tmp_path, bnb="0.43.1")
+        plan = resolve_inference(
+            tmp_path,
+            _caps(
+                backend=Backend.CUDA,
+                supports_bf16=True,
+                has_bitsandbytes=True,
+                has_flash_attention=True,
+            ),
+        )
+        assert plan.attn_implementation == "flash_attention_2"
+
+    def test_sdpa_default(self, tmp_path: Path) -> None:
+        _write_pinned(tmp_path, bnb=None)
+        plan = resolve_inference(tmp_path, _caps(backend=Backend.CUDA))
+        assert plan.attn_implementation == "sdpa"
+
+
+class TestMissingPinnedFile:
+    def test_no_pinned_versions_treated_as_lora(self, tmp_path: Path) -> None:
+        """Missing `pinned_versions.json` is conservative: assume LoRA."""
+        plan = resolve_inference(tmp_path, _caps(backend=Backend.MPS))
+        assert plan.dequantize_on_load is False
+
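+    # Corrupt metadata should fall back to the conservative LoRA path, not raise.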
+    def test_malformed_pinned_file_treated_as_lora(self, tmp_path: Path) -> None:
+        (tmp_path / "pinned_versions.json").write_text("not json {{{")
+        plan = resolve_inference(tmp_path, _caps(backend=Backend.MPS))
+        assert plan.dequantize_on_load is False
+
+
+class TestPlanSerialization:
+    def test_to_dict_is_json_friendly(self, tmp_path: Path) -> None:
+        _write_pinned(tmp_path, bnb=None)
+        plan = resolve_inference(tmp_path, _caps(backend=Backend.MPS))
+        data = plan.to_dict()
+        # Round-trip via json to prove serializability.
+        encoded = json.dumps(data)
+        decoded = json.loads(encoded)
+        assert decoded["backend"] == "mps"
+        assert decoded["precision"] == "fp16"
+
+    def test_plan_is_frozen(self, tmp_path: Path) -> None:
+        _write_pinned(tmp_path, bnb=None)
+        plan = resolve_inference(tmp_path, _caps(backend=Backend.MPS))
+        assert isinstance(plan, InferencePlan)
+        with pytest.raises(dataclasses.FrozenInstanceError):
+            plan.precision = "bf16"  # type: ignore[misc]