| | 1 | +"""Loader for dlm's ``training_state.pt`` (Sprint 25, gradient_ghost). |
| | 2 | + |
| | 3 | +The pre-run ``gradient_ghost`` probe diagnoses adapter convergence |
| | 4 | +without any model load: dlm writes the optimizer state snapshot at |
| | 5 | +end of training, and we can read its ``global_step`` + per-parameter |
| | 6 | +Adam ``exp_avg_sq`` magnitudes to answer "did this train long enough?" |
| | 7 | +in ~50 ms. |
| | 8 | + |
| | 9 | +This module is **dlm-aware via file convention only** — we never |
| | 10 | +``import dlm``. We just ``torch.load`` a path the user gave us. The |
| | 11 | +file format is dlm's contract; we follow it conservatively and fail |
| | 12 | +loudly when the shape drifts. |
| | 13 | + |
| | 14 | +## File-shape reference (verified against dlm 2026-04-20 stores) |
| | 15 | + |
| | 16 | +``` |
| | 17 | +training_state.pt: |
| | 18 | + global_step: int |
| | 19 | + epoch: float |
| | 20 | + best_val_loss: float |
| | 21 | + optimizer_state_dict: |
| | 22 | + state: dict[int param_id → {step, exp_avg, exp_avg_sq}] |
| | 23 | + param_groups: list[dict] # lr, betas, eps, params: list[int] |
| | 24 | + scheduler_state_dict: dict |
| | 25 | + scaler_state_dict: dict | None |
| | 26 | + *_rng_state: tensor / tuple / None |
| | 27 | + pinned_versions: dict |
| | 28 | + base_model_revision: str |
| | 29 | + dlm_manifest_hash: str | None |
| | 30 | + use_qlora: bool |
| | 31 | +``` |
| | 32 | + |
| | 33 | +Per-param ids are integer indices, NOT names — name attribution lives |
| | 34 | +in ``_param_id_mapping.py``. |
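
## Usage sketch

A minimal caller (the ``<id>`` path segment is a placeholder for a dlm
store entry; error handling elided):

```
from pathlib import Path

snap = load_training_state(
    Path("~/.dlm/store/<id>/adapter/versions/v0001").expanduser()
)
print(snap.global_step, snap.epoch, len(snap.per_param))
```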
| | 35 | +""" |

from __future__ import annotations

import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from dlm_sway.core.errors import MissingTrainingStateError, SwayError


class TrainingStateError(SwayError):
    """Raised when ``training_state.pt`` exists but can't be parsed."""


@dataclass(frozen=True, slots=True)
class ParamStat:
    """Optimizer state for a single trainable parameter (one int id).

    The probe reads ``exp_avg_sq_mean`` as a proxy for "how much
    gradient variance this parameter has seen recently." Adam's
    second-moment estimate is a moving average of squared gradients;
    a high value at end-of-training means the gradients haven't
    shrunk — typical of an undertrained parameter.
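
    A rough reading, with made-up numbers (illustrative, not dlm
    thresholds)::

        >>> s = ParamStat(param_id=3, step=1200, exp_avg_norm=0.8,
        ...               exp_avg_sq_mean=2e-3, numel=4096)
        >>> s.exp_avg_sq_mean > 1e-4  # second moment still large at end
        True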
    """

    param_id: int
    step: int
    exp_avg_norm: float
    exp_avg_sq_mean: float
    numel: int


@dataclass(frozen=True, slots=True)
class TrainingStateSnapshot:
    """Everything ``gradient_ghost`` needs from a ``training_state.pt``.

    Constructed by :func:`load_training_state`; consumed by
    ``GradientGhostProbe.run``. Frozen so a probe can pass it around
    without defensive copies.
    """

    global_step: int
    epoch: float
    best_val_loss: float
    per_param: tuple[ParamStat, ...]
    pinned_versions: dict[str, str]
    base_model_revision: str | None
    dlm_manifest_hash: str | None
    use_qlora: bool


def load_training_state(adapter_dir: Path) -> TrainingStateSnapshot:
    """Load + parse ``adapter_dir/training_state.pt``.

    Parameters
    ----------
    adapter_dir:
        A dlm adapter version directory (e.g.
        ``~/.dlm/store/<id>/adapter/versions/v0001/``). Must contain
        a ``training_state.pt`` file.

    Returns
    -------
    TrainingStateSnapshot
        Frozen snapshot of the fields the probe consumes.

    Raises
    ------
    MissingTrainingStateError
        ``training_state.pt`` doesn't exist under ``adapter_dir``. The
        caller should SKIP (this is a clean signal, not an error).
    TrainingStateError
        File exists but can't be parsed (torch import missing,
        unexpected dict shape, optimizer state missing). The caller
        should ERROR — something is structurally wrong.
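
    Examples
    --------
    A sketch of the SKIP-vs-ERROR split described above::

        try:
            snap = load_training_state(adapter_dir)
        except MissingTrainingStateError:
            ...  # SKIP: adapter never saved a training state
        except TrainingStateError:
            ...  # ERROR: file present but structurally wrong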
    """
    adapter_dir = Path(adapter_dir)
    state_path = adapter_dir / "training_state.pt"
    if not state_path.exists():
        raise MissingTrainingStateError(adapter_dir)

    # Lazy-import torch so non-gradient-ghost users don't pay the
    # ~700 ms torch-import cost on `import dlm_sway`.
    try:
        import torch
    except ImportError as exc:
        raise TrainingStateError(
            "torch not installed — gradient_ghost reads PyTorch-pickled "
            "training_state.pt files. Install with: pip install 'dlm-sway[hf]'"
        ) from exc

    # ``weights_only=False`` is required: dlm's training_state.pt
    # carries pickled RNG state (numpy / python random). Suppressing
    # the ``FutureWarning`` keeps the probe output clean — this is a
    # known-trusted artifact dlm produced, not arbitrary user input.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=FutureWarning)
        try:
            state = torch.load(str(state_path), map_location="cpu", weights_only=False)
        except Exception as exc:  # noqa: BLE001 — torch.load can raise many exception types
            raise TrainingStateError(
                f"failed to torch.load {state_path}: {type(exc).__name__}: {exc}"
            ) from exc

    if not isinstance(state, dict):
        raise TrainingStateError(f"{state_path}: expected dict, got {type(state).__name__}")
    opt = state.get("optimizer_state_dict")
    if not isinstance(opt, dict):
        raise TrainingStateError(
            f"{state_path}: missing 'optimizer_state_dict' (got {type(opt).__name__})"
        )
    per_param_state = opt.get("state")
    if not isinstance(per_param_state, dict):
        raise TrainingStateError(
            f"{state_path}: optimizer_state_dict.state is not a dict "
            f"(got {type(per_param_state).__name__})"
        )

    per_param: list[ParamStat] = []
    for pid, ps in per_param_state.items():
        if not isinstance(pid, int):
            raise TrainingStateError(
                f"{state_path}: optimizer_state_dict.state has non-int key "
                f"{pid!r} (type {type(pid).__name__})"
            )
        if not isinstance(ps, dict):
            continue  # Tolerate a malformed entry rather than fail the whole probe.
        step_v = _scalar_int(ps.get("step", 0))
        exp_avg = ps.get("exp_avg")
        exp_avg_sq = ps.get("exp_avg_sq")
        per_param.append(
            ParamStat(
                param_id=pid,
                step=step_v,
                exp_avg_norm=_tensor_norm(exp_avg),
                exp_avg_sq_mean=_tensor_mean(exp_avg_sq),
                numel=_tensor_numel(exp_avg_sq),
            )
        )
    per_param.sort(key=lambda s: s.param_id)

    return TrainingStateSnapshot(
        global_step=int(state.get("global_step", 0) or 0),
        epoch=float(state.get("epoch", 0.0) or 0.0),
        best_val_loss=float(state.get("best_val_loss", 0.0) or 0.0),
        per_param=tuple(per_param),
        pinned_versions=dict(state.get("pinned_versions") or {}),
        base_model_revision=state.get("base_model_revision"),
        dlm_manifest_hash=state.get("dlm_manifest_hash"),
        use_qlora=bool(state.get("use_qlora", False)),
    )


def _scalar_int(v: Any) -> int:
    """torch saves ``step`` as a 0-dim tensor; coerce safely."""
    if v is None:
        return 0
    item = getattr(v, "item", None)
    if callable(item):
        try:
            return int(item())
        except Exception:  # noqa: BLE001
            return 0
    try:
        return int(v)
    except Exception:  # noqa: BLE001
        return 0


def _tensor_norm(t: Any) -> float:
    """L2 norm of ``t`` as a float; 0.0 if ``t`` is absent or unreadable."""
    if t is None:
        return 0.0
    norm = getattr(t, "norm", None)
    if callable(norm):
        try:
            return float(norm().item())
        except Exception:  # noqa: BLE001
            return 0.0
    return 0.0


def _tensor_mean(t: Any) -> float:
    """Mean of ``t`` as a float; 0.0 if ``t`` is absent or unreadable."""
    if t is None:
        return 0.0
    mean = getattr(t, "mean", None)
    if callable(mean):
        try:
            return float(mean().item())
        except Exception:  # noqa: BLE001
            return 0.0
    return 0.0


def _tensor_numel(t: Any) -> int:
    """Element count of ``t``; 0 if ``t`` is absent or unreadable."""
    if t is None:
        return 0
    numel = getattr(t, "numel", None)
    if callable(numel):
        try:
            return int(numel())
        except Exception:  # noqa: BLE001
            return 0
    return 0
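

# Quick sanity check of the coercion helpers (illustrative; assumes torch
# is importable, and is not part of the probe's runtime path):
#
#     >>> import torch
#     >>> _scalar_int(torch.tensor(7))
#     7
#     >>> t = torch.ones(4)
#     >>> (_tensor_norm(t), _tensor_mean(t), _tensor_numel(t))
#     (2.0, 1.0, 4)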