tenseleyflow/sway / 3c0ffd9

Browse files

probes/training_drift: parse dlm per-step JSONLs, score smoothness + spikes

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
3c0ffd9732247a18b90d96b0cef99a8ceef6c60d
Parents
777d726
Tree
4366a76

1 changed file

StatusFile+-
A src/dlm_sway/probes/training_drift.py 486 0
src/dlm_sway/probes/training_drift.pyadded
@@ -0,0 +1,486 @@
1
+"""Training-drift — pre-run probe that reads dlm's per-step loss curve.
2
+
3
+Sister probe to :mod:`gradient_ghost`. Where ``gradient_ghost`` reads
4
+the optimizer state at the end of training, ``training_drift`` reads
5
+the loss curve *during* training: the rich signal about *how* the
6
+adapter trained, not just what it produced.
7
+
8
+Four metrics extracted from the curve:
9
+
10
+- ``final_loss`` — last recorded step's loss.
11
+- ``convergence_ratio`` — ``final_loss / initial_loss``. Lower is
12
+  better; a healthy adapter cuts loss by half or more.
13
+- ``smoothness`` — ``1 - (var(Δloss) / var(loss))``. Values near 1
14
+  mean the curve descends smoothly; near 0 means each step's
15
+  change-in-loss is comparable to the loss range itself (spiky).
16
+- ``instability_events`` — count of loss-*increase* steps whose
+  ``Δloss`` exceeds ``spike_sigma`` × the rolling median absolute
+  delta; spikes that survive that robust baseline are real — they
+  correlate with silent adapter degradation.
19
+
20
+Verdict: PASS when all three of (smoothness ≥ 0.7, instability_events
21
+== 0, convergence_ratio ≤ 0.7); else WARN. The thresholds are
22
+hand-tuned defaults; spec fields override them.
23
+
24
+## Why no null calibration
25
+
26
+Mirrors ``prompt_collapse`` and ``multi_turn_coherence_decay``: a null
27
+adapter doesn't *train* — it has no loss curve. The null distribution
28
+of "smoothness on a noise adapter" is undefined. Fixed-threshold
29
+verdicts are the published path; users override per-spec.
30
+
31
+## Log-format note
32
+
33
+dlm writes one JSONL per run at
34
+``<store_path>/logs/train-NNNNNN-YYYYMMDDTHHMMSS.jsonl``. Each line is
35
+``{"type": "<banner|step|run_complete|...>", ...}``. This probe
36
+filters for ``type == "step"`` records and reads ``step`` + ``loss``.
37
+The sibling ``*.summary.json`` has run aggregates; we don't consume
38
+it here — the curve is richer.
39
+"""
40
+
41
+from __future__ import annotations
42
+
43
+import json
44
+import math
45
+from pathlib import Path
46
+from typing import ClassVar, Literal
47
+
48
+import numpy as np
49
+from pydantic import Field
50
+
51
+from dlm_sway.core.errors import SwayError
52
+from dlm_sway.core.result import ProbeResult, Verdict, safe_finalize
53
+from dlm_sway.probes.base import Probe, ProbeSpec, RunContext
54
+
55
+
56
class TrainingDriftError(SwayError):
    """Structural failure while parsing a training log file.

    Deliberately distinct from the absence cases: a missing logs
    directory (no .dlm context, or an adapter that never trained) is
    a quiet SKIP, whereas a JSONL that exists but holds corrupted
    lines or the wrong record shape is an ERROR — the user *has*
    training data we cannot read, and that deserves a loud failure.
    """
65
+
66
+
67
class TrainingDriftSpec(ProbeSpec):
    """Spec for ``kind: training_drift``."""

    kind: Literal["training_drift"] = "training_drift"
    store_path: str | None = None
    """Root of a dlm store (the directory holding ``logs/`` and
    ``adapter/``). ``None`` makes the probe SKIP — the .dlm autogen
    path normally fills this in, since it learns the store from the
    dlm_id resolution."""
    min_steps: int = Field(default=10, ge=2)
    """SKIP curves shorter than this many steps. Tiny runs (smoke
    tests, 3-step early-stops) produce undefined fits; reporting
    PASS/FAIL on them would mislead."""
    rolling_window: int = Field(default=10, ge=2)
    """Window for the rolling statistic used in spike detection. A
    larger window tightens spike detection but can miss rapid
    oscillations; 10 balances both for typical 100–1000 step runs."""
    spike_sigma: float = Field(default=3.0, gt=0.0)
    """How many multiples of the rolling noise scale ``|Δloss|`` must
    exceed to count as an instability event. 3σ is the conventional
    outlier boundary; lower it for sensitivity, raise it for more
    noise tolerance."""
    assert_smoothness_gte: float = Field(default=0.7, ge=0.0, le=1.0)
    """Minimum smoothness for PASS."""
    assert_convergence_ratio_lte: float = Field(default=0.7, gt=0.0)
    """Maximum ``final_loss / initial_loss`` for PASS. Well-trained
    runs typically halve loss; the permissive default tolerates
    noisier data."""
    assert_instability_events_lte: int = Field(default=0, ge=0)
    """Maximum allowed spike count. Default 0 — any spike → WARN."""
98
+
99
+
100
class TrainingDriftProbe(Probe):
    """The "did this adapter train smoothly?" pre-run probe."""

    kind = "training_drift"
    spec_cls = TrainingDriftSpec
    category = "calibration"
    needs_backend: ClassVar[bool] = False
    # Pre-run posture, mirroring gradient_ghost: no model load and a
    # sub-100ms runtime, so ``sway check`` can fire this first, before
    # any heavy probe spins up.

    def run(self, spec: ProbeSpec, ctx: RunContext) -> ProbeResult:
        del ctx  # No backend / sections / null_stats consumed.
        assert isinstance(spec, TrainingDriftSpec)

        def skip(message: str, evidence: dict | None = None) -> ProbeResult:
            # One shape for every early-exit SKIP below; evidence is
            # attached only when a caller supplies it.
            kwargs: dict = {
                "name": spec.name,
                "kind": spec.kind,
                "verdict": Verdict.SKIP,
                "score": None,
                "message": message,
            }
            if evidence is not None:
                kwargs["evidence"] = evidence
            return ProbeResult(**kwargs)

        if spec.store_path is None:
            return skip("no store_path provided (no .dlm context)")

        store_path = Path(spec.store_path).expanduser().resolve()
        logs_dir = store_path / "logs"
        if not logs_dir.is_dir():
            return skip(f"no logs/ directory under {store_path}")

        log_paths = sorted(logs_dir.glob("train-*.jsonl"))
        if not log_paths:
            return skip(f"no train-*.jsonl files under {logs_dir}")

        # Steps are concatenated across every run in chronological
        # order (the sorted glob orders files by timestamp suffix);
        # resumed runs that repeat a step number are deduped by
        # keeping the most recent run's value for that step.
        try:
            steps_by_idx = _collect_steps(log_paths)
        except TrainingDriftError as exc:
            return ProbeResult(
                name=spec.name,
                kind=spec.kind,
                verdict=Verdict.ERROR,
                score=None,
                message=str(exc),
                evidence={"log_paths": [str(p) for p in log_paths]},
            )

        if len(steps_by_idx) < spec.min_steps:
            return skip(
                f"only {len(steps_by_idx)} step records "
                f"(< min_steps={spec.min_steps}); "
                f"curve too short to fit reliably",
                evidence={"num_steps": len(steps_by_idx)},
            )

        # (step, loss) pairs in step order.
        pairs = sorted(steps_by_idx.items())
        steps = np.asarray([step for step, _ in pairs], dtype=np.int64)
        losses = np.asarray([value for _, value in pairs], dtype=np.float64)

        # The metric helpers degrade gracefully (all-NaN losses,
        # constant curves, ...) rather than raise; any degenerate
        # shape surfaces through the verdict mapping below.
        metrics = _compute_metrics(
            losses, rolling_window=spec.rolling_window, spike_sigma=spec.spike_sigma
        )
        verdict, score, message = _verdict_from_metrics(metrics, spec)

        # Keep the evidence payload bounded — a 10k-step run must not
        # balloon the JSON report, so the shipped curve is uniformly
        # downsampled to the cap.
        curve = _downsampled_curve(steps, losses, cap=512)

        return safe_finalize(
            name=spec.name,
            kind=spec.kind,
            verdict=verdict,
            score=score,
            raw=metrics["smoothness"],
            evidence={
                "final_loss": metrics["final_loss"],
                "initial_loss": metrics["initial_loss"],
                "convergence_ratio": metrics["convergence_ratio"],
                "smoothness": metrics["smoothness"],
                "instability_events": metrics["instability_events"],
                "num_steps": len(losses),
                "num_log_files": len(log_paths),
                "curve_sampled": curve,
                "thresholds": {
                    "smoothness_gte": spec.assert_smoothness_gte,
                    "convergence_ratio_lte": spec.assert_convergence_ratio_lte,
                    "instability_events_lte": spec.assert_instability_events_lte,
                },
                "weight": spec.weight,
            },
            message=message,
            critical_fields=(),
            # Empty on purpose: ``raw`` (smoothness) is [0, 1] except on
            # degenerate inputs that already routed through SKIP/WARN,
            # and letting the critical-field guard null out the score on
            # a NaN smoothness would confuse more than it protects.
        )
224
+
225
+
226
+# ---------------------------------------------------------------------------
227
+# JSONL parsing
228
+# ---------------------------------------------------------------------------
229
+
230
+
231
+def _collect_steps(log_paths: list[Path]) -> dict[int, float]:
232
+    """Parse every JSONL, dedupe-by-step, return ``{step: loss}``.
233
+
234
+    Resumed runs append to a fresh JSONL but share step numbers with
235
+    the prior run. Dedupe-by-keep-latest matches dlm's own ``dlm
236
+    metrics`` semantics: the most recent run for a given step wins.
237
+    """
238
+    out: dict[int, float] = {}
239
+    for path in log_paths:
240
+        try:
241
+            with path.open("r", encoding="utf-8") as f:
242
+                for line_no, raw in enumerate(f, start=1):
243
+                    raw = raw.strip()
244
+                    if not raw:
245
+                        continue
246
+                    try:
247
+                        rec = json.loads(raw)
248
+                    except json.JSONDecodeError as exc:
249
+                        # A trailing partial line from a crashed
250
+                        # trainer is the typical cause. Skip it but
251
+                        # surface as ERROR if EVERY line is broken
252
+                        # (caller checks ``out`` emptiness).
253
+                        if line_no == 1:
254
+                            raise TrainingDriftError(
255
+                                f"first line of {path.name} is not valid JSON: {exc}"
256
+                            ) from exc
257
+                        continue
258
+                    if not isinstance(rec, dict):
259
+                        continue
260
+                    if rec.get("type") != "step":
261
+                        continue
262
+                    try:
263
+                        step = int(rec["step"])
264
+                        loss = float(rec["loss"])
265
+                    except (KeyError, TypeError, ValueError):
266
+                        continue
267
+                    if not math.isfinite(loss):
268
+                        # NaN loss is a real signal — record it as
269
+                        # +inf so the spike detector flags the
270
+                        # instability without crashing on np.log.
271
+                        loss = math.inf
272
+                    out[step] = loss
273
+        except OSError as exc:
274
+            raise TrainingDriftError(f"failed to read {path}: {exc}") from exc
275
+    return out
276
+
277
+
278
+# ---------------------------------------------------------------------------
279
+# Metric computation
280
+# ---------------------------------------------------------------------------
281
+
282
+
283
def _compute_metrics(
    losses: np.ndarray,
    *,
    rolling_window: int,
    spike_sigma: float,
) -> dict[str, float]:
    """Compute the headline metrics from the loss array.

    Non-finite step losses are forward-filled from the most recent
    finite value before any statistics run, so a single bad batch
    does not poison the whole curve — but every non-finite position
    still counts as an instability event.
    """
    losses = losses.astype(np.float64, copy=True)
    finite = np.isfinite(losses)
    nonfinite_count = int((~finite).sum())

    if nonfinite_count > 0:
        if not finite.any():
            # Pathological: every step logged NaN/inf. Emit all-NaN
            # loss metrics so the verdict path surfaces the failure.
            return {
                "initial_loss": float("nan"),
                "final_loss": float("nan"),
                "convergence_ratio": float("nan"),
                "smoothness": 0.0,
                "instability_events": float(len(losses)),
            }
        # Forward-fill each bad entry from the previous finite one;
        # any *leading* bad entries borrow the first finite value.
        fill = float(losses[finite][0])
        for idx in range(len(losses)):
            if math.isfinite(losses[idx]):
                fill = float(losses[idx])
            else:
                losses[idx] = fill

    initial_loss = float(losses[0])
    final_loss = float(losses[-1])
    if initial_loss != 0.0:
        convergence_ratio = float(final_loss / initial_loss)
    else:
        convergence_ratio = float("inf")

    deltas = np.diff(losses)
    loss_var = float(losses.var())
    delta_var = float(deltas.var()) if deltas.size > 0 else 0.0
    if loss_var > 0.0:
        # Clamp at 0: var(Δloss) exceeding var(loss) means per-step
        # jitter dominates the overall sweep — report fully spiky
        # rather than emit a negative number.
        smoothness = max(0.0, 1.0 - delta_var / loss_var)
    else:
        # Every loss identical → the run never progressed. Formally
        # smoothness would be 1.0 (perfectly flat); report 0.0 so the
        # verdict path can flag it instead.
        smoothness = 0.0

    spikes = _count_spikes(deltas, window=rolling_window, sigma=spike_sigma)

    return {
        "initial_loss": initial_loss,
        "final_loss": final_loss,
        "convergence_ratio": convergence_ratio,
        "smoothness": smoothness,
        "instability_events": float(spikes + nonfinite_count),
    }
352
+
353
+
354
+def _count_spikes(deltas: np.ndarray, *, window: int, sigma: float) -> int:
355
+    """Count loss-increase events that exceed a robust noise threshold.
356
+
357
+    The naive ``|Δloss| > sigma · rolling_std`` heuristic is broken for
358
+    exponential decay: deltas span orders of magnitude across training,
359
+    so within-window std stays tiny while absolute deltas are large —
360
+    every step trips the threshold. The semantically correct
361
+    "instability event" is a loss *increase*, not a faster-than-typical
362
+    decrease.
363
+
364
+    Heuristic:
365
+
366
+    1. Restrict to positive deltas (``delta > 0`` ⇒ loss went up). Loss
367
+       going down faster than usual isn't an instability — it's the
368
+       happy path.
369
+    2. For each positive delta, compare to the median absolute delta
370
+       in a centered window. Flag when ``delta > sigma · MAD``, where
371
+       MAD is the median absolute deviation (robust to outliers).
372
+    3. Short curves fall back to global MAD; constant-loss curves
373
+       (every delta zero) report 0 spikes.
374
+
375
+    The ``sigma`` parameter retains its semantic meaning ("how many
376
+    typical deltas does this exceed") but operates against the
377
+    median-absolute-delta scale rather than std-of-delta.
378
+    """
379
+    if deltas.size == 0:
380
+        return 0
381
+
382
+    # Median absolute delta — the "typical movement scale" we judge
383
+    # spikes against. Median is robust to a few outliers; if the
384
+    # whole curve is flat, MAD is 0 and we report no spikes.
385
+    abs_deltas = np.abs(deltas)
386
+
387
+    if deltas.size < window:
388
+        baseline = float(np.median(abs_deltas))
389
+        if baseline == 0.0:
390
+            return 0
391
+        return int(np.sum((deltas > 0.0) & (deltas > sigma * baseline)))
392
+
393
+    spikes = 0
394
+    half = window // 2
395
+    for i in range(deltas.size):
396
+        if deltas[i] <= 0.0:
397
+            continue  # Loss went down — not an instability event.
398
+        lo = max(0, i - half)
399
+        hi = min(deltas.size, i + half + 1)
400
+        window_slice = abs_deltas[lo:hi]
401
+        # Exclude the point itself so a real outlier doesn't inflate
402
+        # the baseline it's measured against.
403
+        if window_slice.size > 1:
404
+            mask = np.ones(window_slice.size, dtype=bool)
405
+            mask[i - lo] = False
406
+            window_slice = window_slice[mask]
407
+        baseline = float(np.median(window_slice))
408
+        if baseline == 0.0:
409
+            # Surrounding deltas are all zero — any nonzero positive
410
+            # delta is an instability event by construction.
411
+            spikes += 1
412
+            continue
413
+        if float(deltas[i]) > sigma * baseline:
414
+            spikes += 1
415
+    return spikes
416
+
417
+
418
+# ---------------------------------------------------------------------------
419
+# Verdict mapping
420
+# ---------------------------------------------------------------------------
421
+
422
+
423
def _verdict_from_metrics(
    metrics: dict[str, float], spec: TrainingDriftSpec
) -> tuple[Verdict, float, str]:
    """Map the metrics to a ``(verdict, score, message)`` triple.

    PASS requires all three thresholds to clear; any miss yields a
    WARN (this probe never emits FAIL). The score is smoothness on
    PASS and half-smoothness on WARN so reports can rank borderline
    warns against actively bad runs; it never changes the verdict.
    NaN metrics fail their comparisons and thus land in WARN.
    """
    smooth = metrics["smoothness"]
    conv = metrics["convergence_ratio"]
    instability = int(metrics["instability_events"])

    failures: list[str] = []
    # ``not (x >= t)`` rather than ``x < t`` so NaN counts as a miss.
    if not (smooth >= spec.assert_smoothness_gte):
        failures.append(f"smoothness={smooth:.2f} < {spec.assert_smoothness_gte}")
    if not (conv <= spec.assert_convergence_ratio_lte):
        failures.append(f"convergence_ratio={conv:.2f} > {spec.assert_convergence_ratio_lte}")
    if not (instability <= spec.assert_instability_events_lte):
        failures.append(f"instability_events={instability} > {spec.assert_instability_events_lte}")

    headline = (
        f"smoothness={smooth:.2f}, convergence_ratio={conv:.2f}, "
        f"instability_events={instability}, final_loss={metrics['final_loss']:.3f}"
    )

    if failures:
        warn_score = float(min(1.0, max(0.0, smooth * 0.5)))
        return Verdict.WARN, warn_score, f"{headline}; warnings: {'; '.join(failures)}"

    pass_score = float(min(1.0, max(0.0, smooth)))
    return Verdict.PASS, pass_score, headline
460
+
461
+
462
+# ---------------------------------------------------------------------------
463
+# Curve downsampling for evidence
464
+# ---------------------------------------------------------------------------
465
+
466
+
467
+def _downsampled_curve(
468
+    steps: np.ndarray, losses: np.ndarray, *, cap: int
469
+) -> list[tuple[int, float]]:
470
+    """Uniform-stride downsample so a 10k-step run still fits in the JSON report.
471
+
472
+    Always preserves the first and last point so initial/final loss
473
+    are visible regardless of cap. For curves shorter than the cap,
474
+    returns the full series unchanged.
475
+    """
476
+    n = int(len(losses))
477
+    if n <= cap:
478
+        return [(int(s), float(loss)) for s, loss in zip(steps, losses, strict=True)]
479
+    # Stride to keep at most ``cap`` points: stride = ceil(n / cap).
480
+    # The +1 accounts for always-appending the final point even when
481
+    # ``stride * (cap - 1) < n - 1``.
482
+    stride = max(1, (n + cap - 1) // cap)
483
+    idx = list(range(0, n, stride))
484
+    if idx[-1] != n - 1:
485
+        idx.append(n - 1)
486
+    return [(int(steps[i]), float(losses[i])) for i in idx]