tenseleyflow/documentlanguagemodel / 233d743

feat(train): disk preflight + two-phase checkpoint commit (audit F12)

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA: 233d743f0a0df6aef3d7ad284c060a48a9cb23d1
Parents: ae2aea1
Tree: d74479a

2 changed files

Status  File                                  +    -
A       src/dlm/train/checkpoint_commit.py  143    0
A       src/dlm/train/disk_preflight.py      84    0
src/dlm/train/checkpoint_commit.py (added)
@@ -0,0 +1,143 @@
+"""Two-phase checkpoint commit (audit F12).
+
+Invariant
+---------
+
+`adapter/current.txt` **never** points at a half-written version
+directory. Either the old version is still authoritative, or the new
+one is — never a partial state.
+
+Lifecycle
+---------
+
+1. `allocate_next_version(store)` picks `vNNNN` where `NNNN` is one
+   above the highest existing version directory (or `0001` on a fresh
+   store). Creates the empty directory.
+2. Caller populates the directory — `adapter.save_pretrained()` writes
+   the adapter config + weights; `state_sidecar.save_state()` writes
+   `training_state.pt` + sha256.
+3. `fsync_dir(path)` flushes the directory entry to disk.
+4. `store.set_current_adapter(path)` atomically flips the pointer via
+   `os.replace` on a tmp file (already implemented in Sprint 04).
+
+The `commit_version()` helper bundles steps 1 + 3 + 4 around a
+caller-supplied writer function, so the "happy path" is one call. If
+the writer raises, the pending directory is *not* made current — it's
+left in place so the caller can inspect / clean up / retry.
+"""
+
+from __future__ import annotations
+
+import os
+from collections.abc import Callable
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from dlm.store.paths import StorePath
+
+# Regex-safe prefix shared with `StorePath.adapter_version`.
+_VERSION_PREFIX = "v"
+
+
+def allocate_next_version(store: StorePath) -> Path:
+    """Return the next empty `adapter/versions/vNNNN/` path.
+
+    Creates the directory (and any missing parents). `StorePath.adapter_version`
+    does *not* create; we do it here so callers can start writing
+    immediately.
+    """
+    existing = _existing_versions(store)
+    next_n = (max(existing) if existing else 0) + 1
+    version_dir = store.adapter_version(next_n)
+    version_dir.mkdir(parents=True, exist_ok=False)
+    return version_dir
+
+
+def commit_version(
+    store: StorePath,
+    writer: Callable[[Path], None],
+) -> Path:
+    """Allocate → populate → fsync → flip pointer.
+
+    Returns the committed version directory. On writer exception:
+    - the pending directory is left on disk (not cleaned up, so the
+      caller can diagnose)
+    - the current pointer is NOT updated
+    - the exception propagates
+    """
+    pending = allocate_next_version(store)
+    try:
+        writer(pending)
+    except BaseException:
+        # Leave `pending` on disk; the next allocate_next_version call
+        # skips over it by bumping NNNN. Cleanup is a caller concern.
+        raise
+
+    fsync_dir(pending)
+    store.set_current_adapter(pending)
+    return pending
+
+
+def fsync_dir(path: Path) -> None:
+    """Flush the directory entry for `path` to disk.
+
+    After writing the adapter files we need to ensure the directory
+    metadata (new file entries) survives a power loss. `os.fsync` on
+    the directory fd is the POSIX idiom. Windows doesn't allow opening
+    a directory handle for fsync; on Windows we no-op (the underlying
+    filesystem typically metadata-journals).
+    """
+    if os.name == "nt":  # pragma: no cover — macOS/Linux covered
+        return
+    fd = os.open(str(path), os.O_RDONLY)
+    try:
+        os.fsync(fd)
+    finally:
+        os.close(fd)
+
+
+def list_pending_versions(store: StorePath) -> list[Path]:
+    """Return version dirs that exist on disk but aren't the current pointer.
+
+    Used by the trainer's startup routine to detect crash-before-flip
+    remnants: if the pending dir has a complete adapter + training_state
+    + matching sha256, the user could in principle resume from it by
+    manually flipping the pointer. We surface them rather than
+    auto-deleting.
+    """
+    existing = _existing_versions(store)
+    current = store.resolve_current_adapter()
+    current_n = _parse_version_number(current) if current is not None else None
+    return [
+        store.adapter_version(n)
+        for n in sorted(existing)
+        if n != current_n
+    ]
+
+
+def _existing_versions(store: StorePath) -> list[int]:
+    base = store.adapter_versions
+    if not base.is_dir():
+        return []
+    out: list[int] = []
+    for entry in base.iterdir():
+        if not entry.is_dir():
+            continue
+        n = _parse_version_dirname(entry.name)
+        if n is not None:
+            out.append(n)
+    return out
+
+
+def _parse_version_dirname(name: str) -> int | None:
+    if not name.startswith(_VERSION_PREFIX):
+        return None
+    try:
+        return int(name[len(_VERSION_PREFIX) :])
+    except ValueError:
+        return None
+
+
+def _parse_version_number(path: Path) -> int | None:
+    return _parse_version_dirname(path.name)
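
For orientation, a minimal usage sketch of the happy path (not part of the commit): it assumes the `StorePath` interface described in the docstring, and `adapter` / `save_state` are hypothetical stand-ins for the PEFT adapter object and the `state_sidecar` helper.

# Hypothetical usage sketch: `adapter` and `save_state` are stand-ins,
# not names introduced by this commit.
from pathlib import Path

from dlm.store.paths import StorePath
from dlm.train.checkpoint_commit import commit_version


def save_checkpoint(store: StorePath, adapter, save_state) -> Path:
    def writer(pending: Path) -> None:
        # Lifecycle step 2: populate the freshly allocated version dir.
        adapter.save_pretrained(pending)  # adapter config + weights
        save_state(pending)               # training_state.pt + sha256

    # Steps 1, 3 and 4 happen inside commit_version: allocate vNNNN,
    # fsync the dir, then atomically flip current.txt. If `writer`
    # raises, the pointer never moves and `pending` stays for triage.
    return commit_version(store, writer)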
src/dlm/train/disk_preflight.py (added)
@@ -0,0 +1,84 @@
+"""Pre-train disk-space check (audit F12).
+
+A training run that fails halfway through because the disk filled up
+leaves the store in a confusing state (half-written checkpoint, partial
+log). Catch it up-front: estimate the bytes we're about to write, add
+a safety margin, and refuse to start if the filesystem doesn't have
+headroom.
+
+The estimate is deliberately pessimistic — LoRA adapters are usually
+<100 MB, but the checkpoint also has to hold the torch-serialized
+optimizer state (which is ~2× the adapter for AdamW + scaler), logs,
+and any cached evaluation artifacts. Replay-corpus growth (one snapshot
+per new section, zstd-compressed) is absorbed by the safety margin.
+
+This module doesn't know about the heavy HF stack; it works from
+`BaseModelSpec` + `TrainingPlan` + simple arithmetic. Heavy math lives
+in `dlm.hardware.memory`; here we only need a byte estimate.
+"""
+
+from __future__ import annotations
+
+import shutil
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from dlm.train.errors import DiskSpaceError
+
+if TYPE_CHECKING:
+    from dlm.base_models import BaseModelSpec
+    from dlm.hardware.plan import TrainingPlan
+
+# Floor estimates (bytes). Generous by design; a run that dies midway on
+# a full disk is a much worse UX than one that refuses to start early.
+_LOG_RESERVE = 10 * 1024 * 1024  # 10 MB per run for JSONL logs
+_OPTIMIZER_MULTIPLIER = 2.5  # AdamW + scaler + schedule state vs adapter bytes
+_ADAPTER_FLOOR = 50 * 1024 * 1024  # 50 MB minimum adapter size (conservative)
+
+
+def estimate_checkpoint_bytes(spec: BaseModelSpec, plan: TrainingPlan) -> int:
+    """Rough byte estimate for one full checkpoint commit.
+
+    Components:
+    - Adapter weights (LoRA rank × shapes; modeled as a fraction of base)
+    - Optimizer state (AdamW keeps m + v per trainable param)
+    - Scaler + scheduler state
+    - Log reserve
+    """
+    # LoRA adds roughly `r × (in + out)` params per target module. For a
+    # canonical rank-16 adapter on a 1.5B model this works out to ~50 MB
+    # in fp16 / ~100 MB in fp32. The estimate treats the adapter as 1%
+    # of the base size, clamped to the floor.
+    base_bytes = int(spec.size_gb_fp16 * (1024**3))
+    adapter_bytes = max(base_bytes // 100, _ADAPTER_FLOOR)
+    optimizer_bytes = int(adapter_bytes * _OPTIMIZER_MULTIPLIER)
+
+    # Gradient checkpointing trades time for memory at runtime but
+    # doesn't change checkpoint size, so it's not in the formula here.
+    _ = plan  # suppress unused warning; kept as a hook for future plan-driven heuristics
+
+    return adapter_bytes + optimizer_bytes + _LOG_RESERVE
+
+
+def preflight_disk(
+    store_root: Path,
+    spec: BaseModelSpec,
+    plan: TrainingPlan,
+    *,
+    safety: float = 1.5,
+) -> None:
+    """Raise `DiskSpaceError` if the store FS can't fit a checkpoint + margin.
+
+    `safety` defaults to 1.5× — the trainer can get unlucky with
+    intermediate buffers, and a hard-fail at step 9/10 of a multi-hour
+    run is painful.
+    """
+    if safety <= 0:
+        raise ValueError(f"safety must be > 0, got {safety!r}")
+
+    estimate = estimate_checkpoint_bytes(spec, plan)
+    required = int(estimate * safety)
+
+    usage = shutil.disk_usage(store_root)
+    if usage.free < required:
+        raise DiskSpaceError(required_bytes=required, free_bytes=usage.free)
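
As a sanity check on the arithmetic (hypothetical numbers, not from this commit): for a spec with `size_gb_fp16 = 3.0`, roughly a 1.5B-parameter model in fp16, the estimate unrolls as:

# Worked example with hypothetical spec values.
base_bytes      = int(3.0 * 1024**3)                        # 3,221,225,472
adapter_bytes   = max(base_bytes // 100, 50 * 1024 * 1024)  # floor wins: 50 MiB
optimizer_bytes = int(adapter_bytes * 2.5)                  # 125 MiB
estimate        = adapter_bytes + optimizer_bytes + 10 * 1024 * 1024  # ~185 MiB
required        = int(estimate * 1.5)                       # ~278 MiB must be free

Note that the 50 MB floor dominates until the fp16 base crosses roughly 5 GB, at which point the 1%-of-base term takes over.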