tenseleyflow/documentlanguagemodel / 0af0f7c

Scrub train support jargon

Authored by espadonne
SHA: 0af0f7c46a55e23096550fbc24092d17367a5130
Parents: 8a42142
Tree: 8830b17

15 changed files

Status  File  +  -
M src/dlm/train/__init__.py 3 3
M src/dlm/train/adapter.py 10 10
M src/dlm/train/checkpoint_commit.py 1 1
M src/dlm/train/determinism.py 3 4
M src/dlm/train/distributed/__init__.py 1 1
M src/dlm/train/distributed/gpus.py 3 3
M src/dlm/train/distributed/rank_io.py 2 3
M src/dlm/train/distributed/worker_entry.py 8 8
M src/dlm/train/gate/orchestrator.py 2 2
M src/dlm/train/loader.py 2 2
M src/dlm/train/logger.py 4 4
M src/dlm/train/multi_adapter/trainer.py 4 5
M src/dlm/train/oom_guard.py 4 4
M src/dlm/train/state_sidecar.py 20 20
M src/dlm/train/tokenization.py 1 1
src/dlm/train/__init__.py (modified)
@@ -1,8 +1,8 @@
 """Training engine — load base, attach LoRA/QLoRA, run one SFT cycle.
 
-See Sprint 09 for the design. Heavy imports (`torch`, `transformers`,
-`peft`, `trl`, `bitsandbytes`) are deferred to the functions that use
-them so `import dlm.train` stays cheap.
+Heavy imports (`torch`, `transformers`, `peft`, `trl`,
+`bitsandbytes`) are deferred to the functions that use them so
+`import dlm.train` stays cheap.
 """
 
 from __future__ import annotations
src/dlm/train/adapter.py (modified)
@@ -10,7 +10,7 @@ Three entry points:
   - `"resume"` → `PeftModel.from_pretrained(model, resume_path, is_trainable=True)`.
 - `apply_kbit_preparation(model, gradient_checkpointing)` runs
   `prepare_model_for_kbit_training` for the QLoRA path. MUST be called
-  BEFORE `get_peft_model` (audit risk noted in sprint spec).
+  BEFORE `get_peft_model`, which is the order PEFT + bitsandbytes require.
 
 The trainer's higher-level orchestrator composes these — we keep the
 individual functions small + testable.
@@ -41,12 +41,11 @@ def build_lora_config(
 ) -> Any:
     """Return a `peft.LoraConfig` sized for `spec`.
 
-    If `tokenizer_grew=True` (Sprint 07 bringup added a new pad token),
-    we MUST train the embedding + lm_head alongside the LoRA deltas —
+    If `tokenizer_grew=True` (bringup added a new pad token), we MUST
+    train the embedding + lm_head alongside the LoRA deltas —
     otherwise the new embedding row is undefined. `modules_to_save`
     inflates the adapter checkpoint size substantially; surfacing this
-    at the LoRA level keeps the tradeoff auditable (CLAUDE.md pitfall
-    #4 / audit F02).
+    at the LoRA level keeps the tradeoff explicit.
 
     `use_dora=True` switches from plain LoRA to DoRA (weight-
     decomposed low-rank adaptation). DoRA factors each weight update
@@ -75,11 +74,12 @@ def build_lora_config(
 def verify_resume_tokenizer_compat(adapter_dir: Path, *, tokenizer_grew: bool) -> None:
     """Assert the saved adapter's `modules_to_save` agrees with the current tokenizer.
 
-    Audit-04 M5: on resume, we load a LoRA adapter whose `adapter_config.json`
-    was written under a particular tokenizer state. If the current run's
-    tokenizer bringup grew the vocab but the saved adapter doesn't train
-    embeddings (or vice versa), the resumed training will silently corrupt
-    the `<|pad|>` row or fail to update a re-resized embedding table.
+    On resume, we load a LoRA adapter whose `adapter_config.json` was
+    written under a particular tokenizer state. If the current run's
+    tokenizer bringup grew the vocab but the saved adapter doesn't
+    train embeddings (or vice versa), the resumed training will
+    silently corrupt the `<|pad|>` row or fail to update a re-resized
+    embedding table.
 
     Raises `ResumeIntegrityError` with actionable text on mismatch. Missing
     or unreadable `adapter_config.json` is treated as a mismatch (the
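For context, a hedged sketch of the ordering and the `modules_to_save` tradeoff these docstrings describe, using the public PEFT API. The rank, alpha, and target module names below are illustrative, not the repo's configuration.

```python
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

def attach_adapter_sketch(model, *, use_qlora: bool, tokenizer_grew: bool,
                          gradient_checkpointing: bool):
    if use_qlora:
        # k-bit preparation must run BEFORE get_peft_model wraps the model.
        model = prepare_model_for_kbit_training(
            model, use_gradient_checkpointing=gradient_checkpointing
        )
    config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        task_type="CAUSAL_LM",
        # A grown tokenizer means a brand-new embedding row; train and save
        # embeddings + lm_head alongside the LoRA deltas or that row stays undefined.
        modules_to_save=["embed_tokens", "lm_head"] if tokenizer_grew else None,
    )
    return get_peft_model(model, config)
```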
src/dlm/train/checkpoint_commit.py (modified)
@@ -18,7 +18,7 @@ Lifecycle
    `training_state.pt` + sha256.
 3. `fsync_dir(path)` flushes the directory entry to disk.
 4. `store.set_current_adapter(path)` atomically flips the pointer via
-   `os.replace` on a tmp file (already implemented in Sprint 04).
+   `os.replace` on a tmp file.
 
 The `commit_version()` helper bundles steps 1 + 3 + 4 around a
 caller-supplied writer function, so the "happy path" is one call. If
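A minimal sketch of steps 3 and 4 (directory fsync plus the atomic pointer flip). `fsync_dir` and the `current` pointer file name follow the docstring; the rest is assumed.

```python
import os
from pathlib import Path

def fsync_dir(path: Path) -> None:
    # Flush the directory entry so a completed rename survives a crash.
    fd = os.open(path, os.O_RDONLY)
    try:
        os.fsync(fd)
    finally:
        os.close(fd)

def set_current_adapter(store_root: Path, adapter_path: Path) -> None:
    pointer = store_root / "current"
    tmp = pointer.with_suffix(".tmp")
    tmp.write_text(str(adapter_path) + "\n", encoding="utf-8")
    os.replace(tmp, pointer)  # atomic: readers see the old or new pointer, never a partial write
    fsync_dir(store_root)
```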
src/dlm/train/determinism.py (modified)
@@ -15,9 +15,8 @@ every RNG + backend flag the trainer touches:
 6. `torch.backends.cudnn.benchmark = False` — no autotuner (which is
    non-deterministic).
 
-MPS determinism is best-effort (audit F20); the `describe()` function
-surfaces this to the training banner so Apple Silicon users see the
-caveat before the run starts.
+MPS determinism is best-effort; the training banner surfaces this so
+Apple Silicon users see the caveat before the run starts.
 
 Heavy imports (`torch`, `numpy`) are deferred to call-sites so
 `import dlm.train` stays cheap.
@@ -99,7 +98,7 @@ def seed_everything(seed: int) -> DeterminismSummary:
         # the trainer logs a WARN for any that trigger.
         torch.use_deterministic_algorithms(True, warn_only=True)
         torch.backends.cudnn.benchmark = False
-    except ImportError:  # pragma: no cover — torch is a runtime dep once Sprint 09 lands
+    except ImportError:  # pragma: no cover — torch is a runtime dependency in production
         notes.append("torch not installed; determinism contract not enforced")
         class_ = "loose"
 
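For reference, a minimal seeding sketch along the lines of the checklist above; the note strings and the bare-list return are stand-ins for the module's `DeterminismSummary`.

```python
import os
import random

def seed_everything_sketch(seed: int) -> list[str]:
    notes: list[str] = []
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    try:
        import numpy as np
        np.random.seed(seed)
    except ImportError:
        notes.append("numpy not installed")
    try:
        import torch
        torch.manual_seed(seed)                 # seeds CPU and every CUDA device
        torch.use_deterministic_algorithms(True, warn_only=True)
        torch.backends.cudnn.benchmark = False  # the autotuner is non-deterministic
    except ImportError:
        notes.append("torch not installed; determinism contract not enforced")
    return notes
```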
src/dlm/train/distributed/__init__.py (modified)
@@ -1,6 +1,6 @@
 """Distributed (multi-GPU) training via HuggingFace Accelerate.
 
-Sprint 23 scope: single-node, multi-GPU DDP. No multi-node, FSDP, or
+Current scope: single-node, multi-GPU DDP. No multi-node, FSDP, or
 DeepSpeed.
 
 Surface:
src/dlm/train/distributed/gpus.py (modified)
@@ -25,9 +25,9 @@ class UnsupportedGpuSpecError(ValueError):
 def strip_gpus_flag(args: list[str], *, skip_argv0: bool = False) -> list[str]:
     """Drop `--gpus <value>` / `--gpus=<value>` from an argv-like list.
 
-    Shared helper (audit-08 N1) so the launcher side (strips argv[0]
-    because `accelerate launch -m <entry>` substitutes it) and the
-    worker side (passes argv[1:] from `sys.argv`) don't drift. The
+    Shared helper so the launcher side (strips argv[0] because
+    `accelerate launch -m <entry>` substitutes it) and the worker
+    side (passes argv[1:] from `sys.argv`) don't drift. The
     `skip_argv0` flag controls which input convention is used.
     """
     start = 1 if skip_argv0 else 0
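A hedged sketch of how the body of such a helper could continue from that `start` offset; the loop below is illustrative, not the repo's exact implementation.

```python
def strip_gpus_flag_sketch(args: list[str], *, skip_argv0: bool = False) -> list[str]:
    start = 1 if skip_argv0 else 0
    kept = list(args[:start])       # argv[0] is preserved untouched when present
    i = start
    while i < len(args):
        arg = args[i]
        if arg == "--gpus":
            i += 2                  # drop the flag and its separate value
        elif arg.startswith("--gpus="):
            i += 1                  # drop the fused `--gpus=<value>` form
        else:
            kept.append(arg)
            i += 1
    return kept

# e.g. strip_gpus_flag_sketch(["--gpus", "2", "--seed", "7"]) == ["--seed", "7"]
```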
src/dlm/train/distributed/rank_io.py (modified)
@@ -95,9 +95,8 @@ def gather_metrics(
 
     # We call gather per-metric to avoid building a tensor for a
     # heterogeneous dict. For a shape-stable dict of floats this is
-    # clearer than stacking. torch is a core runtime dep so the
-    # import is always available (audit-08 N6: dropped dead
-    # defensive try/except).
+    # clearer than stacking. torch is a core runtime dependency, so
+    # the import is always available.
    import torch
 
     reduced: dict[str, float] = {}
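A sketch of the per-metric gather pattern the comment describes, written against accelerate's `Accelerator.gather`; the mean reduction and the function shape are assumptions, not the module's API.

```python
import torch
from accelerate import Accelerator

def gather_metrics_sketch(accelerator: Accelerator, metrics: dict[str, float]) -> dict[str, float]:
    reduced: dict[str, float] = {}
    for key, value in metrics.items():
        # One scalar tensor per metric; gather returns a (world_size,) tensor.
        local = torch.tensor([value], device=accelerator.device)
        reduced[key] = accelerator.gather(local).float().mean().item()
    return reduced
```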
src/dlm/train/distributed/worker_entry.py (modified)
@@ -3,14 +3,14 @@
 Accelerate spawns one process per GPU; each invokes this module. The
 worker re-parses the subset of CLI args it cares about (path, seed,
 max_steps, resume/fresh, phase) and routes into the existing
-`dlm.train.trainer.run` — which Sprint 23 still owns the single-GPU
-I/O shape for.
+`dlm.train.trainer.run`, which still owns the single-process I/O
+shape.
 
 Full DDP integration (refactoring `trainer.run` to gate its I/O via
-`rank_io.master_only`) is tracked as Sprint 23 follow-up; this entry
-makes the launcher path complete end-to-end from the CLI but the
-actual multi-GPU training loop remains a scaffold until the
-integration test lands on real hardware.
+`rank_io.master_only`) remains follow-up work; this entry makes the
+launcher path complete end-to-end from the CLI but the actual
+multi-GPU training loop remains a scaffold until the integration test
+lands on real hardware.
 """
 
 from __future__ import annotations
@@ -43,8 +43,8 @@ def _strip_gpus_flag(args: list[str]) -> list[str]:
     """Drop `--gpus <value>` / `--gpus=<value>` from argv (worker side).
 
     Per-rank invocations must not recurse into the launcher branch of
-    `dlm train`. Delegates to the shared `strip_gpus_flag` helper
-    (audit-08 N1); the worker passes argv without argv[0].
+    `dlm train`. Delegates to the shared `strip_gpus_flag` helper;
+    the worker passes argv without argv[0].
     """
     from dlm.train.distributed.gpus import strip_gpus_flag
 
src/dlm/train/gate/orchestrator.py (modified)
@@ -110,8 +110,8 @@ def run_post_sft_gate(
     fewer than two adapters; callers don't need to check the config
     themselves. Embedding is injected as a callable so tests can stub
     it without loading an HF model. Any `GateTrainingError` is logged
-    and swallowed — Sprint 34 treats gate training as best-effort so
-    an SFT commit is never undone by a gate hiccup.
+    and swallowed — gate training is best-effort, so an SFT commit is
+    never undone by a gate hiccup.
     """
     training = parsed.frontmatter.training
     gate_cfg = training.gate
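A tiny sketch of the best-effort contract the docstring describes: the gate-specific error is caught and logged, anything else still propagates. `GateTrainingError` is stubbed here; only the shape of the try/except is the point.

```python
import logging

log = logging.getLogger(__name__)

class GateTrainingError(Exception):
    """Stub of the gate-specific error named in the docstring."""

def run_gate_best_effort(train_gate, *args, **kwargs) -> None:
    try:
        train_gate(*args, **kwargs)
    except GateTrainingError as exc:
        # Logged and swallowed so the already-committed SFT adapter is untouched.
        log.warning("gate training skipped: %s", exc)
```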
src/dlm/train/loader.py (modified)
@@ -110,8 +110,8 @@ def load_processor(spec: BaseModelSpec) -> Any: # pragma: no cover
 
 _AUDIO_MODEL_CLASSES: dict[str, str] = {
     # Maps `BaseModelSpec.architecture` → transformers class name.
-    # Sprint 35.2 v1 ships Qwen2-Audio only; add new entries here when
-    # more audio-LM families land in the registry.
+    # Add new entries here as more audio-LM families land in the
+    # registry.
     "Qwen2AudioForConditionalGeneration": "Qwen2AudioForConditionalGeneration",
 }
 
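One plausible way such an architecture-to-class map gets resolved at load time; the fallback to `AutoModelForCausalLM` is an assumption, not the repo's actual behaviour.

```python
import transformers

_AUDIO_MODEL_CLASSES = {
    "Qwen2AudioForConditionalGeneration": "Qwen2AudioForConditionalGeneration",
}

def resolve_model_class(architecture: str):
    class_name = _AUDIO_MODEL_CLASSES.get(architecture)
    if class_name is None:
        return transformers.AutoModelForCausalLM   # plain text-only base
    return getattr(transformers, class_name)       # audio-LM family
```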
src/dlm/train/logger.py (modified)
@@ -1,8 +1,8 @@
 """Structured per-step JSONL logger for training runs.
 
 One file per run at `logs/train-<run_id>-<started_at>.jsonl`. Every
-line is a self-describing JSON object so downstream tools (Sprint 20's
-`dlm metrics`, ad-hoc scripts) can parse it without state.
+line is a self-describing JSON object so downstream tools (`dlm
+metrics`, ad-hoc scripts) can parse it without state.
 
 Design
 ------
@@ -17,8 +17,8 @@ Design
   versions, and the training-plan snapshot. Followed by `"type":
   "step"` records with `step`, `loss`, `lr`, `grad_norm`,
   `tokens_per_sec`, and optional `val_loss` on eval steps.
-- **No Rich / tqdm here.** That's a Sprint 13 UX concern. The logger
-  is plain JSONL so CI and automation can consume it directly.
+- **No Rich / tqdm here.** The logger is plain JSONL so CI and
+  automation can consume it directly.
 """
 
 from __future__ import annotations
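A minimal sketch of appending one `"type": "step"` record with the fields the docstring lists; the writer below is illustrative, not the module's API.

```python
import json
from pathlib import Path

def append_step_record(log_path: Path, *, step: int, loss: float, lr: float,
                       grad_norm: float, tokens_per_sec: float,
                       val_loss: float | None = None) -> None:
    record: dict[str, object] = {
        "type": "step",
        "step": step,
        "loss": loss,
        "lr": lr,
        "grad_norm": grad_norm,
        "tokens_per_sec": tokens_per_sec,
    }
    if val_loss is not None:          # only present on eval steps
        record["val_loss"] = val_loss
    with log_path.open("a", encoding="utf-8") as fh:
        fh.write(json.dumps(record) + "\n")   # one self-describing object per line
```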
src/dlm/train/multi_adapter/trainer.py (modified)
@@ -14,7 +14,7 @@ to a single `run()` call — the orchestrator is a safe default entry
 point regardless of document shape.
 
 Scope note: inference selection, export merge, and doctor memory
-refusal layer on in sprint 20b. This module owns only the per-adapter
+refusal are handled elsewhere. This module owns only the per-adapter
 orchestration and the resulting adapter-versioned store layout.
 """
 
@@ -143,8 +143,7 @@ def _maybe_run_gate_pass(
     """Run the post-SFT learned-gate training pass when enabled.
 
     Kept separate so the multi-adapter orchestrator's happy path stays
-    short. All errors are swallowed — gate training is best-effort per
-    the Sprint 34 risk matrix.
+    short. All errors are swallowed — gate training is best-effort.
     """
     import logging
 
@@ -203,8 +202,8 @@ def _default_embedder(
 ) -> tuple[Callable[[str], Any], int]:  # pragma: no cover — heavy HF path
     """Default embedder — loads the HF base model + tokenizer.
 
-    Covered by the Sprint 34 slow integration test; unit tests pass a
-    stub via `gate_embed_factory`.
+    Covered by the slow integration tests; unit tests pass a stub via
+    `gate_embed_factory`.
     """
     from transformers import AutoTokenizer
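For orientation, a hedged sketch of what a default embedder of this shape might do: load tokenizer plus base model and mean-pool the last hidden state. The pooling choice and the `AutoModel` usage are assumptions; the repo's `_default_embedder` may differ.

```python
import torch
from transformers import AutoModel, AutoTokenizer

def default_embedder_sketch(base_model_id: str):
    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
    model = AutoModel.from_pretrained(base_model_id)
    model.eval()

    @torch.no_grad()
    def embed(text: str) -> torch.Tensor:
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        hidden = model(**inputs).last_hidden_state   # (1, seq_len, hidden)
        return hidden.mean(dim=1).squeeze(0)         # (hidden,) mean-pooled embedding

    return embed, model.config.hidden_size
```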
src/dlm/train/oom_guard.py (modified)
@@ -93,10 +93,10 @@ def catch_cuda_oom( # pragma: no cover
     Callers on non-CUDA devices can skip this — re-raising a
     non-CUDA OOM as an `OOMError` would be actively misleading.
     """
-    # Import torch up-front (audit-04 m4) so that a missing torch
-    # surfaces as a clean error at context-enter rather than inside the
-    # exception handler, where it would silently tunnel any caught
-    # exception past the OOM-reformatting path.
+    # Import torch up-front so that a missing torch surfaces as a
+    # clean error at context-enter rather than inside the exception
+    # handler, where it would silently tunnel any caught exception
+    # past the OOM-reformatting path.
     import torch
 
     try:
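A hedged sketch of the overall shape of such a guard: CUDA OOMs become a domain error, everything else propagates. `OOMError` is stubbed here and the message text is made up.

```python
from contextlib import contextmanager

class OOMError(RuntimeError):
    """Stub of the trainer's domain-specific OOM error."""

@contextmanager
def catch_cuda_oom_sketch(context: str):
    import torch  # up-front: a missing torch fails cleanly at context-enter

    try:
        yield
    except torch.cuda.OutOfMemoryError as exc:
        # Reformat only the CUDA OOM; any other exception tunnels through untouched.
        raise OOMError(f"CUDA out of memory during {context}: {exc}") from exc
```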
src/dlm/train/state_sidecar.py (modified)
@@ -59,11 +59,11 @@ STATE_SIDECAR_VERSION = 2
 RNG state out of the torch payload into a JSON sidecar, dropping
 `weights_only=False` from the load path. The writer always emits
 v2; the reader accepts v1 (legacy) with a migration warning."""
-# Run-level flags the inference path consumes without loading torch
-# (audit-05 M1): separate from `pinned_versions.json`, which is a pure
-# package-version manifest. This file records *how* the adapter was
-# trained — currently just the QLoRA flag; future fields (e.g., base
-# compute dtype) extend this rather than polluting version metadata.
+# Run-level flags the inference path consumes without loading torch:
+# separate from `pinned_versions.json`, which is a pure package-version
+# manifest. This file records *how* the adapter was trained —
+# currently just the QLoRA flag; future fields (e.g., base compute
+# dtype) extend this rather than polluting version metadata.
 TRAINING_RUN_FILENAME = "training_run.json"
 
 
@@ -101,10 +101,10 @@ class TrainingState(TypedDict):
     dlm_manifest_hash: str | None
     base_model_revision: str
     pinned_versions: PinnedVersions
-    # audit-05 M1: explicit QLoRA flag. `InferencePlan` reads this via
-    # `training_run.json` (written alongside) rather than inferring from
-    # the bitsandbytes version pin, which false-positives on plain LoRA
-    # runs on CUDA+bnb hosts.
+    # Explicit QLoRA flag. `InferencePlan` reads this via
+    # `training_run.json` (written alongside) rather than inferring
+    # from the bitsandbytes version pin, which false-positives on
+    # plain LoRA runs on CUDA+bnb hosts.
     use_qlora: bool
 
 
@@ -241,8 +241,8 @@ def save_state(directory: Path, state: TrainingState) -> None:
         json.dumps(dict(state["pinned_versions"]), sort_keys=True, indent=2) + "\n",
     )
 
-    # Run-level flags (audit-05 M1). Separate file so `InferencePlan`
-    # can read `use_qlora` without loading torch or the whole state dict.
+    # Run-level flags. Separate file so `InferencePlan` can read
+    # `use_qlora` without loading torch or the whole state dict.
     training_run_path = directory / TRAINING_RUN_FILENAME
     write_text(
         training_run_path,
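A tiny sketch of the kind of sidecar write described above; the schema beyond `use_qlora`, and the plain write call, are assumptions.

```python
import json
from pathlib import Path

TRAINING_RUN_FILENAME = "training_run.json"

def write_training_run(directory: Path, *, use_qlora: bool) -> None:
    payload = {"use_qlora": use_qlora}   # readable by the inference path without torch
    (directory / TRAINING_RUN_FILENAME).write_text(
        json.dumps(payload, sort_keys=True, indent=2) + "\n",
        encoding="utf-8",
    )
```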
@@ -284,10 +284,10 @@ def load_state(directory: Path, *, runtime_versions: PinnedVersions) -> Training
     try:
         torch_payload = torch.load(io.BytesIO(blob), weights_only=True)
     except Exception as weights_only_exc:
-        # Legacy v1 format (pre-audit-11 B7) stored everything including
-        # numpy ndarrays under weights_only=False. Retry with the legacy
-        # loader + log a one-time migration notice. The next release
-        # drops this branch; callers should re-save.
+        # Legacy v1 format stored everything including numpy ndarrays
+        # under weights_only=False. Retry with the legacy loader +
+        # log a one-time migration notice. The next release drops this
+        # branch; callers should re-save.
         try:
             torch_payload = torch.load(io.BytesIO(blob), weights_only=False)
         except Exception as exc:
@@ -362,11 +362,11 @@ def _merge_rng_sidecar(directory: Path, torch_payload: dict[str, Any]) -> dict[s
 def _version_diff(pinned: PinnedVersions, runtime: PinnedVersions) -> list[str]:
     """Return `["key: saved→current", ...]` for keys whose versions differ.
 
-    Asymmetric handling of `None` (audit-04 M6): losing a pinned package
-    between save + resume (e.g., a QLoRA checkpoint from a CUDA box
-    being resumed on Apple Silicon without `bitsandbytes`) is drift
-    the user should see. Gaining a package that wasn't pinned is not
-    drift — there was no prior state to diverge from.
+    Asymmetric handling of `None`: losing a pinned package between
+    save + resume (e.g., a QLoRA checkpoint from a CUDA box being
+    resumed on Apple Silicon without `bitsandbytes`) is drift the user
+    should see. Gaining a package that wasn't pinned is not drift —
+    there was no prior state to diverge from.
 
     Rules:
     - saved=str, current=str, equal    → no drift
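To make the asymmetry concrete, a minimal sketch over a plain name-to-version mapping (the real `PinnedVersions` type is assumed to behave like one):

```python
def version_diff_sketch(pinned: dict[str, str | None], runtime: dict[str, str | None]) -> list[str]:
    drift: list[str] = []
    for key, saved in pinned.items():
        current = runtime.get(key)
        if saved is None:
            continue                                    # never pinned: nothing to diverge from
        if current is None:
            drift.append(f"{key}: {saved}→missing")     # losing a pinned package is drift
        elif current != saved:
            drift.append(f"{key}: {saved}→{current}")
    return drift

# version_diff_sketch({"bitsandbytes": "0.43.1"}, {"bitsandbytes": None})
#   -> ["bitsandbytes: 0.43.1→missing"]
```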
src/dlm/train/tokenization.py (modified)
@@ -87,7 +87,7 @@ def pretokenize_rows(
 
     Rows that carry neither ``messages`` nor ``text`` — preference
     rows destined for DPOTrainer — pass through untouched. DPOTrainer
-    owns its own tokenization path (Sprint 17).
+    owns its own tokenization path.
     """
     sha = tokenizer_sha256(tokenizer) if cache is not None else ""
     stats_hits = 0
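The pass-through rule reduces to a simple shape check on each row; a hedged sketch, with the helper names being illustrative only:

```python
def needs_pretokenization(row: dict) -> bool:
    # Preference rows for DPOTrainer carry neither key and pass through untouched.
    return "messages" in row or "text" in row

def split_rows(rows: list[dict]) -> tuple[list[dict], list[dict]]:
    to_tokenize = [r for r in rows if needs_pretokenization(r)]
    passthrough = [r for r in rows if not needs_pretokenization(r)]
    return to_tokenize, passthrough
```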