Scrub train support jargon
- SHA: 0af0f7c46a55e23096550fbc24092d17367a5130
- Parents: 8a42142
- Tree: 8830b17

| Status | File | + | - |
|---|---|---|---|
| M | src/dlm/train/__init__.py | 3 | 3 |
| M | src/dlm/train/adapter.py | 10 | 10 |
| M | src/dlm/train/checkpoint_commit.py | 1 | 1 |
| M | src/dlm/train/determinism.py | 3 | 4 |
| M | src/dlm/train/distributed/__init__.py | 1 | 1 |
| M | src/dlm/train/distributed/gpus.py | 3 | 3 |
| M | src/dlm/train/distributed/rank_io.py | 2 | 3 |
| M | src/dlm/train/distributed/worker_entry.py | 8 | 8 |
| M | src/dlm/train/gate/orchestrator.py | 2 | 2 |
| M | src/dlm/train/loader.py | 2 | 2 |
| M | src/dlm/train/logger.py | 4 | 4 |
| M | src/dlm/train/multi_adapter/trainer.py | 4 | 5 |
| M | src/dlm/train/oom_guard.py | 4 | 4 |
| M | src/dlm/train/state_sidecar.py | 20 | 20 |
| M | src/dlm/train/tokenization.py | 1 | 1 |

src/dlm/train/__init__.py (modified)
@@ -1,8 +1,8 @@
| 1 | 1 | """Training engine — load base, attach LoRA/QLoRA, run one SFT cycle. |
| 2 | 2 | |
| 3 | -See Sprint 09 for the design. Heavy imports (`torch`, `transformers`, | |
| 4 | -`peft`, `trl`, `bitsandbytes`) are deferred to the functions that use | |
| 5 | -them so `import dlm.train` stays cheap. | |
| 3 | +Heavy imports (`torch`, `transformers`, `peft`, `trl`, | |
| 4 | +`bitsandbytes`) are deferred to the functions that use them so | |
| 5 | +`import dlm.train` stays cheap. | |
| 6 | 6 | """ |
| 7 | 7 | |
| 8 | 8 | from __future__ import annotations |

src/dlm/train/adapter.py (modified)
@@ -10,7 +10,7 @@ Three entry points:
| 10 | 10 | - `"resume"` → `PeftModel.from_pretrained(model, resume_path, is_trainable=True)`. |
| 11 | 11 | - `apply_kbit_preparation(model, gradient_checkpointing)` runs |
| 12 | 12 | `prepare_model_for_kbit_training` for the QLoRA path. MUST be called |
| 13 | - BEFORE `get_peft_model` (audit risk noted in sprint spec). | |
| 13 | + BEFORE `get_peft_model`, which is the order PEFT + bitsandbytes require. | |
| 14 | 14 | |
| 15 | 15 | The trainer's higher-level orchestrator composes these — we keep the |
| 16 | 16 | individual functions small + testable. |
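
A minimal sketch of the ordering contract described above, assuming the public `peft` APIs (`prepare_model_for_kbit_training`, `get_peft_model`, `LoraConfig`); the function name, hyperparameters, and target/saved module names are placeholders, not values from this repo.

```python
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training


def attach_qlora_adapter(base_model, *, tokenizer_grew: bool = False,
                         gradient_checkpointing: bool = True):
    # Step 1: k-bit preparation runs on the raw quantized model BEFORE any
    # PEFT wrapper is applied (casts norms, enables input grads, etc.).
    base_model = prepare_model_for_kbit_training(
        base_model, use_gradient_checkpointing=gradient_checkpointing
    )
    # Step 2: only now size and attach the LoRA adapter.
    lora_cfg = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        # Only when bringup grew the vocab: also train embeddings + lm_head,
        # at the cost of a much larger adapter checkpoint.
        modules_to_save=["embed_tokens", "lm_head"] if tokenizer_grew else None,
        task_type="CAUSAL_LM",
    )
    return get_peft_model(base_model, lora_cfg)
```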

@@ -41,12 +41,11 @@
| 41 | 41 | ) -> Any: |
| 42 | 42 | """Return a `peft.LoraConfig` sized for `spec`. |
| 43 | 43 | |
| 44 | - If `tokenizer_grew=True` (Sprint 07 bringup added a new pad token), | |
| 45 | - we MUST train the embedding + lm_head alongside the LoRA deltas — | |
| 44 | + If `tokenizer_grew=True` (bringup added a new pad token), we MUST | |
| 45 | + train the embedding + lm_head alongside the LoRA deltas — | |
| 46 | 46 | otherwise the new embedding row is undefined. `modules_to_save` |
| 47 | 47 | inflates the adapter checkpoint size substantially; surfacing this |
| 48 | - at the LoRA level keeps the tradeoff auditable (CLAUDE.md pitfall | |
| 49 | - #4 / audit F02). | |
| 48 | + at the LoRA level keeps the tradeoff explicit. | |
| 50 | 49 | |
| 51 | 50 | `use_dora=True` switches from plain LoRA to DoRA (weight- |
| 52 | 51 | decomposed low-rank adaptation). DoRA factors each weight update |

@@ -75,11 +74,12 @@ def build_lora_config(
| 75 | 74 | def verify_resume_tokenizer_compat(adapter_dir: Path, *, tokenizer_grew: bool) -> None: |
| 76 | 75 | """Assert the saved adapter's `modules_to_save` agrees with the current tokenizer. |
| 77 | 76 | |
| 78 | - Audit-04 M5: on resume, we load a LoRA adapter whose `adapter_config.json` | |
| 79 | - was written under a particular tokenizer state. If the current run's | |
| 80 | - tokenizer bringup grew the vocab but the saved adapter doesn't train | |
| 81 | - embeddings (or vice versa), the resumed training will silently corrupt | |
| 82 | - the `<|pad|>` row or fail to update a re-resized embedding table. | |
| 77 | + On resume, we load a LoRA adapter whose `adapter_config.json` was | |
| 78 | + written under a particular tokenizer state. If the current run's | |
| 79 | + tokenizer bringup grew the vocab but the saved adapter doesn't | |
| 80 | + train embeddings (or vice versa), the resumed training will | |
| 81 | + silently corrupt the `<|pad|>` row or fail to update a re-resized | |
| 82 | + embedding table. | |
| 83 | 83 | |
| 84 | 84 | Raises `ResumeIntegrityError` with actionable text on mismatch. Missing |
| 85 | 85 | or unreadable `adapter_config.json` is treated as a mismatch (the |
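
A rough sketch of the compatibility check, assuming PEFT's `adapter_config.json` records `modules_to_save`; `ResumeIntegrityError` is the project's exception (stubbed here) and the function name is hypothetical.

```python
import json
from pathlib import Path


class ResumeIntegrityError(RuntimeError):
    """Placeholder for the project's resume-integrity exception."""


def check_resume_tokenizer_compat(adapter_dir: Path, *, tokenizer_grew: bool) -> None:
    cfg_path = adapter_dir / "adapter_config.json"
    try:
        saved = json.loads(cfg_path.read_text())
    except (OSError, json.JSONDecodeError) as exc:
        # Missing or unreadable config is treated as a mismatch.
        raise ResumeIntegrityError(f"cannot read {cfg_path}: {exc}") from exc
    saved_trains_embeddings = bool(saved.get("modules_to_save"))
    if saved_trains_embeddings != tokenizer_grew:
        raise ResumeIntegrityError(
            "saved adapter and current tokenizer disagree about training "
            "the embedding/lm_head rows; refusing to resume"
        )
```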

src/dlm/train/checkpoint_commit.py (modified)
@@ -18,7 +18,7 @@ Lifecycle
| 18 | 18 | `training_state.pt` + sha256. |
| 19 | 19 | 3. `fsync_dir(path)` flushes the directory entry to disk. |
| 20 | 20 | 4. `store.set_current_adapter(path)` atomically flips the pointer via |
| 21 | - `os.replace` on a tmp file (already implemented in Sprint 04). | |
| 21 | + `os.replace` on a tmp file. | |
| 22 | 22 | |
| 23 | 23 | The `commit_version()` helper bundles steps 1 + 3 + 4 around a |
| 24 | 24 | caller-supplied writer function, so the "happy path" is one call. If |
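
A minimal sketch of the durability steps above: flush the directory entry, then flip the pointer with `os.replace`. `fsync_dir` and the pointer-file layout are illustrative stand-ins for the store's real helpers (POSIX semantics assumed).

```python
import os
from pathlib import Path


def fsync_dir(path: Path) -> None:
    # Flush the *directory entry* so a freshly written version dir survives a crash.
    fd = os.open(path, os.O_RDONLY)
    try:
        os.fsync(fd)
    finally:
        os.close(fd)


def set_current_adapter(store_root: Path, version_dir: Path) -> None:
    pointer = store_root / "current_adapter"
    tmp = pointer.with_suffix(".tmp")
    tmp.write_text(str(version_dir) + "\n")
    os.replace(tmp, pointer)  # atomic: readers see the old or new pointer, never a mix
```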

src/dlm/train/determinism.py (modified)
@@ -15,9 +15,8 @@ every RNG + backend flag the trainer touches:
| 15 | 15 | 6. `torch.backends.cudnn.benchmark = False` — no autotuner (which is |
| 16 | 16 | non-deterministic). |
| 17 | 17 | |
| 18 | -MPS determinism is best-effort (audit F20); the `describe()` function | |
| 19 | -surfaces this to the training banner so Apple Silicon users see the | |
| 20 | -caveat before the run starts. | |
| 18 | +MPS determinism is best-effort; the training banner surfaces this so | |
| 19 | +Apple Silicon users see the caveat before the run starts. | |
| 21 | 20 | |
| 22 | 21 | Heavy imports (`torch`, `numpy`) are deferred to call-sites so |
| 23 | 22 | `import dlm.train` stays cheap. |

@@ -99,7 +98,7 @@ def seed_everything(seed: int) -> DeterminismSummary:
| 99 | 98 | # the trainer logs a WARN for any that trigger. |
| 100 | 99 | torch.use_deterministic_algorithms(True, warn_only=True) |
| 101 | 100 | torch.backends.cudnn.benchmark = False |
| 102 | - except ImportError: # pragma: no cover — torch is a runtime dep once Sprint 09 lands | |
| 101 | + except ImportError: # pragma: no cover — torch is a runtime dependency in production | |
| 103 | 102 | notes.append("torch not installed; determinism contract not enforced") |
| 104 | 103 | class_ = "loose" |
| 105 | 104 | |
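
A condensed sketch of the contract the docstring enumerates (seed the RNGs, prefer deterministic kernels, disable the cuDNN autotuner), with heavy imports deferred as the module requires; the real `seed_everything` also records what it did in a `DeterminismSummary`.

```python
import random


def seed_everything_sketch(seed: int) -> None:
    random.seed(seed)
    try:
        import numpy as np
        np.random.seed(seed)
    except ImportError:
        pass
    try:
        import torch
        torch.manual_seed(seed)  # seeds CPU and CUDA generators on recent torch
        torch.use_deterministic_algorithms(True, warn_only=True)  # WARN, don't crash
        torch.backends.cudnn.benchmark = False  # the autotuner is non-deterministic
    except ImportError:
        pass  # mirror the module: note the gap rather than failing hard
```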

src/dlm/train/distributed/__init__.py (modified)
@@ -1,6 +1,6 @@
| 1 | 1 | """Distributed (multi-GPU) training via HuggingFace Accelerate. |
| 2 | 2 | |
| 3 | -Sprint 23 scope: single-node, multi-GPU DDP. No multi-node, FSDP, or | |
| 3 | +Current scope: single-node, multi-GPU DDP. No multi-node, FSDP, or | |
| 4 | 4 | DeepSpeed. |
| 5 | 5 | |
| 6 | 6 | Surface: |

src/dlm/train/distributed/gpus.py (modified)
@@ -25,9 +25,9 @@ class UnsupportedGpuSpecError(ValueError):
| 25 | 25 | def strip_gpus_flag(args: list[str], *, skip_argv0: bool = False) -> list[str]: |
| 26 | 26 | """Drop `--gpus <value>` / `--gpus=<value>` from an argv-like list. |
| 27 | 27 | |
| 28 | - Shared helper (audit-08 N1) so the launcher side (strips argv[0] | |
| 29 | - because `accelerate launch -m <entry>` substitutes it) and the | |
| 30 | - worker side (passes argv[1:] from `sys.argv`) don't drift. The | |
| 28 | + Shared helper so the launcher side (strips argv[0] because | |
| 29 | + `accelerate launch -m <entry>` substitutes it) and the worker | |
| 30 | + side (passes argv[1:] from `sys.argv`) don't drift. The | |
| 31 | 31 | `skip_argv0` flag controls which input convention is used. |
| 32 | 32 | """ |
| 33 | 33 | start = 1 if skip_argv0 else 0 |
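
A possible implementation of that contract, covering both the two-token form (`--gpus 2`) and the single-token form (`--gpus=2`); only the function name is invented here.

```python
def strip_gpus_flag_sketch(args: list[str], *, skip_argv0: bool = False) -> list[str]:
    start = 1 if skip_argv0 else 0
    out = list(args[:start])
    i = start
    while i < len(args):
        arg = args[i]
        if arg == "--gpus":
            i += 2  # skip the flag and its value
            continue
        if arg.startswith("--gpus="):
            i += 1
            continue
        out.append(arg)
        i += 1
    return out


assert strip_gpus_flag_sketch(["dlm", "train", "--gpus=2", "doc.md"],
                              skip_argv0=True) == ["dlm", "train", "doc.md"]
```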

src/dlm/train/distributed/rank_io.py (modified)
@@ -95,9 +95,8 @@ def gather_metrics(
| 95 | 95 | |
| 96 | 96 | # We call gather per-metric to avoid building a tensor for a |
| 97 | 97 | # heterogeneous dict. For a shape-stable dict of floats this is |
| 98 | - # clearer than stacking. torch is a core runtime dep so the | |
| 99 | - # import is always available (audit-08 N6: dropped dead | |
| 100 | - # defensive try/except). | |
| 98 | + # clearer than stacking. torch is a core runtime dependency, so | |
| 99 | + # the import is always available. | |
| 101 | 100 | import torch |
| 102 | 101 | |
| 103 | 102 | reduced: dict[str, float] = {} |
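
A hedged sketch of the per-metric reduction pattern the comment describes, assuming an `accelerate.Accelerator` handle and that metrics are averaged across ranks; the real `gather_metrics` signature and reduction rule may differ.

```python
import torch


def gather_metrics_sketch(accelerator, metrics: dict[str, float]) -> dict[str, float]:
    reduced: dict[str, float] = {}
    for key, value in metrics.items():
        # One tiny tensor per metric beats packing a heterogeneous dict
        # into a single stacked tensor.
        local = torch.tensor([value], dtype=torch.float32, device=accelerator.device)
        gathered = accelerator.gather(local)  # shape: (num_processes,)
        reduced[key] = gathered.mean().item()
    return reduced
```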

src/dlm/train/distributed/worker_entry.py (modified)
@@ -3,14 +3,14 @@
| 3 | 3 | Accelerate spawns one process per GPU; each invokes this module. The |
| 4 | 4 | worker re-parses the subset of CLI args it cares about (path, seed, |
| 5 | 5 | max_steps, resume/fresh, phase) and routes into the existing |
| 6 | -`dlm.train.trainer.run` — which Sprint 23 still owns the single-GPU | |
| 7 | -I/O shape for. | |
| 6 | +`dlm.train.trainer.run`, which still owns the single-process I/O | |
| 7 | +shape. | |
| 8 | 8 | |
| 9 | 9 | Full DDP integration (refactoring `trainer.run` to gate its I/O via |
| 10 | -`rank_io.master_only`) is tracked as Sprint 23 follow-up; this entry | |
| 11 | -makes the launcher path complete end-to-end from the CLI but the | |
| 12 | -actual multi-GPU training loop remains a scaffold until the | |
| 13 | -integration test lands on real hardware. | |
| 10 | +`rank_io.master_only`) remains follow-up work; this entry makes the | |
| 11 | +launcher path complete end-to-end from the CLI but the actual | |
| 12 | +multi-GPU training loop remains a scaffold until the integration test | |
| 13 | +lands on real hardware. | |
| 14 | 14 | """ |
| 15 | 15 | |
| 16 | 16 | from __future__ import annotations |

@@ -43,8 +43,8 @@ def _strip_gpus_flag(args: list[str]) -> list[str]:
| 43 | 43 | """Drop `--gpus <value>` / `--gpus=<value>` from argv (worker side). |
| 44 | 44 | |
| 45 | 45 | Per-rank invocations must not recurse into the launcher branch of |
| 46 | - `dlm train`. Delegates to the shared `strip_gpus_flag` helper | |
| 47 | - (audit-08 N1); the worker passes argv without argv[0]. | |
| 46 | + `dlm train`. Delegates to the shared `strip_gpus_flag` helper; | |
| 47 | + the worker passes argv without argv[0]. | |
| 48 | 48 | """ |
| 49 | 49 | from dlm.train.distributed.gpus import strip_gpus_flag |
| 50 | 50 | |

src/dlm/train/gate/orchestrator.py (modified)
@@ -110,8 +110,8 @@ def run_post_sft_gate(
| 110 | 110 | fewer than two adapters; callers don't need to check the config |
| 111 | 111 | themselves. Embedding is injected as a callable so tests can stub |
| 112 | 112 | it without loading an HF model. Any `GateTrainingError` is logged |
| 113 | - and swallowed — Sprint 34 treats gate training as best-effort so | |
| 114 | - an SFT commit is never undone by a gate hiccup. | |
| 113 | + and swallowed — gate training is best-effort, so an SFT commit is | |
| 114 | + never undone by a gate hiccup. | |
| 115 | 115 | """ |
| 116 | 116 | training = parsed.frontmatter.training |
| 117 | 117 | gate_cfg = training.gate |
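
The best-effort contract, reduced to its shape: log the failure and keep going, so the committed SFT adapter is never rolled back by a gate problem. `GateTrainingError` is the project's exception (stubbed here); the wrapper name is hypothetical.

```python
import logging

log = logging.getLogger(__name__)


class GateTrainingError(RuntimeError):
    """Placeholder for the project's gate-training exception."""


def run_gate_best_effort(train_gate, *args, **kwargs) -> None:
    try:
        train_gate(*args, **kwargs)
    except GateTrainingError as exc:
        log.warning("gate training failed; keeping the SFT commit: %s", exc)
```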

src/dlm/train/loader.py (modified)
@@ -110,8 +110,8 @@ def load_processor(spec: BaseModelSpec) -> Any: # pragma: no cover
| 110 | 110 | |
| 111 | 111 | _AUDIO_MODEL_CLASSES: dict[str, str] = { |
| 112 | 112 | # Maps `BaseModelSpec.architecture` → transformers class name. |
| 113 | - # Sprint 35.2 v1 ships Qwen2-Audio only; add new entries here when | |
| 114 | - # more audio-LM families land in the registry. | |
| 113 | + # Add new entries here as more audio-LM families land in the | |
| 114 | + # registry. | |
| 115 | 115 | "Qwen2AudioForConditionalGeneration": "Qwen2AudioForConditionalGeneration", |
| 116 | 116 | } |
| 117 | 117 | |
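
One plausible way a registry mapping like `_AUDIO_MODEL_CLASSES` gets consumed: resolve the class name on `transformers` and dispatch. The loader shown here is an assumption, not this module's actual code.

```python
import transformers

_AUDIO_MODEL_CLASSES = {
    "Qwen2AudioForConditionalGeneration": "Qwen2AudioForConditionalGeneration",
}


def load_audio_model_sketch(architecture: str, model_id: str):
    try:
        class_name = _AUDIO_MODEL_CLASSES[architecture]
    except KeyError:
        raise ValueError(f"unsupported audio architecture: {architecture!r}") from None
    model_cls = getattr(transformers, class_name)
    return model_cls.from_pretrained(model_id)
```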

src/dlm/train/logger.py (modified)
@@ -1,8 +1,8 @@
| 1 | 1 | """Structured per-step JSONL logger for training runs. |
| 2 | 2 | |
| 3 | 3 | One file per run at `logs/train-<run_id>-<started_at>.jsonl`. Every |
| 4 | -line is a self-describing JSON object so downstream tools (Sprint 20's | |
| 5 | -`dlm metrics`, ad-hoc scripts) can parse it without state. | |
| 4 | +line is a self-describing JSON object so downstream tools (`dlm | |
| 5 | +metrics`, ad-hoc scripts) can parse it without state. | |
| 6 | 6 | |
| 7 | 7 | Design |
| 8 | 8 | ------ |

@@ -17,8 +17,8 @@ Design
| 17 | 17 | versions, and the training-plan snapshot. Followed by `"type": |
| 18 | 18 | "step"` records with `step`, `loss`, `lr`, `grad_norm`, |
| 19 | 19 | `tokens_per_sec`, and optional `val_loss` on eval steps. |
| 20 | -- **No Rich / tqdm here.** That's a Sprint 13 UX concern. The logger | |
| 21 | - is plain JSONL so CI and automation can consume it directly. | |
| 20 | +- **No Rich / tqdm here.** The logger is plain JSONL so CI and | |
| 21 | + automation can consume it directly. | |
| 22 | 22 | """ |
| 23 | 23 | |
| 24 | 24 | from __future__ import annotations |
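
A minimal sketch of the plain-JSONL contract: one self-describing object per line, a `"meta"` record first, then `"step"` records. Field names follow the docstring; the class shape and flush policy are illustrative.

```python
import json
import time
from pathlib import Path


class JsonlTrainLogger:
    def __init__(self, path: Path) -> None:
        self._fh = path.open("a", encoding="utf-8")

    def _write(self, record: dict) -> None:
        self._fh.write(json.dumps(record, sort_keys=True) + "\n")
        self._fh.flush()  # keep tails readable in CI even if the run dies

    def log_meta(self, **fields) -> None:
        self._write({"type": "meta", "ts": time.time(), **fields})

    def log_step(self, step: int, loss: float, lr: float, **extra) -> None:
        self._write({"type": "step", "step": step, "loss": loss, "lr": lr, **extra})
```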

src/dlm/train/multi_adapter/trainer.py (modified)
@@ -14,7 +14,7 @@ to a single `run()` call — the orchestrator is a safe default entry
| 14 | 14 | point regardless of document shape. |
| 15 | 15 | |
| 16 | 16 | Scope note: inference selection, export merge, and doctor memory |
| 17 | -refusal layer on in sprint 20b. This module owns only the per-adapter | |
| 17 | +refusal are handled elsewhere. This module owns only the per-adapter | |
| 18 | 18 | orchestration and the resulting adapter-versioned store layout. |
| 19 | 19 | """ |
| 20 | 20 | |

@@ -143,8 +143,7 @@
| 143 | 143 | """Run the post-SFT learned-gate training pass when enabled. |
| 144 | 144 | |
| 145 | 145 | Kept separate so the multi-adapter orchestrator's happy path stays |
| 146 | - short. All errors are swallowed — gate training is best-effort per | |
| 147 | - the Sprint 34 risk matrix. | |
| 146 | + short. All errors are swallowed — gate training is best-effort. | |
| 148 | 147 | """ |
| 149 | 148 | import logging |
| 150 | 149 | |

@@ -203,8 +202,8 @@ def _default_embedder(
| 203 | 202 | ) -> tuple[Callable[[str], Any], int]: # pragma: no cover — heavy HF path |
| 204 | 203 | """Default embedder — loads the HF base model + tokenizer. |
| 205 | 204 | |
| 206 | - Covered by the Sprint 34 slow integration test; unit tests pass a | |
| 207 | - stub via `gate_embed_factory`. | |
| 205 | + Covered by the slow integration tests; unit tests pass a stub via | |
| 206 | + `gate_embed_factory`. | |
| 208 | 207 | """ |
| 209 | 208 | from transformers import AutoTokenizer |
| 210 | 209 | |
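
How a unit test might satisfy `gate_embed_factory` without the heavy HF path: return a `(callable, dim)` pair like `_default_embedder` does. The toy embedding below is entirely made up.

```python
def fake_embed_factory():
    dim = 8

    def embed(text: str) -> list[float]:
        # Deterministic toy features: character counts, padded to a fixed width.
        vec = [float(text.count(ch)) for ch in "aeiou rst"][:dim]
        return vec + [0.0] * (dim - len(vec))

    return embed, dim
```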

src/dlm/train/oom_guard.py (modified)
@@ -93,10 +93,10 @@ def catch_cuda_oom( # pragma: no cover
| 93 | 93 | Callers on non-CUDA devices can skip this — re-raising a |
| 94 | 94 | non-CUDA OOM as an `OOMError` would be actively misleading. |
| 95 | 95 | """ |
| 96 | - # Import torch up-front (audit-04 m4) so that a missing torch | |
| 97 | - # surfaces as a clean error at context-enter rather than inside the | |
| 98 | - # exception handler, where it would silently tunnel any caught | |
| 99 | - # exception past the OOM-reformatting path. | |
| 96 | + # Import torch up-front so that a missing torch surfaces as a | |
| 97 | + # clean error at context-enter rather than inside the exception | |
| 98 | + # handler, where it would silently tunnel any caught exception | |
| 99 | + # past the OOM-reformatting path. | |
| 100 | 100 | import torch |
| 101 | 101 | |
| 102 | 102 | try: |
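
A sketch of the context-manager shape described above, assuming `torch.cuda.OutOfMemoryError` (present in modern torch) and a project-level `OOMError` (stubbed here); the real guard carries more diagnostics.

```python
from contextlib import contextmanager


class OOMError(RuntimeError):
    """Placeholder for the project's user-facing OOM error."""


@contextmanager
def catch_cuda_oom_sketch(hint: str = ""):
    import torch  # up-front: a missing torch fails here, not inside the handler
    try:
        yield
    except torch.cuda.OutOfMemoryError as exc:
        suffix = f": {hint}" if hint else ""
        raise OOMError(f"CUDA out of memory{suffix}") from exc
```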

src/dlm/train/state_sidecar.py (modified)
@@ -59,11 +59,11 @@ STATE_SIDECAR_VERSION = 2
| 59 | 59 | RNG state out of the torch payload into a JSON sidecar, dropping |
| 60 | 60 | `weights_only=False` from the load path. The writer always emits |
| 61 | 61 | v2; the reader accepts v1 (legacy) with a migration warning.""" |
| 62 | -# Run-level flags the inference path consumes without loading torch | |
| 63 | -# (audit-05 M1): separate from `pinned_versions.json`, which is a pure | |
| 64 | -# package-version manifest. This file records *how* the adapter was | |
| 65 | -# trained — currently just the QLoRA flag; future fields (e.g., base | |
| 66 | -# compute dtype) extend this rather than polluting version metadata. | |
| 62 | +# Run-level flags the inference path consumes without loading torch: | |
| 63 | +# separate from `pinned_versions.json`, which is a pure package-version | |
| 64 | +# manifest. This file records *how* the adapter was trained — | |
| 65 | +# currently just the QLoRA flag; future fields (e.g., base compute | |
| 66 | +# dtype) extend this rather than polluting version metadata. | |
| 67 | 67 | TRAINING_RUN_FILENAME = "training_run.json" |
| 68 | 68 | |
| 69 | 69 | |
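
An illustrative torch-free reader for the run-level flags file, based on the `use_qlora` field described here; error handling is omitted for brevity.

```python
import json
from pathlib import Path


def read_use_qlora(adapter_dir: Path) -> bool:
    payload = json.loads((adapter_dir / "training_run.json").read_text())
    return bool(payload.get("use_qlora", False))
```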

@@ -101,10 +101,10 @@ class TrainingState(TypedDict):
| 101 | 101 | dlm_manifest_hash: str | None |
| 102 | 102 | base_model_revision: str |
| 103 | 103 | pinned_versions: PinnedVersions |
| 104 | - # audit-05 M1: explicit QLoRA flag. `InferencePlan` reads this via | |
| 105 | - # `training_run.json` (written alongside) rather than inferring from | |
| 106 | - # the bitsandbytes version pin, which false-positives on plain LoRA | |
| 107 | - # runs on CUDA+bnb hosts. | |
| 104 | + # Explicit QLoRA flag. `InferencePlan` reads this via | |
| 105 | + # `training_run.json` (written alongside) rather than inferring | |
| 106 | + # from the bitsandbytes version pin, which false-positives on | |
| 107 | + # plain LoRA runs on CUDA+bnb hosts. | |
| 108 | 108 | use_qlora: bool |
| 109 | 109 | |
| 110 | 110 | |

@@ -241,8 +241,8 @@ def save_state(directory: Path, state: TrainingState) -> None:
| 241 | 241 | json.dumps(dict(state["pinned_versions"]), sort_keys=True, indent=2) + "\n", |
| 242 | 242 | ) |
| 243 | 243 | |
| 244 | - # Run-level flags (audit-05 M1). Separate file so `InferencePlan` | |
| 245 | - # can read `use_qlora` without loading torch or the whole state dict. | |
| 244 | + # Run-level flags. Separate file so `InferencePlan` can read | |
| 245 | + # `use_qlora` without loading torch or the whole state dict. | |
| 246 | 246 | training_run_path = directory / TRAINING_RUN_FILENAME |
| 247 | 247 | write_text( |
| 248 | 248 | training_run_path, |

@@ -284,10 +284,10 @@ def load_state(directory: Path, *, runtime_versions: PinnedVersions) -> Training
| 284 | 284 | try: |
| 285 | 285 | torch_payload = torch.load(io.BytesIO(blob), weights_only=True) |
| 286 | 286 | except Exception as weights_only_exc: |
| 287 | - # Legacy v1 format (pre-audit-11 B7) stored everything including | |
| 288 | - # numpy ndarrays under weights_only=False. Retry with the legacy | |
| 289 | - # loader + log a one-time migration notice. The next release | |
| 290 | - # drops this branch; callers should re-save. | |
| 287 | + # Legacy v1 format stored everything including numpy ndarrays | |
| 288 | + # under weights_only=False. Retry with the legacy loader + | |
| 289 | + # log a one-time migration notice. The next release drops this | |
| 290 | + # branch; callers should re-save. | |
| 291 | 291 | try: |
| 292 | 292 | torch_payload = torch.load(io.BytesIO(blob), weights_only=False) |
| 293 | 293 | except Exception as exc: |
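
The fallback shape in isolation: prefer the safe `weights_only=True` path, then retry the legacy loader with a warning. `blob` handling and logging specifics are illustrative.

```python
import io
import logging

import torch

log = logging.getLogger(__name__)


def load_torch_payload(blob: bytes) -> dict:
    try:
        return torch.load(io.BytesIO(blob), weights_only=True)
    except Exception:
        log.warning("training_state.pt looks like legacy v1; re-save to migrate")
        return torch.load(io.BytesIO(blob), weights_only=False)
```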

@@ -362,11 +362,11 @@ def _merge_rng_sidecar(directory: Path, torch_payload: dict[str, Any]) -> dict[s
| 362 | 362 | def _version_diff(pinned: PinnedVersions, runtime: PinnedVersions) -> list[str]: |
| 363 | 363 | """Return `["key: saved→current", ...]` for keys whose versions differ. |
| 364 | 364 | |
| 365 | - Asymmetric handling of `None` (audit-04 M6): losing a pinned package | |
| 366 | - between save + resume (e.g., a QLoRA checkpoint from a CUDA box | |
| 367 | - being resumed on Apple Silicon without `bitsandbytes`) is drift | |
| 368 | - the user should see. Gaining a package that wasn't pinned is not | |
| 369 | - drift — there was no prior state to diverge from. | |
| 365 | + Asymmetric handling of `None`: losing a pinned package between | |
| 366 | + save + resume (e.g., a QLoRA checkpoint from a CUDA box being | |
| 367 | + resumed on Apple Silicon without `bitsandbytes`) is drift the user | |
| 368 | + should see. Gaining a package that wasn't pinned is not drift — | |
| 369 | + there was no prior state to diverge from. | |
| 370 | 370 | |
| 371 | 371 | Rules: |
| 372 | 372 | - saved=str, current=str, equal → no drift |
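
One possible reading of the asymmetric-None rule: a pin that disappears is drift worth reporting, a package that appears without a prior pin is not. Keys and the arrow formatting mirror the docstring; the function name is hypothetical.

```python
def version_diff_sketch(pinned: dict[str, str | None],
                        runtime: dict[str, str | None]) -> list[str]:
    drift: list[str] = []
    for key, saved in pinned.items():
        if saved is None:
            continue  # never pinned, so nothing to diverge from
        current = runtime.get(key)
        if current != saved:
            drift.append(f"{key}: {saved}→{current}")
    return drift
```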

src/dlm/train/tokenization.py (modified)
@@ -87,7 +87,7 @@ def pretokenize_rows(
| 87 | 87 | |
| 88 | 88 | Rows that carry neither ``messages`` nor ``text`` — preference |
| 89 | 89 | rows destined for DPOTrainer — pass through untouched. DPOTrainer |
| 90 | - owns its own tokenization path (Sprint 17). | |
| 90 | + owns its own tokenization path. | |
| 91 | 91 | """ |
| 92 | 92 | sha = tokenizer_sha256(tokenizer) if cache is not None else "" |
| 93 | 93 | stats_hits = 0 |
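
A tiny sketch of the pass-through rule: rows carrying neither `messages` nor `text` are yielded untouched for DPOTrainer; `tokenize_row` is a hypothetical stand-in for the real per-row tokenization.

```python
def pretokenize_rows_sketch(rows, tokenize_row):
    for row in rows:
        if "messages" not in row and "text" not in row:
            yield row  # preference row: DPOTrainer owns its own tokenization
        else:
            yield tokenize_row(row)
```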