tenseleyflow/documentlanguagemodel / 0af0f7c

Scrub train support jargon

Authored by espadonne
SHA: 0af0f7c46a55e23096550fbc24092d17367a5130
Parents: 8a42142
Tree: 8830b17

15 changed files

Status  File  +  -
M src/dlm/train/__init__.py 3 3
M src/dlm/train/adapter.py 10 10
M src/dlm/train/checkpoint_commit.py 1 1
M src/dlm/train/determinism.py 3 4
M src/dlm/train/distributed/__init__.py 1 1
M src/dlm/train/distributed/gpus.py 3 3
M src/dlm/train/distributed/rank_io.py 2 3
M src/dlm/train/distributed/worker_entry.py 8 8
M src/dlm/train/gate/orchestrator.py 2 2
M src/dlm/train/loader.py 2 2
M src/dlm/train/logger.py 4 4
M src/dlm/train/multi_adapter/trainer.py 4 5
M src/dlm/train/oom_guard.py 4 4
M src/dlm/train/state_sidecar.py 20 20
M src/dlm/train/tokenization.py 1 1
src/dlm/train/__init__.py (modified)
@@ -1,8 +1,8 @@
 """Training engine — load base, attach LoRA/QLoRA, run one SFT cycle.
 
-See Sprint 09 for the design. Heavy imports (`torch`, `transformers`,
-`peft`, `trl`, `bitsandbytes`) are deferred to the functions that use
-them so `import dlm.train` stays cheap.
+Heavy imports (`torch`, `transformers`, `peft`, `trl`,
+`bitsandbytes`) are deferred to the functions that use them so
+`import dlm.train` stays cheap.
 """
 
 from __future__ import annotations
src/dlm/train/adapter.py (modified)
@@ -10,7 +10,7 @@ Three entry points:
   - `"resume"` → `PeftModel.from_pretrained(model, resume_path, is_trainable=True)`.
 - `apply_kbit_preparation(model, gradient_checkpointing)` runs
   `prepare_model_for_kbit_training` for the QLoRA path. MUST be called
-  BEFORE `get_peft_model` (audit risk noted in sprint spec).
+  BEFORE `get_peft_model`, which is the order PEFT + bitsandbytes require.
 
 The trainer's higher-level orchestrator composes these — we keep the
 individual functions small + testable.
@@ -41,12 +41,11 @@ def build_lora_config(
 ) -> Any:
     """Return a `peft.LoraConfig` sized for `spec`.
 
-    If `tokenizer_grew=True` (Sprint 07 bringup added a new pad token),
-    we MUST train the embedding + lm_head alongside the LoRA deltas —
+    If `tokenizer_grew=True` (bringup added a new pad token), we MUST
+    train the embedding + lm_head alongside the LoRA deltas —
     otherwise the new embedding row is undefined. `modules_to_save`
     inflates the adapter checkpoint size substantially; surfacing this
-    at the LoRA level keeps the tradeoff auditable (CLAUDE.md pitfall
-    #4 / audit F02).
+    at the LoRA level keeps the tradeoff explicit.
 
     `use_dora=True` switches from plain LoRA to DoRA (weight-
     decomposed low-rank adaptation). DoRA factors each weight update
@@ -75,11 +74,12 @@ def build_lora_config(
 def verify_resume_tokenizer_compat(adapter_dir: Path, *, tokenizer_grew: bool) -> None:
     """Assert the saved adapter's `modules_to_save` agrees with the current tokenizer.
 
-    Audit-04 M5: on resume, we load a LoRA adapter whose `adapter_config.json`
-    was written under a particular tokenizer state. If the current run's
-    tokenizer bringup grew the vocab but the saved adapter doesn't train
-    embeddings (or vice versa), the resumed training will silently corrupt
-    the `<|pad|>` row or fail to update a re-resized embedding table.
+    On resume, we load a LoRA adapter whose `adapter_config.json` was
+    written under a particular tokenizer state. If the current run's
+    tokenizer bringup grew the vocab but the saved adapter doesn't
+    train embeddings (or vice versa), the resumed training will
+    silently corrupt the `<|pad|>` row or fail to update a re-resized
+    embedding table.
 
     Raises `ResumeIntegrityError` with actionable text on mismatch. Missing
     or unreadable `adapter_config.json` is treated as a mismatch (the
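For context, a hedged sketch of the ordering and the `modules_to_save` tradeoff these docstrings describe, using the public PEFT API. The rank, alpha, and target module names below are illustrative, not the repo's configuration.

```python
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

def attach_adapter_sketch(model, *, use_qlora: bool, tokenizer_grew: bool,
                          gradient_checkpointing: bool):
    if use_qlora:
        # k-bit preparation must run BEFORE get_peft_model wraps the model.
        model = prepare_model_for_kbit_training(
            model, use_gradient_checkpointing=gradient_checkpointing
        )
    config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        task_type="CAUSAL_LM",
        # A grown tokenizer means a brand-new embedding row; train and save
        # embeddings + lm_head alongside the LoRA deltas or that row stays undefined.
        modules_to_save=["embed_tokens", "lm_head"] if tokenizer_grew else None,
    )
    return get_peft_model(model, config)
```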
src/dlm/train/checkpoint_commit.py (modified)
@@ -18,7 +18,7 @@ Lifecycle
    `training_state.pt` + sha256.
 3. `fsync_dir(path)` flushes the directory entry to disk.
 4. `store.set_current_adapter(path)` atomically flips the pointer via
-   `os.replace` on a tmp file (already implemented in Sprint 04).
+   `os.replace` on a tmp file.
 
 The `commit_version()` helper bundles steps 1 + 3 + 4 around a
 caller-supplied writer function, so the "happy path" is one call. If
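A minimal sketch of steps 3 and 4 (directory fsync plus the atomic pointer flip). `fsync_dir` and the `current` pointer file name follow the docstring; the rest is assumed.

```python
import os
from pathlib import Path

def fsync_dir(path: Path) -> None:
    # Flush the directory entry so a completed rename survives a crash.
    fd = os.open(path, os.O_RDONLY)
    try:
        os.fsync(fd)
    finally:
        os.close(fd)

def set_current_adapter(store_root: Path, adapter_path: Path) -> None:
    pointer = store_root / "current"
    tmp = pointer.with_suffix(".tmp")
    tmp.write_text(str(adapter_path) + "\n", encoding="utf-8")
    os.replace(tmp, pointer)  # atomic: readers see the old or new pointer, never a partial write
    fsync_dir(store_root)
```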
src/dlm/train/determinism.py (modified)
@@ -15,9 +15,8 @@ every RNG + backend flag the trainer touches:
 6. `torch.backends.cudnn.benchmark = False` — no autotuner (which is
    non-deterministic).
 
-MPS determinism is best-effort (audit F20); the `describe()` function
-surfaces this to the training banner so Apple Silicon users see the
-caveat before the run starts.
+MPS determinism is best-effort; the training banner surfaces this so
+Apple Silicon users see the caveat before the run starts.
 
 Heavy imports (`torch`, `numpy`) are deferred to call-sites so
 `import dlm.train` stays cheap.
@@ -99,7 +98,7 @@ def seed_everything(seed: int) -> DeterminismSummary:
         # the trainer logs a WARN for any that trigger.
         torch.use_deterministic_algorithms(True, warn_only=True)
         torch.backends.cudnn.benchmark = False
-    except ImportError:  # pragma: no cover — torch is a runtime dep once Sprint 09 lands
+    except ImportError:  # pragma: no cover — torch is a runtime dependency in production
         notes.append("torch not installed; determinism contract not enforced")
         class_ = "loose"
 
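For reference, a minimal seeding sketch along the lines of the checklist above; the note strings and the bare-list return are stand-ins for the module's `DeterminismSummary`.

```python
import os
import random

def seed_everything_sketch(seed: int) -> list[str]:
    notes: list[str] = []
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    try:
        import numpy as np
        np.random.seed(seed)
    except ImportError:
        notes.append("numpy not installed")
    try:
        import torch
        torch.manual_seed(seed)                 # seeds CPU and every CUDA device
        torch.use_deterministic_algorithms(True, warn_only=True)
        torch.backends.cudnn.benchmark = False  # the autotuner is non-deterministic
    except ImportError:
        notes.append("torch not installed; determinism contract not enforced")
    return notes
```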
src/dlm/train/distributed/__init__.py (modified)
@@ -1,6 +1,6 @@
 """Distributed (multi-GPU) training via HuggingFace Accelerate.
 
-Sprint 23 scope: single-node, multi-GPU DDP. No multi-node, FSDP, or
+Current scope: single-node, multi-GPU DDP. No multi-node, FSDP, or
 DeepSpeed.
 
 Surface:
src/dlm/train/distributed/gpus.py (modified)
@@ -25,9 +25,9 @@ class UnsupportedGpuSpecError(ValueError):
 def strip_gpus_flag(args: list[str], *, skip_argv0: bool = False) -> list[str]:
     """Drop `--gpus <value>` / `--gpus=<value>` from an argv-like list.
 
-    Shared helper (audit-08 N1) so the launcher side (strips argv[0]
-    because `accelerate launch -m <entry>` substitutes it) and the
-    worker side (passes argv[1:] from `sys.argv`) don't drift. The
+    Shared helper so the launcher side (strips argv[0] because
+    `accelerate launch -m <entry>` substitutes it) and the worker
+    side (passes argv[1:] from `sys.argv`) don't drift. The
     `skip_argv0` flag controls which input convention is used.
     """
     start = 1 if skip_argv0 else 0
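A hedged sketch of how the body of such a helper could continue from that `start` offset; the loop below is illustrative, not the repo's exact implementation.

```python
def strip_gpus_flag_sketch(args: list[str], *, skip_argv0: bool = False) -> list[str]:
    start = 1 if skip_argv0 else 0
    kept = list(args[:start])       # argv[0] is preserved untouched when present
    i = start
    while i < len(args):
        arg = args[i]
        if arg == "--gpus":
            i += 2                  # drop the flag and its separate value
        elif arg.startswith("--gpus="):
            i += 1                  # drop the fused `--gpus=<value>` form
        else:
            kept.append(arg)
            i += 1
    return kept

# e.g. strip_gpus_flag_sketch(["--gpus", "2", "--seed", "7"]) == ["--seed", "7"]
```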
src/dlm/train/distributed/rank_io.py (modified)
@@ -95,9 +95,8 @@ def gather_metrics(
 
     # We call gather per-metric to avoid building a tensor for a
     # heterogeneous dict. For a shape-stable dict of floats this is
-    # clearer than stacking. torch is a core runtime dep so the
-    # import is always available (audit-08 N6: dropped dead
-    # defensive try/except).
+    # clearer than stacking. torch is a core runtime dependency, so
+    # the import is always available.
    import torch
 
     reduced: dict[str, float] = {}
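A sketch of the per-metric gather pattern the comment describes, written against accelerate's `Accelerator.gather`; the mean reduction and the function shape are assumptions, not the module's API.

```python
import torch
from accelerate import Accelerator

def gather_metrics_sketch(accelerator: Accelerator, metrics: dict[str, float]) -> dict[str, float]:
    reduced: dict[str, float] = {}
    for key, value in metrics.items():
        # One scalar tensor per metric; gather returns a (world_size,) tensor.
        local = torch.tensor([value], device=accelerator.device)
        reduced[key] = accelerator.gather(local).float().mean().item()
    return reduced
```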
src/dlm/train/distributed/worker_entry.py (modified)
@@ -3,14 +3,14 @@
 Accelerate spawns one process per GPU; each invokes this module. The
 worker re-parses the subset of CLI args it cares about (path, seed,
 max_steps, resume/fresh, phase) and routes into the existing
-`dlm.train.trainer.run` — which Sprint 23 still owns the single-GPU
-I/O shape for.
+`dlm.train.trainer.run`, which still owns the single-process I/O
+shape.
 
 Full DDP integration (refactoring `trainer.run` to gate its I/O via
-`rank_io.master_only`) is tracked as Sprint 23 follow-up; this entry
-makes the launcher path complete end-to-end from the CLI but the
-actual multi-GPU training loop remains a scaffold until the
-integration test lands on real hardware.
+`rank_io.master_only`) remains follow-up work; this entry makes the
+launcher path complete end-to-end from the CLI but the actual
+multi-GPU training loop remains a scaffold until the integration test
+lands on real hardware.
 """
 
 from __future__ import annotations
@@ -43,8 +43,8 @@ def _strip_gpus_flag(args: list[str]) -> list[str]:
     """Drop `--gpus <value>` / `--gpus=<value>` from argv (worker side).
 
     Per-rank invocations must not recurse into the launcher branch of
-    `dlm train`. Delegates to the shared `strip_gpus_flag` helper
-    (audit-08 N1); the worker passes argv without argv[0].
+    `dlm train`. Delegates to the shared `strip_gpus_flag` helper;
+    the worker passes argv without argv[0].
     """
     from dlm.train.distributed.gpus import strip_gpus_flag
 
src/dlm/train/gate/orchestrator.py (modified)
@@ -110,8 +110,8 @@ def run_post_sft_gate(
     fewer than two adapters; callers don't need to check the config
     themselves. Embedding is injected as a callable so tests can stub
     it without loading an HF model. Any `GateTrainingError` is logged
-    and swallowed — Sprint 34 treats gate training as best-effort so
-    an SFT commit is never undone by a gate hiccup.
+    and swallowed — gate training is best-effort, so an SFT commit is
+    never undone by a gate hiccup.
     """
     training = parsed.frontmatter.training
     gate_cfg = training.gate
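A tiny sketch of the best-effort contract the docstring describes: the gate-specific error is caught and logged, anything else still propagates. `GateTrainingError` is stubbed here; only the shape of the try/except is the point.

```python
import logging

log = logging.getLogger(__name__)

class GateTrainingError(Exception):
    """Stub of the gate-specific error named in the docstring."""

def run_gate_best_effort(train_gate, *args, **kwargs) -> None:
    try:
        train_gate(*args, **kwargs)
    except GateTrainingError as exc:
        # Logged and swallowed so the already-committed SFT adapter is untouched.
        log.warning("gate training skipped: %s", exc)
```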
src/dlm/train/loader.py (modified)
@@ -110,8 +110,8 @@ def load_processor(spec: BaseModelSpec) -> Any: # pragma: no cover
 
 _AUDIO_MODEL_CLASSES: dict[str, str] = {
     # Maps `BaseModelSpec.architecture` → transformers class name.
-    # Sprint 35.2 v1 ships Qwen2-Audio only; add new entries here when
-    # more audio-LM families land in the registry.
+    # Add new entries here as more audio-LM families land in the
+    # registry.
     "Qwen2AudioForConditionalGeneration": "Qwen2AudioForConditionalGeneration",
 }
 
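One plausible way such an architecture-to-class map gets resolved at load time; the fallback to `AutoModelForCausalLM` is an assumption, not the repo's actual behaviour.

```python
import transformers

_AUDIO_MODEL_CLASSES = {
    "Qwen2AudioForConditionalGeneration": "Qwen2AudioForConditionalGeneration",
}

def resolve_model_class(architecture: str):
    class_name = _AUDIO_MODEL_CLASSES.get(architecture)
    if class_name is None:
        return transformers.AutoModelForCausalLM   # plain text-only base
    return getattr(transformers, class_name)       # audio-LM family
```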
src/dlm/train/logger.py (modified)
@@ -1,8 +1,8 @@
 """Structured per-step JSONL logger for training runs.
 
 One file per run at `logs/train-<run_id>-<started_at>.jsonl`. Every
-line is a self-describing JSON object so downstream tools (Sprint 20's
-`dlm metrics`, ad-hoc scripts) can parse it without state.
+line is a self-describing JSON object so downstream tools (`dlm
+metrics`, ad-hoc scripts) can parse it without state.
 
 Design
 ------
@@ -17,8 +17,8 @@ Design
   versions, and the training-plan snapshot. Followed by `"type":
   "step"` records with `step`, `loss`, `lr`, `grad_norm`,
   `tokens_per_sec`, and optional `val_loss` on eval steps.
-- **No Rich / tqdm here.** That's a Sprint 13 UX concern. The logger
-  is plain JSONL so CI and automation can consume it directly.
+- **No Rich / tqdm here.** The logger is plain JSONL so CI and
+  automation can consume it directly.
 """
 
 from __future__ import annotations
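A minimal sketch of appending one `"type": "step"` record with the fields the docstring lists; the writer below is illustrative, not the module's API.

```python
import json
from pathlib import Path

def append_step_record(log_path: Path, *, step: int, loss: float, lr: float,
                       grad_norm: float, tokens_per_sec: float,
                       val_loss: float | None = None) -> None:
    record: dict[str, object] = {
        "type": "step",
        "step": step,
        "loss": loss,
        "lr": lr,
        "grad_norm": grad_norm,
        "tokens_per_sec": tokens_per_sec,
    }
    if val_loss is not None:          # only present on eval steps
        record["val_loss"] = val_loss
    with log_path.open("a", encoding="utf-8") as fh:
        fh.write(json.dumps(record) + "\n")   # one self-describing object per line
```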
src/dlm/train/multi_adapter/trainer.py (modified)
@@ -14,7 +14,7 @@ to a single `run()` call — the orchestrator is a safe default entry
 point regardless of document shape.
 
 Scope note: inference selection, export merge, and doctor memory
-refusal layer on in sprint 20b. This module owns only the per-adapter
+refusal are handled elsewhere. This module owns only the per-adapter
 orchestration and the resulting adapter-versioned store layout.
 """
 
@@ -143,8 +143,7 @@ def _maybe_run_gate_pass(
     """Run the post-SFT learned-gate training pass when enabled.
 
     Kept separate so the multi-adapter orchestrator's happy path stays
-    short. All errors are swallowed — gate training is best-effort per
-    the Sprint 34 risk matrix.
+    short. All errors are swallowed — gate training is best-effort.
     """
     import logging
 
@@ -203,8 +202,8 @@ def _default_embedder(
 ) -> tuple[Callable[[str], Any], int]:  # pragma: no cover — heavy HF path
     """Default embedder — loads the HF base model + tokenizer.
 
-    Covered by the Sprint 34 slow integration test; unit tests pass a
-    stub via `gate_embed_factory`.
+    Covered by the slow integration tests; unit tests pass a stub via
+    `gate_embed_factory`.
     """
     from transformers import AutoTokenizer
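For orientation, a hedged sketch of what a default embedder of this shape might do: load tokenizer plus base model and mean-pool the last hidden state. The pooling choice and the `AutoModel` usage are assumptions; the repo's `_default_embedder` may differ.

```python
import torch
from transformers import AutoModel, AutoTokenizer

def default_embedder_sketch(base_model_id: str):
    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
    model = AutoModel.from_pretrained(base_model_id)
    model.eval()

    @torch.no_grad()
    def embed(text: str) -> torch.Tensor:
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        hidden = model(**inputs).last_hidden_state   # (1, seq_len, hidden)
        return hidden.mean(dim=1).squeeze(0)         # (hidden,) mean-pooled embedding

    return embed, model.config.hidden_size
```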
src/dlm/train/oom_guard.py (modified)
@@ -93,10 +93,10 @@ def catch_cuda_oom( # pragma: no cover
     Callers on non-CUDA devices can skip this — re-raising a
     non-CUDA OOM as an `OOMError` would be actively misleading.
     """
-    # Import torch up-front (audit-04 m4) so that a missing torch
-    # surfaces as a clean error at context-enter rather than inside the
-    # exception handler, where it would silently tunnel any caught
-    # exception past the OOM-reformatting path.
+    # Import torch up-front so that a missing torch surfaces as a
+    # clean error at context-enter rather than inside the exception
+    # handler, where it would silently tunnel any caught exception
+    # past the OOM-reformatting path.
     import torch
 
     try:
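A hedged sketch of the overall shape of such a guard: CUDA OOMs become a domain error, everything else propagates. `OOMError` is stubbed here and the message text is made up.

```python
from contextlib import contextmanager

class OOMError(RuntimeError):
    """Stub of the trainer's domain-specific OOM error."""

@contextmanager
def catch_cuda_oom_sketch(context: str):
    import torch  # up-front: a missing torch fails cleanly at context-enter

    try:
        yield
    except torch.cuda.OutOfMemoryError as exc:
        # Reformat only the CUDA OOM; any other exception tunnels through untouched.
        raise OOMError(f"CUDA out of memory during {context}: {exc}") from exc
```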
src/dlm/train/state_sidecar.py (modified)
@@ -59,11 +59,11 @@ STATE_SIDECAR_VERSION = 2
 RNG state out of the torch payload into a JSON sidecar, dropping
 `weights_only=False` from the load path. The writer always emits
 v2; the reader accepts v1 (legacy) with a migration warning."""
-# Run-level flags the inference path consumes without loading torch
-# (audit-05 M1): separate from `pinned_versions.json`, which is a pure
-# package-version manifest. This file records *how* the adapter was
-# trained — currently just the QLoRA flag; future fields (e.g., base
-# compute dtype) extend this rather than polluting version metadata.
+# Run-level flags the inference path consumes without loading torch:
+# separate from `pinned_versions.json`, which is a pure package-version
+# manifest. This file records *how* the adapter was trained —
+# currently just the QLoRA flag; future fields (e.g., base compute
+# dtype) extend this rather than polluting version metadata.
 TRAINING_RUN_FILENAME = "training_run.json"
 
 
@@ -101,10 +101,10 @@ class TrainingState(TypedDict):
     dlm_manifest_hash: str | None
     base_model_revision: str
     pinned_versions: PinnedVersions
-    # audit-05 M1: explicit QLoRA flag. `InferencePlan` reads this via
-    # `training_run.json` (written alongside) rather than inferring from
-    # the bitsandbytes version pin, which false-positives on plain LoRA
-    # runs on CUDA+bnb hosts.
+    # Explicit QLoRA flag. `InferencePlan` reads this via
+    # `training_run.json` (written alongside) rather than inferring
+    # from the bitsandbytes version pin, which false-positives on
+    # plain LoRA runs on CUDA+bnb hosts.
     use_qlora: bool
 
 
@@ -241,8 +241,8 @@ def save_state(directory: Path, state: TrainingState) -> None:
         json.dumps(dict(state["pinned_versions"]), sort_keys=True, indent=2) + "\n",
     )
 
-    # Run-level flags (audit-05 M1). Separate file so `InferencePlan`
-    # can read `use_qlora` without loading torch or the whole state dict.
+    # Run-level flags. Separate file so `InferencePlan` can read
+    # `use_qlora` without loading torch or the whole state dict.
     training_run_path = directory / TRAINING_RUN_FILENAME
     write_text(
         training_run_path,
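A tiny sketch of the kind of sidecar write described above; the schema beyond `use_qlora`, and the plain write call, are assumptions.

```python
import json
from pathlib import Path

TRAINING_RUN_FILENAME = "training_run.json"

def write_training_run(directory: Path, *, use_qlora: bool) -> None:
    payload = {"use_qlora": use_qlora}   # readable by the inference path without torch
    (directory / TRAINING_RUN_FILENAME).write_text(
        json.dumps(payload, sort_keys=True, indent=2) + "\n",
        encoding="utf-8",
    )
```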
@@ -284,10 +284,10 @@ def load_state(directory: Path, *, runtime_versions: PinnedVersions) -> Training
     try:
         torch_payload = torch.load(io.BytesIO(blob), weights_only=True)
     except Exception as weights_only_exc:
-        # Legacy v1 format (pre-audit-11 B7) stored everything including
-        # numpy ndarrays under weights_only=False. Retry with the legacy
-        # loader + log a one-time migration notice. The next release
-        # drops this branch; callers should re-save.
+        # Legacy v1 format stored everything including numpy ndarrays
+        # under weights_only=False. Retry with the legacy loader +
+        # log a one-time migration notice. The next release drops this
+        # branch; callers should re-save.
         try:
             torch_payload = torch.load(io.BytesIO(blob), weights_only=False)
         except Exception as exc:
@@ -362,11 +362,11 @@ def _merge_rng_sidecar(directory: Path, torch_payload: dict[str, Any]) -> dict[s
 def _version_diff(pinned: PinnedVersions, runtime: PinnedVersions) -> list[str]:
     """Return `["key: saved→current", ...]` for keys whose versions differ.
 
-    Asymmetric handling of `None` (audit-04 M6): losing a pinned package
-    between save + resume (e.g., a QLoRA checkpoint from a CUDA box
-    being resumed on Apple Silicon without `bitsandbytes`) is drift
-    the user should see. Gaining a package that wasn't pinned is not
-    drift — there was no prior state to diverge from.
+    Asymmetric handling of `None`: losing a pinned package between
+    save + resume (e.g., a QLoRA checkpoint from a CUDA box being
+    resumed on Apple Silicon without `bitsandbytes`) is drift the user
+    should see. Gaining a package that wasn't pinned is not drift —
+    there was no prior state to diverge from.
 
     Rules:
     - saved=str, current=str, equal    → no drift
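To make the asymmetry concrete, a minimal sketch over a plain name-to-version mapping (the real `PinnedVersions` type is assumed to behave like one):

```python
def version_diff_sketch(pinned: dict[str, str | None], runtime: dict[str, str | None]) -> list[str]:
    drift: list[str] = []
    for key, saved in pinned.items():
        current = runtime.get(key)
        if saved is None:
            continue                                    # never pinned: nothing to diverge from
        if current is None:
            drift.append(f"{key}: {saved}→missing")     # losing a pinned package is drift
        elif current != saved:
            drift.append(f"{key}: {saved}→{current}")
    return drift

# version_diff_sketch({"bitsandbytes": "0.43.1"}, {"bitsandbytes": None})
#   -> ["bitsandbytes: 0.43.1→missing"]
```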
src/dlm/train/tokenization.py (modified)
@@ -87,7 +87,7 @@ def pretokenize_rows(
 
     Rows that carry neither ``messages`` nor ``text`` — preference
     rows destined for DPOTrainer — pass through untouched. DPOTrainer
-    owns its own tokenization path (Sprint 17).
+    owns its own tokenization path.
     """
     sha = tokenizer_sha256(tokenizer) if cache is not None else ""
     stats_hits = 0
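The pass-through rule reduces to a simple shape check on each row; a hedged sketch, with the helper names being illustrative only:

```python
def needs_pretokenization(row: dict) -> bool:
    # Preference rows for DPOTrainer carry neither key and pass through untouched.
    return "messages" in row or "text" in row

def split_rows(rows: list[dict]) -> tuple[list[dict], list[dict]]:
    to_tokenize = [r for r in rows if needs_pretokenization(r)]
    passthrough = [r for r in rows if not needs_pretokenization(r)]
    return to_tokenize, passthrough
```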