tenseleyflow/documentlanguagemodel / 15c7016

Scrub inference jargon

Authored by espadonne
SHA: 15c70167628e7b241830f0bbb48586c92a9de819
Parents: e7cc6ed
Tree: 4cd8d62

5 changed files

Status  File  +  -
M src/dlm/inference/audio_loader.py 1 1
M src/dlm/inference/backends/base.py 6 7
M src/dlm/inference/generate.py 3 3
M src/dlm/inference/loader.py 4 4
M src/dlm/inference/plan.py 7 7
src/dlm/inference/audio_loader.py (modified)
@@ -44,7 +44,7 @@ def load_for_audio_inference( # pragma: no cover
 
     Pragma'd from unit coverage — exercises class-named model load +
     `AutoProcessor.from_pretrained` over real HF weights. Covered by
-    the Sprint 35.2 slow integration test (T12).
+    the slow audio integration test (T12).
     """
     if spec.modality != "audio-language":
         raise ValueError(
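For readers of the diff, the load shape this docstring alludes to looks roughly like the sketch below. The function, its argument, and the class-name resolution are illustrative assumptions, not the repo's actual code; only `AutoConfig.from_pretrained` and `AutoProcessor.from_pretrained` are real transformers APIs.

```python
# Illustrative only: resolve the concrete model class named in the checkpoint's
# config, then pair it with an AutoProcessor. Names are placeholders, not dlm's API.
import transformers
from transformers import AutoConfig, AutoProcessor


def load_audio_model(checkpoint: str):
    config = AutoConfig.from_pretrained(checkpoint)
    # "Class-named" load: pick the class listed in config.architectures
    # rather than going through an Auto* mapping.
    model_cls = getattr(transformers, config.architectures[0])
    model = model_cls.from_pretrained(checkpoint)
    # The processor bundles the feature extractor + tokenizer for audio prompts.
    processor = AutoProcessor.from_pretrained(checkpoint)
    return model, processor
```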
src/dlm/inference/backends/base.py (modified)
@@ -1,16 +1,15 @@
 """`InferenceBackend` Protocol shared by PyTorch + MLX paths.
 
-Phase 5 Sprint 21 introduces a second inference backend (MLX) for
-Apple Silicon throughput. The existing PyTorch path stays authoritative
-on every other platform and remains the training-time runtime. This
-Protocol is the shape both paths satisfy so the CLI + REPL can treat
-them interchangeably.
+MLX provides a second inference backend for Apple Silicon throughput.
+The existing PyTorch path stays authoritative on every other platform
+and remains the training-time runtime. This Protocol is the shape both
+paths satisfy so the CLI + REPL can treat them interchangeably.
 
 Backends are stateful: `load()` resolves the adapter, loads weights,
 and stashes the live model on `self`; `generate()` is called repeatedly
 against that loaded state; `unload()` releases memory. Pooling /
-reuse across CLI invocations is a later concern (Sprint 24 REPL) —
-the shape supports it without mandating it yet.
+reuse across CLI invocations is a later concern — the shape supports
+it without mandating it yet.
 """
 
 from __future__ import annotations
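A minimal sketch of the stateful Protocol shape that docstring describes. Method names follow the text; the signatures, parameter names, and return types here are assumptions, not the repo's actual interface.

```python
# Sketch only: the load/generate/unload lifecycle from the docstring,
# with illustrative signatures.
from typing import Protocol


class InferenceBackend(Protocol):
    def load(self, adapter_dir: str) -> None:
        """Resolve the adapter, load weights, and stash the live model on self."""
        ...

    def generate(self, prompt: str, temperature: float = 0.0) -> str:
        """Run one generation against the already-loaded state."""
        ...

    def unload(self) -> None:
        """Release weights so another adapter (or backend) can be loaded."""
        ...
```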
src/dlm/inference/generate.py (modified)
@@ -7,8 +7,8 @@ Deterministic generation requires ALL of:
 - `num_beams=1`
 - `temperature=0.0` (technically moot when do_sample=False, but
   some HF code paths still read it — belt and braces)
-- The model's cuDNN flags set to deterministic mode (Sprint 09
-  `determinism.seed_everything` handles this at `dlm train` time)
+- The model's cuDNN flags set to deterministic mode
+  (`determinism.seed_everything` handles this at `dlm train` time)
 
 When the caller passes `temperature > 0`, we flip `do_sample=True`
 automatically — otherwise a non-zero temperature is silently ignored
@@ -107,7 +107,7 @@ def generate( # pragma: no cover
     """Render `prompt`, run generation, decode response-only tokens.
 
     Pragma'd from unit coverage because it calls `model.generate`.
-    Covered by Sprint 10's slow-marked integration test.
+    Covered by the slow-marked integration test.
     """
     import torch
 
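The determinism rule in that module docstring, condensed into a standalone helper. The helper and its name are illustrative, not the module's actual `generate()` implementation; the keys match standard Hugging Face `generate()` kwargs.

```python
# Illustrative helper: greedy, single-beam decoding by default, flipping to
# sampling only when the caller passes a non-zero temperature.
def build_generation_kwargs(temperature: float) -> dict:
    kwargs = {
        "do_sample": False,  # greedy decoding
        "num_beams": 1,      # no beam search
        "temperature": 0.0,  # moot when do_sample=False, but some HF paths read it
    }
    if temperature > 0:
        # Without this flip, HF would silently ignore the non-zero temperature.
        kwargs["do_sample"] = True
        kwargs["temperature"] = temperature
    return kwargs
```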
src/dlm/inference/loader.py (modified)
@@ -16,9 +16,9 @@ Given a `StorePath` and the current host's `Capabilities`, resolve an
   fp16 residual on top of a fp16 base.
 
 The tokenizer is loaded from the **adapter directory**, not the
-`store.cache/`, because Sprint 07's bringup persists the final
+`store.cache/`, because tokenizer bringup persists the final
 tokenizer state (including `<|pad|>` additions) into the adapter dir
-at training-end. This is the cross-sprint contract F02 depends on.
+at training-end. This is the contract export and inference depend on.
 
 Heavy imports are deferred; the orchestration logic that picks args,
 paths, and dtypes is unit-testable without HF.
@@ -140,7 +140,7 @@ def load_for_inference( # pragma: no cover
 
     Pragma'd from unit coverage because it calls `AutoModelForCausalLM.from_pretrained`
     and `PeftModel.from_pretrained`, which each need ~5 seconds and a
-    real HF cache. Covered by Sprint 10's slow-marked integration test.
+    real HF cache. Covered by the slow-marked integration test.
 
     `adapter_name`, when provided, targets the named multi-adapter
     layout (`adapter/<name>/current.txt`). When `None`, uses the flat
@@ -164,7 +164,7 @@ def load_for_inference( # pragma: no cover
     model.eval()
 
     # Tokenizer from the adapter dir — source of truth after any
-    # vocab growth (Sprint 07 bringup contract).
+    # vocab growth from training-time bringup.
     tokenizer = AutoTokenizer.from_pretrained(str(adapter_path))
 
     return LoadedInference(
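The load order these docstrings describe, as a hedged sketch. `base_model_id` and `adapter_path` are placeholder arguments; the real `load_for_inference` resolves them (plus the dtype/quantization plan) from a `StorePath` and the host's `Capabilities`.

```python
# Sketch of the base-model + adapter + tokenizer load order described above.
from pathlib import Path

from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer


def load_adapter_sketch(base_model_id: str, adapter_path: Path):
    base = AutoModelForCausalLM.from_pretrained(base_model_id)
    # The LoRA residual is applied on top of the base weights.
    model = PeftModel.from_pretrained(base, str(adapter_path))
    model.eval()
    # Tokenizer from the adapter dir: it holds the final vocab, including any
    # pad-token additions made at training time.
    tokenizer = AutoTokenizer.from_pretrained(str(adapter_path))
    return model, tokenizer
```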
src/dlm/inference/plan.py (modified)
@@ -1,4 +1,4 @@
-"""`InferencePlan` — cross-hardware load plan for prompt-time (audit F05).
+"""`InferencePlan` — cross-hardware load plan for prompt-time.
 
 The problem
 -----------
@@ -14,9 +14,9 @@ be on.
 The solution
 ------------
 
-`InferencePlan` is the twin of Sprint 05's `TrainingPlan`: a
-hardware-doctor decision, but for the inference path. It reads the
-saved adapter's training metadata (`training_run.json`, with a legacy
+`InferencePlan` is the inference-side twin of `TrainingPlan`: a
+hardware-doctor decision for prompt-time loading. It reads the saved
+adapter's training metadata (`training_run.json`, with a legacy
 `pinned_versions.json` fallback) to learn
 whether QLoRA was in play, cross-references with the current `Capabilities`,
 and emits:
@@ -73,8 +73,8 @@ def resolve_inference(adapter_dir: Path, caps: Any) -> InferencePlan:
     Decision tree:
     - CUDA host + bnb installed + QLoRA-trained → 4-bit load, no dequant.
     - CUDA host, QLoRA-trained, but bnb missing → dequantize to fp16.
-    - Non-CUDA host + QLoRA-trained → dequantize to fp16 (the "audit
-      F05" scenario: laptop inference of a server-trained adapter).
+    - Non-CUDA host + QLoRA-trained → dequantize to fp16 (the
+      cross-hardware laptop/server scenario).
     - Non-QLoRA adapter → load at the host's best precision (bf16 on
       capable CUDA, else fp16).
     """
@@ -121,7 +121,7 @@ def resolve_inference(adapter_dir: Path, caps: Any) -> InferencePlan:
             attn_implementation="sdpa",
             reason=(
                 f"QLoRA adapter on {backend} host; dequantizing to fp16 "
-                "(bitsandbytes is CUDA-only). Audit F05 cross-hardware path."
+                "(bitsandbytes is CUDA-only)."
             ),
         )
     return InferencePlan(
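The decision tree quoted in `resolve_inference`'s docstring, condensed into a sketch. The boolean inputs and returned labels are illustrative assumptions; the real function reads them from the adapter metadata and `Capabilities` and emits an `InferencePlan`.

```python
# Condensed sketch of the docstring's decision tree (labels are illustrative).
def pick_load_strategy(qlora_trained: bool, cuda: bool, bnb_installed: bool, bf16_capable: bool) -> str:
    if qlora_trained:
        if cuda and bnb_installed:
            return "4-bit load, no dequant"
        # bnb missing, or a non-CUDA host (bitsandbytes is CUDA-only):
        return "dequantize to fp16"
    # Non-QLoRA adapter: load at the host's best precision.
    return "bf16" if (cuda and bf16_capable) else "fp16"
```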