Scrub inference jargon
- SHA: 15c70167628e7b241830f0bbb48586c92a9de819
- Parents: e7cc6ed
- Tree: 4cd8d62
| Status | File | + | - |
|---|---|---|---|
| M | src/dlm/inference/audio_loader.py | 1 | 1 |
| M | src/dlm/inference/backends/base.py | 6 | 7 |
| M | src/dlm/inference/generate.py | 3 | 3 |
| M | src/dlm/inference/loader.py | 4 | 4 |
| M | src/dlm/inference/plan.py | 7 | 7 |
src/dlm/inference/audio_loader.py (modified)

@@ -44,7 +44,7 @@ def load_for_audio_inference(  # pragma: no cover

     Pragma'd from unit coverage — exercises class-named model load +
     `AutoProcessor.from_pretrained` over real HF weights. Covered by
-    the Sprint 35.2 slow integration test (T12).
+    the slow audio integration test (T12).
     """
     if spec.modality != "audio-language":
         raise ValueError(
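For context, a minimal sketch of what a "class-named model load" over real HF weights could look like; the `spec.model_class` and `spec.model_id` field names here are assumptions for illustration (only `spec.modality` appears in the diff above), and this is not the repository's actual loader.

```python
# Hypothetical sketch — field names other than `spec.modality` are assumed.
import transformers
from transformers import AutoProcessor


def load_audio_model(spec):
    if spec.modality != "audio-language":
        raise ValueError(f"expected an audio-language spec, got {spec.modality!r}")
    # Resolve the concrete model class by name instead of AutoModel, since
    # some audio-language architectures need their specific class.
    model_cls = getattr(transformers, spec.model_class)
    model = model_cls.from_pretrained(spec.model_id)
    processor = AutoProcessor.from_pretrained(spec.model_id)
    return model, processor
```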
src/dlm/inference/backends/base.py (modified)

@@ -1,16 +1,15 @@
 """`InferenceBackend` Protocol shared by PyTorch + MLX paths.

-Phase 5 Sprint 21 introduces a second inference backend (MLX) for
-Apple Silicon throughput. The existing PyTorch path stays authoritative
-on every other platform and remains the training-time runtime. This
-Protocol is the shape both paths satisfy so the CLI + REPL can treat
-them interchangeably.
+MLX provides a second inference backend for Apple Silicon throughput.
+The existing PyTorch path stays authoritative on every other platform
+and remains the training-time runtime. This Protocol is the shape both
+paths satisfy so the CLI + REPL can treat them interchangeably.

 Backends are stateful: `load()` resolves the adapter, loads weights,
 and stashes the live model on `self`; `generate()` is called repeatedly
 against that loaded state; `unload()` releases memory. Pooling /
-reuse across CLI invocations is a later concern (Sprint 24 REPL) —
-the shape supports it without mandating it yet.
+reuse across CLI invocations is a later concern — the shape supports
+it without mandating it yet.
 """

 from __future__ import annotations
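As a reference point, here is a minimal sketch of the stateful load / generate / unload shape the docstring describes. The method signatures are illustrative assumptions, not the repository's actual Protocol definition.

```python
# Illustrative sketch only — signatures are assumed, not taken from the repo.
from typing import Protocol


class InferenceBackend(Protocol):
    def load(self, adapter_dir: str) -> None:
        """Resolve the adapter, load weights, stash the live model on self."""
        ...

    def generate(self, prompt: str, **kwargs) -> str:
        """Run generation against the already-loaded state."""
        ...

    def unload(self) -> None:
        """Release model memory between CLI invocations."""
        ...
```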
src/dlm/inference/generate.py (modified)

@@ -7,8 +7,8 @@ Deterministic generation requires ALL of:
 - `num_beams=1`
 - `temperature=0.0` (technically moot when do_sample=False, but
   some HF code paths still read it — belt and braces)
-- The model's cuDNN flags set to deterministic mode (Sprint 09
-  `determinism.seed_everything` handles this at `dlm train` time)
+- The model's cuDNN flags set to deterministic mode
+  (`determinism.seed_everything` handles this at `dlm train` time)

 When the caller passes `temperature > 0`, we flip `do_sample=True`
 automatically — otherwise a non-zero temperature is silently ignored
@@ -107,7 +107,7 @@ def generate(  # pragma: no cover
     """Render `prompt`, run generation, decode response-only tokens.

     Pragma'd from unit coverage because it calls `model.generate`.
-    Covered by Sprint 10's slow-marked integration test.
+    Covered by the slow-marked integration test.
     """
     import torch

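The determinism rules listed in that docstring can be condensed into a small sketch; this is illustrative only, and the actual `generate()` in this repo may assemble its kwargs differently.

```python
# Sketch of the deterministic-vs-sampling rule described above (assumed shape).
def build_generation_kwargs(temperature: float, max_new_tokens: int = 256) -> dict:
    if temperature > 0:
        # A non-zero temperature only matters when sampling is on, so flip
        # do_sample rather than silently ignoring the value.
        return {
            "do_sample": True,
            "temperature": temperature,
            "max_new_tokens": max_new_tokens,
        }
    # Deterministic path: greedy decode, single beam, temperature pinned to
    # 0.0 because some HF code paths read it even with do_sample=False.
    return {
        "do_sample": False,
        "num_beams": 1,
        "temperature": 0.0,
        "max_new_tokens": max_new_tokens,
    }
```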
src/dlm/inference/loader.py (modified)

@@ -16,9 +16,9 @@ Given a `StorePath` and the current host's `Capabilities`, resolve an
 fp16 residual on top of a fp16 base.

 The tokenizer is loaded from the **adapter directory**, not the
-`store.cache/`, because Sprint 07's bringup persists the final
+`store.cache/`, because tokenizer bringup persists the final
 tokenizer state (including `<|pad|>` additions) into the adapter dir
-at training-end. This is the cross-sprint contract F02 depends on.
+at training-end. This is the contract export and inference depend on.

 Heavy imports are deferred; the orchestration logic that picks args,
 paths, and dtypes is unit-testable without HF.
@@ -140,7 +140,7 @@ def load_for_inference(  # pragma: no cover

     Pragma'd from unit coverage because it calls `AutoModelForCausalLM.from_pretrained`
     and `PeftModel.from_pretrained`, which each need ~5 seconds and a
-    real HF cache. Covered by Sprint 10's slow-marked integration test.
+    real HF cache. Covered by the slow-marked integration test.

     `adapter_name`, when provided, targets the named multi-adapter
     layout (`adapter/<name>/current.txt`). When `None`, uses the flat
@@ -164,7 +164,7 @@ def load_for_inference(  # pragma: no cover
     model.eval()

     # Tokenizer from the adapter dir — source of truth after any
-    # vocab growth (Sprint 07 bringup contract).
+    # vocab growth from training-time bringup.
     tokenizer = AutoTokenizer.from_pretrained(str(adapter_path))

     return LoadedInference(
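The load path that docstring and hunk describe roughly follows the standard PEFT pattern; a hedged sketch is below. The function name, argument names, and base-model resolution are assumptions, not the repository's exact code — the grounded parts are the three `from_pretrained` calls, `model.eval()`, and the tokenizer coming from the adapter dir.

```python
# Rough, assumed shape of the adapter load path described above.
from pathlib import Path

from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer


def load_adapter_for_inference(base_model_id: str, adapter_path: Path):
    base = AutoModelForCausalLM.from_pretrained(base_model_id)
    # LoRA residual weights are applied on top of the base model.
    model = PeftModel.from_pretrained(base, str(adapter_path))
    model.eval()
    # Tokenizer from the adapter dir, not the HF cache, so any vocab growth
    # done at training time (e.g. an added `<|pad|>`) is preserved.
    tokenizer = AutoTokenizer.from_pretrained(str(adapter_path))
    return model, tokenizer
```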
src/dlm/inference/plan.py (modified)

@@ -1,4 +1,4 @@
-"""`InferencePlan` — cross-hardware load plan for prompt-time (audit F05).
+"""`InferencePlan` — cross-hardware load plan for prompt-time.

 The problem
 -----------
@@ -14,9 +14,9 @@ be on.
 The solution
 ------------

-`InferencePlan` is the twin of Sprint 05's `TrainingPlan`: a
-hardware-doctor decision, but for the inference path. It reads the
-saved adapter's training metadata (`training_run.json`, with a legacy
+`InferencePlan` is the inference-side twin of `TrainingPlan`: a
+hardware-doctor decision for prompt-time loading. It reads the saved
+adapter's training metadata (`training_run.json`, with a legacy
 `pinned_versions.json` fallback) to learn
 whether QLoRA was in play, cross-references with the current `Capabilities`,
 and emits:
@@ -73,8 +73,8 @@ def resolve_inference(adapter_dir: Path, caps: Any) -> InferencePlan:
     Decision tree:
     - CUDA host + bnb installed + QLoRA-trained → 4-bit load, no dequant.
     - CUDA host, QLoRA-trained, but bnb missing → dequantize to fp16.
-    - Non-CUDA host + QLoRA-trained → dequantize to fp16 (the "audit
-      F05" scenario: laptop inference of a server-trained adapter).
+    - Non-CUDA host + QLoRA-trained → dequantize to fp16 (the
+      cross-hardware laptop/server scenario).
     - Non-QLoRA adapter → load at the host's best precision (bf16 on
       capable CUDA, else fp16).
     """
@@ -121,7 +121,7 @@ def resolve_inference(adapter_dir: Path, caps: Any) -> InferencePlan:
             attn_implementation="sdpa",
             reason=(
                 f"QLoRA adapter on {backend} host; dequantizing to fp16 "
-                "(bitsandbytes is CUDA-only). Audit F05 cross-hardware path."
+                "(bitsandbytes is CUDA-only)."
             ),
         )
     return InferencePlan(
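For quick reference, the decision tree in that docstring condenses to the sketch below. The `caps` attribute names (`has_cuda`, `has_bnb`, `supports_bf16`) and the returned dict are assumptions for illustration; the real `resolve_inference` returns an `InferencePlan`.

```python
# Condensed, assumed sketch of the QLoRA/precision decision tree above.
def pick_plan(caps, qlora_trained: bool) -> dict:
    if qlora_trained and caps.has_cuda and caps.has_bnb:
        # Trained quantized and the host can load quantized: keep 4-bit weights.
        return {"load_in_4bit": True, "dtype": None}
    if qlora_trained:
        # bnb missing on CUDA, or a non-CUDA host (the cross-hardware
        # laptop/server case): dequantize to fp16.
        return {"load_in_4bit": False, "dtype": "float16"}
    # Non-QLoRA adapter: best precision the host supports.
    dtype = "bfloat16" if caps.has_cuda and caps.supports_bf16 else "float16"
    return {"load_in_4bit": False, "dtype": dtype}
```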