tenseleyflow/documentlanguagemodel / 95402d2

Browse files

Scrub base model jargon

Authored by espadonne
SHA
95402d2749cc6da27f45a29d66ed72f56e81ce7f
Parents
5e24f0e
Tree
e00f9d0

10 changed files

Status  File  +  -
M src/dlm/base_models/errors.py 2 3
M src/dlm/base_models/license.py 15 16
M src/dlm/base_models/probes.py 20 22
M src/dlm/base_models/registry.py 22 24
M src/dlm/base_models/resolver.py 2 3
M src/dlm/base_models/schema.py 12 13
M src/dlm/base_models/templates/chatml.jinja 3 3
M src/dlm/base_models/templates/llama3.jinja 1 1
M src/dlm/base_models/templates/mistral.jinja 1 1
M src/dlm/base_models/templates/phi3.jinja 1 1
src/dlm/base_models/errors.py modified
@@ -65,9 +65,8 @@ class ProbeFailedError(BaseModelError):
6565
 class GatedModelError(BaseModelError):
6666
     """Model requires HuggingFace license acceptance and the user hasn't accepted.
6767
 
68
-    Lives here (not in Sprint 12b) because registry probes catch it
69
-    first; Sprint 12b's `dlm init --i-accept-license` flow writes the
70
-    acceptance record, but the error shape is owned here.
68
+    Lives here because registry probes catch it first; the acceptance
69
+    record is written elsewhere, but the error shape is owned here.
7170
     """
7271
 
7372
     def __init__(self, hf_id: str, license_url: str | None) -> None:
src/dlm/base_models/license.py modified
@@ -1,22 +1,21 @@
1
-"""License acceptance records for gated base models (Sprint 12b).
1
+"""License acceptance records for gated base models.
22
 
33
 The `BaseModelSpec` schema already carries `requires_acceptance`,
4
-`redistributable`, `license_spdx`, and `license_url` (Sprint 06). What
5
-Sprint 12b adds is the *acceptance record* — a small Pydantic model
6
-that stores "user X accepted license Y at time T via path Z", plus a
7
-helper that validates an `accept_license` flag against the spec.
4
+`redistributable`, `license_spdx`, and `license_url`. This module adds
5
+the *acceptance record* — a small Pydantic model that stores "user X
6
+accepted license Y at time T via path Z", plus a helper that validates
7
+an `accept_license` flag against the spec.
88
 
99
 `LicenseAcceptance` rides on two load-bearing files:
1010
 
11
-- `manifest.json.license_acceptance` (this sprint): the per-store
12
-  durable record; reads on every subsequent `dlm train` to verify
13
-  the acceptance fingerprint is still present.
14
-- Repo-level `dlm.lock.license_acceptance` (Sprint 15): the
15
-  determinism-contract mirror; divergence between the two triggers a
16
-  lock re-check.
11
+- `manifest.json.license_acceptance`: the per-store durable record;
12
+  read on every subsequent `dlm train` to verify the acceptance
13
+  fingerprint is still present.
14
+- Repo-level `dlm.lock.license_acceptance`: the determinism-contract
15
+  mirror; divergence between the two triggers a lock re-check.
1716
 
18
-The interactive prompt in `dlm init` is owned by Sprint 13; this
19
-module ships the data types + helpers that sprint will call.
17
+The interactive prompt in `dlm init` lives in the CLI layer; this
18
+module ships the data types + helpers that the prompt calls.
2019
 """
2120
 
2221
 from __future__ import annotations
@@ -38,12 +37,12 @@ class LicenseAcceptance(BaseModel):
3837
     `via` records *how* acceptance was captured:
3938
 
4039
     - `"cli_flag"` — `--i-accept-license` on init/train (explicit).
41
-    - `"interactive"` — `y/N` prompt (Sprint 13 UX).
42
-    - `"frontmatter"` — persisted in `.dlm` frontmatter (Sprint 13).
40
+    - `"interactive"` — `y/N` prompt.
41
+    - `"frontmatter"` — persisted in `.dlm` frontmatter.
4342
 
4443
     The `license_url` is captured at acceptance time so a later
4544
     upstream URL change is auditable (the recorded URL stays the
46
-    user's contract; warn-not-fail on drift per Sprint 12b risks).
45
+    user's contract; drift is visible without rewriting history).
4746
     """
4847
 
4948
     model_config = ConfigDict(extra="forbid", frozen=True)
src/dlm/base_models/probes.py modified
@@ -11,17 +11,17 @@ Five probes:
1111
    `spec.architecture`. Catches model-surgery mismatches and wrong
1212
    revisions.
1313
 2. `probe_chat_template` — tokenizer has a non-empty `chat_template`
14
-   attribute. Essential for Sprint 12's Modelfile emission.
14
+   attribute. Essential for Modelfile emission.
1515
 3. `probe_gguf_arch_supported` — scans the vendored
1616
    `convert_hf_to_gguf.py` for a `@Model.register("<arch>")` matching
17
-   `spec.gguf_arch`. Sprint 11 owns the vendored submodule; until then
18
-   the probe skips with a clear message.
17
+   `spec.gguf_arch`. If the vendored submodule is absent, the probe
18
+   skips with a clear message.
1919
 4. `probe_pretokenizer_label` — reads `vendor/llama_cpp_pretokenizer_hashes.json`
2020
    (populated by `scripts/bump-llama-cpp.sh`) and checks the spec's
2121
    `tokenizer_pre` is a known **label**. Silent drift here causes
2222
    silent GGUF export failures per findings §9; the probe catches it
2323
    early. This is the offline fast-check.
24
-5. `probe_pretokenizer_hash` — real fingerprint check (audit-04 B8 /
24
+5. `probe_pretokenizer_hash` — real fingerprint check (see
2525
    CLAUDE.md pitfall #5). Tokenizes `_LLAMA_CPP_CHKTXT` and compares
2626
    the sha256 of the stringified token sequence against a vendored
2727
    per-label fingerprint table. Detects silent upstream tokenization
@@ -45,7 +45,7 @@ from dlm.base_models.schema import BaseModelSpec
4545
 
4646
 _LOG = logging.getLogger(__name__)
4747
 
48
-# Vendored artifact locations (Sprint 11 populates `vendor/llama.cpp`).
48
+# Vendored artifact locations.
4949
 _REPO_ROOT: Final[Path] = Path(__file__).resolve().parents[3]
5050
 VENDOR_LLAMA_CPP_DEFAULT: Final[Path] = _REPO_ROOT / "vendor" / "llama.cpp"
5151
 VENDOR_PRETOKENIZER_HASHES_DEFAULT: Final[Path] = (
@@ -60,7 +60,7 @@ VENDOR_PRETOKENIZER_FINGERPRINTS_DEFAULT: Final[Path] = (
6060
 # stringify the resulting token-id list, sha256 it — that digest is
6161
 # the fingerprint llama.cpp maps to one of its pre-tokenizer types.
6262
 # Keep verbatim; any edit here desynchronizes us from llama.cpp's
63
-# identification logic (audit-04 B8 + CLAUDE.md pitfall #5).
63
+# identification logic (see CLAUDE.md pitfall #5).
6464
 _LLAMA_CPP_CHKTXT: Final[str] = (
6565
     "\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n"
6666
     "🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ "
@@ -167,14 +167,14 @@ def probe_gguf_arch_supported(
167167
     """Scan vendored ``convert_hf_to_gguf.py`` for
168168
     ``@Model.register("<gguf_arch>")`` or ``@ModelBase.register(...)``.
169169
 
170
-    Until Sprint 11 lands the submodule, this probe skips.
170
+    If the vendored converter submodule is absent, this probe skips.
171171
     """
172172
     script = (vendor_path or VENDOR_LLAMA_CPP_DEFAULT) / "convert_hf_to_gguf.py"
173173
     if not script.exists():
174174
         return ProbeResult(
175175
             name="gguf_arch",
176176
             passed=True,
177
-            detail=f"skipped: {script} not present (Sprint 11 vendors llama.cpp)",
177
+            detail=f"skipped: {script} not present (vendor/llama.cpp missing)",
178178
             skipped=True,
179179
         )
180180
 
@@ -231,10 +231,9 @@ def probe_pretokenizer_label(
231231
     The vendored table is a JSON array of label strings that llama.cpp
232232
     recognizes in `get_vocab_base_pre()`. Missing table → skip.
233233
 
234
-    NOTE (audit-04 M7): this is a *label* probe, not a hash probe.
235
-    Sprint 11 will add real `probe_pretokenizer_hash` that canonically
236
-    digests `tokenizer.json` and compares against llama.cpp's fingerprint
237
-    table. For now we check coarse compatibility via the label.
234
+    NOTE: this is a *label* probe, not a hash probe.
235
+    `probe_pretokenizer_hash` is the canonical fingerprint check; this
236
+    probe only checks coarse compatibility via the label.
238237
     """
239238
     path = hashes_path or VENDOR_PRETOKENIZER_HASHES_DEFAULT
240239
     if not path.exists():
@@ -283,11 +282,11 @@ def probe_pretokenizer_hash(
283282
 ) -> ProbeResult:
284283
     """Compute the real llama.cpp pre-tokenizer fingerprint and compare.
285284
 
286
-    Audit-04 B8 / CLAUDE.md pitfall #5. The label probe (above) only
287
-    checks membership in a string table; llama.cpp itself identifies
288
-    the pre-tokenizer by sha256-hashing the token-id sequence produced
289
-    by tokenizing a stable test string (`_LLAMA_CPP_CHKTXT`). We do
290
-    the same here — if the upstream tokenizer changes behavior (new
285
+    See CLAUDE.md pitfall #5. The label probe (above) only checks
286
+    membership in a string table; llama.cpp itself identifies the
287
+    pre-tokenizer by sha256-hashing the token-id sequence produced by
288
+    tokenizing a stable test string (`_LLAMA_CPP_CHKTXT`). We do the
289
+    same here — if the upstream tokenizer changes behavior (new
291290
     revision, silently different merges), the fingerprint drifts and
292291
     this probe fails loudly *before* a broken GGUF reaches Ollama.
293292
 
@@ -560,7 +559,7 @@ def run_all(spec: BaseModelSpec, *, skip_export_probes: bool = False) -> ProbeRe
560559
     vendored llama.cpp can absorb (e.g. brand-new Qwen3 on a llama.cpp
561560
     pin from last month). They forfeit `dlm export` to Ollama until
562561
     the vendored copy catches up. VL bases auto-opt-out of export
563
-    probes — GGUF conversion for VL archs is tracked in Sprint 35.4.
562
+    probes because current GGUF export does not support them.
564563
     """
565564
     from dlm.modality import modality_for
566565
 
@@ -573,10 +572,9 @@ def run_all(spec: BaseModelSpec, *, skip_export_probes: bool = False) -> ProbeRe
573572
     else:
574573
         core = (*core, probe_chat_template(spec))
575574
 
576
-    # Media bases (VL + audio) bypass the llama.cpp-converter probes:
577
-    # converter support for VL archs is Sprint 35.4's scope, and audio
578
-    # archs are not on any llama.cpp roadmap yet. The export path
579
-    # refuses GGUF cleanly for both and emits an HF snapshot instead.
575
+    # Media bases (VL + audio) bypass the llama.cpp-converter probes.
576
+    # The export path refuses GGUF cleanly for both and emits an HF
577
+    # snapshot instead.
580578
     is_media = dispatch.requires_processor
581579
     if skip_export_probes or is_media:
582580
         return ProbeReport(hf_id=spec.hf_id, results=core)
src/dlm/base_models/registry.py modified
@@ -9,19 +9,19 @@ Notes on individual entries:
99
   with <100M MAU). We record it as `license_spdx="Other"` and surface
1010
   the URL via `license_url`; it remains `redistributable=True` because
1111
   the license permits bundling + redistribution with attribution.
12
-  **Caveat (audit-04 m11):** the boolean `redistributable` field does
13
-  not express the MAU threshold or attribution requirement. A
12
+  **Caveat:** the boolean `redistributable` field does not express the
13
+  MAU threshold or attribution requirement. A
1414
   `redistributable_conditions: str | None` field on `BaseModelSpec`
1515
   plus a pack-time attestation checkbox would encode this properly —
16
-  deferred to Sprint 12b's license-UX extension. Until then, users
17
-  at the scale threshold must consult the license text themselves.
16
+  deferred follow-up work. Until then, users at the scale threshold
17
+  must consult the license text themselves.
1818
 - Llama-3.2 models are gated on HuggingFace (`requires_acceptance=True`)
1919
   and their license does NOT permit bundling into a `.dlm.pack`
20
-  (`redistributable=False`) — enforced by Sprint 14's pack gate and
21
-  Sprint 28's share-protocol refusal.
20
+  (`redistributable=False`) — enforced by the pack gate and
21
+  share-protocol refusal.
2222
 - SmolLM2 and Phi-3.5-mini are permissive (Apache-2.0 / MIT).
2323
 - `size_gb_fp16` is approximate; the hardware doctor uses it to seed
24
-  VRAM estimates, which then get refined by sprint 09's runtime guard.
24
+  VRAM estimates, which then get refined by runtime checks.
2525
 """
2626
 
2727
 from __future__ import annotations
@@ -211,20 +211,19 @@ _ENTRIES: tuple[BaseModelSpec, ...] = (
211211
         context_length=131_072,
212212
         recommended_seq_len=2048,
213213
     ),
214
-    # --- Vision-language bases (Sprint 35 v1) -------------------------------
214
+    # --- Vision-language bases ----------------------------------------------
215215
     # PaliGemma-3B-mix-224 — Google's instruction-tuned VL base built on
216216
     # Gemma-2B + SigLIP-So400m. Gated under the Gemma license; cannot
217217
     # redistribute inside a `.dlm.pack` (same pattern as Llama-3.2).
218218
     # Training targets Gemma's transformer blocks; the vision tower is
219219
     # trained jointly when modules_to_save expands to ["embed_tokens",
220
-    # "lm_head"], but Sprint 35 v1 keeps modules_to_save empty so only
221
-    # the LLM-side LoRA adapters move — the vision tower is frozen.
220
+    # "lm_head"], but the current entry keeps modules_to_save empty so
221
+    # only the LLM-side LoRA adapters move — the vision tower is frozen.
222222
     #
223223
     # `gguf_arch` / `tokenizer_pre` are set to tags the current vendored
224224
     # llama.cpp doesn't recognize; the export probes surface
225
-    # UNSUPPORTED + refuse GGUF conversion until Sprint 35.4 lands the
226
-    # arch-support gate. HF-snapshot export (`dlm export --hf-snapshot`)
227
-    # still works.
225
+    # UNSUPPORTED + refuse GGUF conversion until GGUF support lands.
226
+    # HF-snapshot export (`dlm export --hf-snapshot`) still works.
228227
     BaseModelSpec(
229228
         key="paligemma-3b-mix-224",
230229
         hf_id="google/paligemma-3b-mix-224",
@@ -233,7 +232,7 @@ _ENTRIES: tuple[BaseModelSpec, ...] = (
233232
         # it as drift; a maintainer pastes in the observed SHA from
234233
         # the script's output. Offline probe tests skip cleanly
235234
         # until then (see tests/unit/base_models/test_vl_registry.py).
236
-        # Landed as part of Sprint 35 v1; to verify, run:
235
+        # To verify, run:
237236
         #     uv run python scripts/refresh-registry.py --check
238237
         revision="8d2f7bc9c15d71a00c14f9eb7e4c7b99c79e0a11",
239238
         architecture="PaliGemmaForConditionalGeneration",
@@ -258,11 +257,10 @@ _ENTRIES: tuple[BaseModelSpec, ...] = (
258257
         ),
259258
     ),
260259
     # Qwen2-VL-2B-Instruct — Alibaba's Apache-2.0 VL base with dynamic-
261
-    # resolution support in native HF. Sprint 35.3 pins a conservative
262
-    # fixed 672×672 preprocessing plan (implementation-note (a) in the
263
-    # sprint spec) to avoid growing the VlPreprocessorPlan abstraction
264
-    # for dynamic ranges in v1 — later sprints can extend the plan with
265
-    # {min_pixels, max_pixels} when a user reaches that limit.
260
+    # resolution support in native HF. The current entry pins a
261
+    # conservative fixed 672×672 preprocessing plan to avoid growing
262
+    # the VlPreprocessorPlan abstraction for dynamic ranges yet; a
263
+    # future extension can add {min_pixels, max_pixels} when needed.
266264
     #
267265
     # 672×672 with Qwen2-VL's 28-pixel patch-merger grid yields 24×24 =
268266
     # 576 vision tokens per image. `<|image_pad|>` is the runtime
@@ -339,7 +337,7 @@ _ENTRIES: tuple[BaseModelSpec, ...] = (
339337
             num_image_tokens=256,
340338
         ),
341339
     ),
342
-    # --- Audio-language bases (Sprint 35.2) ---------------------------------
340
+    # --- Audio-language bases -----------------------------------------------
343341
     # Qwen2-Audio-7B-Instruct — Alibaba's open audio-text model. Uses
344342
     # the Qwen2 LLM backbone + a dedicated audio encoder. Apache-2.0
345343
     # but the 7B checkpoint is gated on HF via license acceptance, so
@@ -348,10 +346,10 @@ _ENTRIES: tuple[BaseModelSpec, ...] = (
348346
     # Apache-2.0, but not-bundled-by-default because the pack size
349347
     # (~14 GB fp16) dominates the tarball.
350348
     #
351
-    # The 16 kHz pin + 30 s max-length match the training-time defaults
352
-    # documented in the Qwen2-Audio card. Resampling support lands as a
353
-    # 35.2 follow-up; v1 refuses mismatched sample rates with an
354
-    # actionable error at preprocess time.
349
+    # The 16 kHz pin + 30 s max-length match the training-time
350
+    # defaults documented in the Qwen2-Audio card. Resampling support
351
+    # lands as follow-up work; current releases refuse mismatched
352
+    # sample rates with an actionable error at preprocess time.
355353
     #
356354
     # Placeholder SHA flagged the same way as paligemma — the weekly
357355
     # `scripts/refresh-registry.py --check` run surfaces drift and a
src/dlm/base_models/resolver.py modified
@@ -10,9 +10,8 @@ Spec grammar:
1010
 
1111
 Gated models (`requires_acceptance=True`) raise `GatedModelError` unless
1212
 the caller has already accepted the license (signalled via
13
-`accept_license=True`). Sprint 12b ships the `dlm init --i-accept-license`
14
-flow that flips this on persistently; Sprint 06 tests pass
15
-`accept_license=True` directly to exercise the downstream path.
13
+`accept_license=True`). The CLI uses this to persist acceptance; tests
14
+pass `accept_license=True` directly to exercise the downstream path.
1615
 """
1716
 
1817
 from __future__ import annotations
src/dlm/base_models/schema.py modified
@@ -9,14 +9,13 @@ point:
99
   same spec pin at exactly the same weights.
1010
 - `target_modules`: per-architecture LoRA target list (see findings §8;
1111
   `"all-linear"` is avoided because it bloats small models).
12
-- `template`: the chat-template dialect used by Sprint 12's Go-template
12
+- `template`: the chat-template dialect used by the Go-template
1313
   registry for Modelfile generation.
1414
 - `gguf_arch` / `tokenizer_pre`: identifiers the llama.cpp converter
15
-  matches against; Sprint 11's export preflight uses them.
16
-- License / gating (audit-02 F04 + F21): separate fields for SPDX,
17
-  acceptance gating, and re-distribution — each consumed by a different
18
-  gate (Sprint 12b license UX; Sprint 14 pack `--include-base`;
19
-  Sprint 28 share-protocol push refusal).
15
+  matches against; export preflight uses them.
16
+- License / gating: separate fields for SPDX, acceptance gating, and
17
+  re-distribution — each consumed by a different policy gate (license
18
+  acceptance, pack `--include-base`, share-protocol refusal).
2019
 """
2120
 
2221
 from __future__ import annotations
@@ -38,10 +37,10 @@ class VlPreprocessorPlan(BaseModel):
3837
     preflight checks + cache keying.
3938
 
4039
     `target_size` is `(height, width)` in pixels. `resize_policy`
41
-    defaults to `"fixed"` because that's what Sprint 35 v1 ships —
42
-    Qwen2-VL's dynamic resolution lands in 35.3. `image_token` is the
43
-    textual placeholder inserted into prompts before the processor
44
-    expands it into `num_image_tokens` copies.
40
+    defaults to `"fixed"` because that's what the current launch
41
+    registry ships. `image_token` is the textual placeholder inserted
42
+    into prompts before the processor expands it into
43
+    `num_image_tokens` copies.
4544
     """
4645
 
4746
     model_config = ConfigDict(extra="forbid", frozen=True)
@@ -61,10 +60,10 @@ class VlPreprocessorPlan(BaseModel):
6160
 
6261
 
6362
 class AudioPreprocessorPlan(BaseModel):
64
-    """Per-base audio-preprocessing parameters (Sprint 35.2).
63
+    """Per-base audio-preprocessing parameters.
6564
 
6665
     Mirrors `VlPreprocessorPlan` — pinned at registry-build time so
67
-    the audio cache key stays stable. Sprint 35.2 v1 refuses audio at
66
+    the audio cache key stays stable. Current releases refuse audio at
6867
     non-target `sample_rate`; resampling lands as a follow-up.
6968
 
7069
     `sample_rate` is the model's training rate in Hz (Qwen2-Audio:
@@ -101,7 +100,7 @@ class BaseModelSpec(BaseModel):
101100
     gguf_arch: str = Field(..., min_length=1, description="Name llama.cpp's converter uses.")
102101
     tokenizer_pre: str = Field(..., min_length=1, description="Pre-tokenizer label.")
103102
 
104
-    # License + acceptance (audit-02 F04 / F21).
103
+    # License + acceptance.
105104
     license_spdx: str = Field(..., min_length=1)
106105
     license_url: str | None = None
107106
     requires_acceptance: bool = False
src/dlm/base_models/templates/chatml.jinja modified
@@ -1,8 +1,8 @@
11
 {#
22
 ChatML reference template — used by Qwen 2.5, SmolLM2, and compatible
3
-models. Source-of-truth for Sprint 12's Go `text/template` round-trip
4
-tests: rendering these messages through this Jinja and Sprint 12's Go
5
-template must produce token-identical sequences after tokenization.
3
+models. Source-of-truth for Go `text/template` round-trip tests:
4
+rendering these messages through this Jinja and the Go template must
5
+produce token-identical sequences after tokenization.
66
 
77
 Render `{"messages": [...]}` where each message has `role` in
88
 {"system", "user", "assistant"} and a `content` string. Call with
src/dlm/base_models/templates/llama3.jinja modified
@@ -1,7 +1,7 @@
11
 {#
22
 Llama 3 / 3.1 / 3.2 reference template. Uses header-id framing and the
33
 `<|eot_id|>` end-of-turn marker (distinct from the EOS). Source-of-truth
4
-for Sprint 12's Go template round-trip tests.
4
+for Go template round-trip tests.
55
 
66
 Required tokens in the tokenizer:
77
   <|begin_of_text|>  <|start_header_id|>  <|end_header_id|>  <|eot_id|>
src/dlm/base_models/templates/mistral.jinja modified
@@ -4,7 +4,7 @@ wraps the conversation in `<s>...</s>` BOS/EOS. System messages are
44
 prepended to the first user message by convention.
55
 
66
 Vendored for future Mistral-family entries in the registry; not used by
7
-the 10 launch bases but Sprint 12's Go registry mirrors this shape.
7
+the 10 launch bases but the Go registry mirrors this shape.
88
 #}
99
 {%- if messages[0]['role'] == 'system' -%}
1010
     {%- set system_prompt = messages[0]['content'] -%}
src/dlm/base_models/templates/phi3.jinja modified
@@ -1,7 +1,7 @@
11
 {#
22
 Phi-3 / Phi-3.5 reference template. Uses `<|role|>` opener and `<|end|>`
33
 closer; finishes with `<|endoftext|>` on assistant turns (handled by
4
-`add_generation_prompt=False`). Source-of-truth for Sprint 12 round-trip.
4
+`add_generation_prompt=False`). Source-of-truth for round-trip tests.
55
 
66
 Roles accepted: "system", "user", "assistant".
77
 #}