1 """Compatibility probes run against a `BaseModelSpec`.
2
3 Each probe is an independent function returning `ProbeResult`. `run_all`
4 aggregates them into a `ProbeReport`. Probes must be non-destructive
5 (read-only) and offline-safe where possible — the refresh-registry
6 script exercises them online.
7
8 Five probes:
9
10 1. `probe_architecture` — `AutoConfig(hf_id).architectures[0]` matches
11 `spec.architecture`. Catches model-surgery mismatches and wrong
12 revisions.
13 2. `probe_chat_template` — tokenizer has a non-empty `chat_template`
14 attribute. Essential for Modelfile emission.
15 3. `probe_gguf_arch_supported` — scans the vendored
16 `convert_hf_to_gguf.py` for a `@Model.register("<arch>")` matching
17 `spec.gguf_arch`. If the vendored submodule is absent, the probe
18 skips with a clear message.
19 4. `probe_pretokenizer_label` — reads `vendor/llama_cpp_pretokenizer_hashes.json`
20 (populated by `scripts/bump-llama-cpp.sh`) and checks the spec's
21 `tokenizer_pre` is a known **label**. Silent drift here causes
22 silent GGUF export failures per findings §9; the probe catches it
23 early. This is the offline fast-check.
24 5. `probe_pretokenizer_hash` — real fingerprint check (see
25 CLAUDE.md pitfall #5). Tokenizes `_LLAMA_CPP_CHKTXT` and compares
26 the sha256 of the stringified token sequence against a vendored
27 per-label fingerprint table. Detects silent upstream tokenization
28 changes that the label probe would miss. Requires a local HF
29 cache; skipped cleanly otherwise.
30
31 Heavy imports (`transformers.AutoConfig`, `AutoTokenizer`) happen
32 inside each probe so the module loads cheaply.
33 """

from __future__ import annotations

import json
import logging
import re
from pathlib import Path
from typing import Final

from dlm.base_models.errors import GatedModelError, ProbeReport, ProbeResult
from dlm.base_models.schema import BaseModelSpec

_LOG = logging.getLogger(__name__)

# Vendored artifact locations.
_REPO_ROOT: Final[Path] = Path(__file__).resolve().parents[3]
VENDOR_LLAMA_CPP_DEFAULT: Final[Path] = _REPO_ROOT / "vendor" / "llama.cpp"
VENDOR_PRETOKENIZER_HASHES_DEFAULT: Final[Path] = (
    _REPO_ROOT / "vendor" / "llama_cpp_pretokenizer_hashes.json"
)
VENDOR_PRETOKENIZER_FINGERPRINTS_DEFAULT: Final[Path] = (
    _REPO_ROOT / "vendor" / "llama_cpp_pretokenizer_fingerprints.json"
)

# The canonical test string llama.cpp uses at `convert_hf_to_gguf.py::
# get_vocab_base_pre`. Tokenize this under the model's BPE tokenizer,
# stringify the resulting token-id list, sha256 it — that digest is
# the fingerprint llama.cpp maps to one of its pre-tokenizer types.
# Keep verbatim; any edit here desynchronizes us from llama.cpp's
# identification logic (see CLAUDE.md pitfall #5).
_LLAMA_CPP_CHKTXT: Final[str] = (
    "\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n"
    "🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ "
    "🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 "
    "កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ "
    "------======= нещо на Български '''''''```````\"\"\"\"......!!!!!!?????? "
    "I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, "
    "'D you like some tea? We'Ve a'lL"
)
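
# Illustrative sketch of the fingerprint computation this constant feeds
# (mirrors `probe_pretokenizer_hash` below; the HF id is a hypothetical
# example, and the tokenizer must already be cached locally):
#
#     import hashlib
#     from transformers import AutoTokenizer
#
#     tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
#     token_ids = tok.encode(_LLAMA_CPP_CHKTXT)
#     fingerprint = hashlib.sha256(str(token_ids).encode()).hexdigest()
#
# That `fingerprint` is what llama.cpp's get_vocab_base_pre() switches on.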


# --- individual probes --------------------------------------------------------


def probe_architecture(spec: BaseModelSpec) -> ProbeResult:
    """`AutoConfig.from_pretrained(hf_id, revision).architectures[0]` matches."""
    try:
        from huggingface_hub.errors import GatedRepoError
        from transformers import AutoConfig
    except ImportError as exc:  # pragma: no cover — dev env always has transformers
        return ProbeResult(
            name="architecture",
            passed=True,
            detail=f"skipped: transformers unavailable ({exc})",
            skipped=True,
        )

    try:
        cfg = AutoConfig.from_pretrained(spec.hf_id, revision=spec.revision)
    except GatedRepoError as exc:
        raise GatedModelError(spec.hf_id, spec.license_url) from exc
    except Exception as exc:
        return ProbeResult(
            name="architecture",
            passed=False,
            detail=f"load failed: {type(exc).__name__}: {exc}",
        )

    architectures = getattr(cfg, "architectures", None)
    if not architectures:
        return ProbeResult(
            name="architecture",
            passed=False,
            detail="config.json has no `architectures` entry",
        )

    observed = architectures[0]
    if observed != spec.architecture:
        return ProbeResult(
            name="architecture",
            passed=False,
            detail=f"expected {spec.architecture!r}, got {observed!r}",
        )
    return ProbeResult(
        name="architecture",
        passed=True,
        detail=f"matched {observed!r}",
    )


def probe_chat_template(spec: BaseModelSpec) -> ProbeResult:
    """Tokenizer carries a non-empty `chat_template` attribute."""
    try:
        from huggingface_hub.errors import GatedRepoError
        from transformers import AutoTokenizer
    except ImportError as exc:  # pragma: no cover
        return ProbeResult(
            name="chat_template",
            passed=True,
            detail=f"skipped: transformers unavailable ({exc})",
            skipped=True,
        )

    try:
        tokenizer = AutoTokenizer.from_pretrained(spec.hf_id, revision=spec.revision)
    except GatedRepoError as exc:
        raise GatedModelError(spec.hf_id, spec.license_url) from exc
    except Exception as exc:
        return ProbeResult(
            name="chat_template",
            passed=False,
            detail=f"load failed: {type(exc).__name__}: {exc}",
        )

    template = getattr(tokenizer, "chat_template", None)
    if not template:
        return ProbeResult(
            name="chat_template",
            passed=False,
            detail="tokenizer has no chat_template",
        )
    return ProbeResult(
        name="chat_template",
        passed=True,
        detail=f"present ({len(template)} chars)",
    )


def probe_gguf_arch_supported(
    spec: BaseModelSpec,
    *,
    vendor_path: Path | None = None,
) -> ProbeResult:
    """Scan vendored ``convert_hf_to_gguf.py`` for a ``@Model.register(...)``
    or ``@ModelBase.register(...)`` decorator listing ``spec.architecture``.

    If the vendored converter submodule is absent, this probe skips.
    """
    script = (vendor_path or VENDOR_LLAMA_CPP_DEFAULT) / "convert_hf_to_gguf.py"
    if not script.exists():
        return ProbeResult(
            name="gguf_arch",
            passed=True,
            detail=f"skipped: {script} not present (vendor/llama.cpp missing)",
            skipped=True,
        )

    try:
        source = script.read_text(encoding="utf-8", errors="replace")
    except OSError as exc:
        return ProbeResult(
            name="gguf_arch",
            passed=False,
            detail=f"read failed: {exc}",
        )

    # llama.cpp's converter registers HF architecture class names via
    # ``@ModelBase.register("Qwen3ForCausalLM", "Qwen3Model", ...)`` (the
    # class was renamed from ``@Model.register`` mid-2024; we accept both
    # forms so this probe stays tolerant if the vendored copy is ever
    # pinned to an older tag). A single decorator may list *multiple*
    # architectures, so we capture the full parenthesized arg list and
    # then extract every quoted string from it.
    decorator_re = re.compile(r"""@(?:Model|ModelBase)\.register\(([^)]*)\)""")
    arg_string_re = re.compile(r"""["']([^"']+)["']""")
    found_archs: set[str] = set()
    for args in decorator_re.findall(source):
        found_archs.update(arg_string_re.findall(args))
    # Compare against the HF architecture (what the decorator actually
    # registers), not the short gguf label. Historically the probe
    # compared ``spec.gguf_arch`` — a silent false-negative, because
    # llama.cpp registers ``"Qwen2ForCausalLM"``, not ``"qwen2"``; the
    # probe only passed for registered models, which bypass this code
    # path entirely.
    if spec.architecture in found_archs:
        return ProbeResult(
            name="gguf_arch",
            passed=True,
            detail=f"converter registers {spec.architecture!r}",
        )
    return ProbeResult(
        name="gguf_arch",
        passed=False,
        detail=(
            f"{spec.architecture!r} not in convert_hf_to_gguf.py "
            f"(scanned {len(found_archs)} registrations)"
        ),
    )
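
# Example of what the scan matches (decorator shape as seen in upstream
# convert_hf_to_gguf.py; the exact architecture list varies by pin):
#
#     @ModelBase.register("Qwen2ForCausalLM", "Qwen2Model")
#
# `decorator_re` captures the arg list '"Qwen2ForCausalLM", "Qwen2Model"',
# and `arg_string_re` then yields both quoted names, so `found_archs`
# gains {"Qwen2ForCausalLM", "Qwen2Model"}.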


def probe_pretokenizer_label(
    spec: BaseModelSpec,
    *,
    hashes_path: Path | None = None,
) -> ProbeResult:
    """Check `spec.tokenizer_pre` is a known pre-tokenizer label.

    The vendored table is a JSON array of label strings that llama.cpp
    recognizes in `get_vocab_base_pre()`. Missing table → skip.

    NOTE: this is a *label* probe, not a hash probe.
    `probe_pretokenizer_hash` is the canonical fingerprint check; this
    probe only checks coarse compatibility via the label.
    """
    path = hashes_path or VENDOR_PRETOKENIZER_HASHES_DEFAULT
    if not path.exists():
        return ProbeResult(
            name="pretokenizer_label",
            passed=True,
            detail=f"skipped: {path} not present (bump-llama-cpp.sh maintains it)",
            skipped=True,
        )

    try:
        labels = set(json.loads(path.read_text(encoding="utf-8")))
    except (OSError, json.JSONDecodeError) as exc:
        return ProbeResult(
            name="pretokenizer_label",
            passed=False,
            detail=f"table unreadable: {exc}",
        )
    except TypeError as exc:
        return ProbeResult(
            name="pretokenizer_label",
            passed=False,
            detail=f"table has wrong shape (expected list[str]): {exc}",
        )

    if spec.tokenizer_pre in labels:
        return ProbeResult(
            name="pretokenizer_label",
            passed=True,
            detail=f"{spec.tokenizer_pre!r} known to llama.cpp",
        )
    return ProbeResult(
        name="pretokenizer_label",
        passed=False,
        detail=(
            f"{spec.tokenizer_pre!r} not in vendored label table; "
            "run scripts/bump-llama-cpp.sh or pick another base"
        ),
    )
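
# The vendored label table is a flat JSON array; illustrative entries (real
# llama.cpp pre-tokenizer labels, but not necessarily the pinned contents):
#
#     ["llama-bpe", "deepseek-llm", "falcon", "gpt-2", "qwen2"]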


def probe_pretokenizer_hash(
    spec: BaseModelSpec,
    *,
    fingerprints_path: Path | None = None,
) -> ProbeResult:
    """Compute the real llama.cpp pre-tokenizer fingerprint and compare.

    See CLAUDE.md pitfall #5. The label probe (above) only checks
    membership in a string table; llama.cpp itself identifies the
    pre-tokenizer by sha256-hashing the token-id sequence produced by
    tokenizing a stable test string (`_LLAMA_CPP_CHKTXT`). We do the
    same here — if the upstream tokenizer changes behavior (new
    revision, silently different merges), the fingerprint drifts and
    this probe fails loudly *before* a broken GGUF reaches Ollama.

    The fingerprint table at
    `vendor/llama_cpp_pretokenizer_fingerprints.json` is maintained by
    `scripts/bump-llama-cpp.sh`. Missing table or no entry for the
    spec's `tokenizer_pre` label → skip (the label probe still runs).

    Requires a local HF cache (`local_files_only=True`); skipped
    cleanly in CI environments without the tokenizer downloaded.
    """
    import hashlib

    path = fingerprints_path or VENDOR_PRETOKENIZER_FINGERPRINTS_DEFAULT
    if not path.exists():
        return ProbeResult(
            name="pretokenizer_hash",
            passed=True,
            detail=f"skipped: {path} not present (bump-llama-cpp.sh maintains it)",
            skipped=True,
        )

    try:
        table = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError) as exc:
        return ProbeResult(
            name="pretokenizer_hash",
            passed=False,
            detail=f"fingerprint table unreadable: {exc}",
        )
    if not isinstance(table, dict):
        return ProbeResult(
            name="pretokenizer_hash",
            passed=False,
            detail="fingerprint table has wrong shape (expected {label: sha256})",
        )
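
    # Expected table shape, label → hex digest (illustrative, not the
    # pinned data):
    #
    #     {"qwen2": "<sha256 hex of str(token_ids) for _LLAMA_CPP_CHKTXT>"}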

    expected = table.get(spec.tokenizer_pre)
    if not isinstance(expected, str):
        return ProbeResult(
            name="pretokenizer_hash",
            passed=True,
            detail=(
                f"skipped: no fingerprint recorded for {spec.tokenizer_pre!r}; "
                "run scripts/bump-llama-cpp.sh to refresh the table"
            ),
            skipped=True,
        )

    try:
        from huggingface_hub.errors import GatedRepoError
        from transformers import AutoTokenizer
    except ImportError as exc:  # pragma: no cover — dev env always has transformers
        return ProbeResult(
            name="pretokenizer_hash",
            passed=True,
            detail=f"skipped: transformers unavailable ({exc})",
            skipped=True,
        )

    try:
        tok = AutoTokenizer.from_pretrained(
            spec.hf_id, revision=spec.revision, local_files_only=True
        )
    except GatedRepoError as exc:
        raise GatedModelError(spec.hf_id, spec.license_url) from exc
    except Exception as exc:
        # Not a probe *failure* — tokenizer simply isn't cached locally.
        # Online refresh-registry runs will exercise the real check.
        return ProbeResult(
            name="pretokenizer_hash",
            passed=True,
            detail=f"skipped: cannot load tokenizer offline ({type(exc).__name__})",
            skipped=True,
        )

    try:
        tokens = tok.encode(_LLAMA_CPP_CHKTXT)
    except Exception as exc:
        return ProbeResult(
            name="pretokenizer_hash",
            passed=False,
            detail=f"tokenizer.encode failed on chktxt: {type(exc).__name__}: {exc}",
        )

    digest = hashlib.sha256(str(tokens).encode()).hexdigest()
    if digest != expected:
        return ProbeResult(
            name="pretokenizer_hash",
            passed=False,
            detail=(
                f"pre-tokenizer drifted for {spec.tokenizer_pre!r}: "
                f"expected {expected[:12]}…, got {digest[:12]}…. "
                "Upstream may have changed tokenization; re-pin revision "
                "or run scripts/bump-llama-cpp.sh to refresh the fingerprint."
            ),
        )
    return ProbeResult(
        name="pretokenizer_hash",
        passed=True,
        detail=f"fingerprint matches {spec.tokenizer_pre!r} ({digest[:12]}…)",
    )


def probe_vl_image_token(spec: BaseModelSpec) -> ProbeResult:
    """Verify the processor exposes the spec's image-placeholder token.

    For `modality="vision-language"` bases the preprocessor plan pins
    `image_token` (e.g. `"<image>"`). `AutoProcessor.from_pretrained`
    must expose it as a known additional-special token — otherwise
    mixed-row collation can't expand the placeholder into the model's
    fixed `num_image_tokens` slots and training silently runs on
    text-only rows.

    Non-VL bases skip this probe cleanly.
    """
    if spec.modality != "vision-language" or spec.vl_preprocessor_plan is None:
        return ProbeResult(
            name="vl_image_token",
            passed=True,
            detail="skipped: spec is not a vision-language base",
            skipped=True,
        )

    try:
        from huggingface_hub.errors import GatedRepoError

        from dlm.base_models._typed_shims import load_auto_processor
    except ImportError as exc:  # pragma: no cover
        return ProbeResult(
            name="vl_image_token",
            passed=True,
            detail=f"skipped: transformers unavailable ({exc})",
            skipped=True,
        )

    try:
        processor = load_auto_processor(spec.hf_id, revision=spec.revision)
    except GatedRepoError as exc:
        raise GatedModelError(spec.hf_id, spec.license_url) from exc
    except Exception as exc:
        return ProbeResult(
            name="vl_image_token",
            passed=False,
            detail=f"processor load failed: {type(exc).__name__}: {exc}",
        )

    # AutoProcessor wraps a tokenizer on `.tokenizer`. The image
    # placeholder must tokenize to a *single* known token — otherwise
    # the collator can't locate the insertion points deterministically.
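    # For example (illustrative id; the real id depends on the base's vocab):
    #     tokenizer.encode("<image>", add_special_tokens=False) == [32000]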
    placeholder = spec.vl_preprocessor_plan.image_token
    tokenizer = getattr(processor, "tokenizer", None)
    if tokenizer is None:
        return ProbeResult(
            name="vl_image_token",
            passed=False,
            detail="processor has no `.tokenizer` attribute",
        )
    try:
        token_ids = tokenizer.encode(placeholder, add_special_tokens=False)
    except Exception as exc:
        return ProbeResult(
            name="vl_image_token",
            passed=False,
            detail=f"tokenizer rejected placeholder {placeholder!r}: {exc}",
        )
    if len(token_ids) != 1:
        return ProbeResult(
            name="vl_image_token",
            passed=False,
            detail=(
                f"placeholder {placeholder!r} tokenized to "
                f"{len(token_ids)} tokens (expected 1)"
            ),
        )
    return ProbeResult(
        name="vl_image_token",
        passed=True,
        detail=f"placeholder {placeholder!r} resolves to token id {token_ids[0]}",
    )


def probe_audio_token(spec: BaseModelSpec) -> ProbeResult:
    """Verify the processor exposes the spec's audio-placeholder token.

    Parallel to `probe_vl_image_token` — for `modality="audio-language"`
    bases the preprocessor plan pins `audio_token` (e.g. `"<|AUDIO|>"`).
    `AutoProcessor.from_pretrained` must expose it as a single known
    token; otherwise the custom audio collator can't locate the
    insertion point when expanding the placeholder into the model's
    fixed audio-token window.

    Non-audio bases skip this probe cleanly.
    """
    if spec.modality != "audio-language" or spec.audio_preprocessor_plan is None:
        return ProbeResult(
            name="audio_token",
            passed=True,
            detail="skipped: spec is not an audio-language base",
            skipped=True,
        )

    try:
        from huggingface_hub.errors import GatedRepoError

        from dlm.base_models._typed_shims import load_auto_processor
    except ImportError as exc:  # pragma: no cover
        return ProbeResult(
            name="audio_token",
            passed=True,
            detail=f"skipped: transformers unavailable ({exc})",
            skipped=True,
        )

    try:
        processor = load_auto_processor(spec.hf_id, revision=spec.revision)
    except GatedRepoError as exc:
        raise GatedModelError(spec.hf_id, spec.license_url) from exc
    except Exception as exc:
        return ProbeResult(
            name="audio_token",
            passed=False,
            detail=f"processor load failed: {type(exc).__name__}: {exc}",
        )

    placeholder = spec.audio_preprocessor_plan.audio_token
    tokenizer = getattr(processor, "tokenizer", None)
    if tokenizer is None:
        return ProbeResult(
            name="audio_token",
            passed=False,
            detail="processor has no `.tokenizer` attribute",
        )
    try:
        token_ids = tokenizer.encode(placeholder, add_special_tokens=False)
    except Exception as exc:
        return ProbeResult(
            name="audio_token",
            passed=False,
            detail=f"tokenizer rejected placeholder {placeholder!r}: {exc}",
        )
    if len(token_ids) != 1:
        return ProbeResult(
            name="audio_token",
            passed=False,
            detail=(
                f"placeholder {placeholder!r} tokenized to "
                f"{len(token_ids)} tokens (expected 1)"
            ),
        )
    return ProbeResult(
        name="audio_token",
        passed=True,
        detail=f"placeholder {placeholder!r} resolves to token id {token_ids[0]}",
    )


# --- aggregate ---------------------------------------------------------------


def run_all(spec: BaseModelSpec, *, skip_export_probes: bool = False) -> ProbeReport:
    """Run every probe; aggregate into a `ProbeReport`.

    `GatedModelError` from an individual probe propagates immediately —
    it's not a "probe failure" in the registry-drift sense; it's an
    acceptance-flow signal.

    `skip_export_probes=True` drops the three llama.cpp / GGUF-conversion
    checks (`gguf_arch_supported`, `pretokenizer_label`,
    `pretokenizer_hash`). Users opt into this when they want training
    + HF inference on a base whose architecture ships faster than our
    vendored llama.cpp can absorb (e.g. brand-new Qwen3 on a llama.cpp
    pin from last month). They forfeit `dlm export` to Ollama until
    the vendored copy catches up. VL and audio bases auto-opt-out of
    export probes because current GGUF export does not support them.
    """
    from dlm.modality import modality_for

    dispatch = modality_for(spec)
    core: tuple[ProbeResult, ...] = (probe_architecture(spec),)
    if dispatch.accepts_images:
        core = (*core, probe_vl_image_token(spec))
    elif dispatch.accepts_audio:
        core = (*core, probe_audio_token(spec))
    else:
        core = (*core, probe_chat_template(spec))

    # Media bases (VL + audio) bypass the llama.cpp-converter probes.
    # The export path refuses GGUF cleanly for both and emits an HF
    # snapshot instead.
    is_media = dispatch.requires_processor
    if skip_export_probes or is_media:
        return ProbeReport(hf_id=spec.hf_id, results=core)
    results = (
        *core,
        probe_gguf_arch_supported(spec),
        probe_pretokenizer_label(spec),
        probe_pretokenizer_hash(spec),
    )
    return ProbeReport(hf_id=spec.hf_id, results=results)
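

# Usage sketch (hedged: the registry accessor below is hypothetical; adapt
# to however specs are loaded in this repo):
#
#     from dlm.base_models.registry import get_spec  # hypothetical
#
#     spec = get_spec("qwen2.5-0.5b-instruct")
#     report = run_all(spec)  # or run_all(spec, skip_export_probes=True)
#     for r in report.results:
#         status = "SKIP" if r.skipped else ("PASS" if r.passed else "FAIL")
#         print(f"{status:>4} {r.name}: {r.detail}")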