"""Compatibility probes run against a `BaseModelSpec`. Each probe is an independent function returning `ProbeResult`. `run_all` aggregates them into a `ProbeReport`. Probes must be non-destructive (read-only) and offline-safe where possible — the refresh-registry script exercises them online. Five probes: 1. `probe_architecture` — `AutoConfig(hf_id).architectures[0]` matches `spec.architecture`. Catches model-surgery mismatches and wrong revisions. 2. `probe_chat_template` — tokenizer has a non-empty `chat_template` attribute. Essential for Modelfile emission. 3. `probe_gguf_arch_supported` — scans the vendored `convert_hf_to_gguf.py` for a `@Model.register("")` matching `spec.gguf_arch`. If the vendored submodule is absent, the probe skips with a clear message. 4. `probe_pretokenizer_label` — reads `vendor/llama_cpp_pretokenizer_hashes.json` (populated by `scripts/bump-llama-cpp.sh`) and checks the spec's `tokenizer_pre` is a known **label**. Silent drift here causes silent GGUF export failures per findings §9; the probe catches it early. This is the offline fast-check. 5. `probe_pretokenizer_hash` — real fingerprint check (see CLAUDE.md pitfall #5). Tokenizes `_LLAMA_CPP_CHKTXT` and compares the sha256 of the stringified token sequence against a vendored per-label fingerprint table. Detects silent upstream tokenization changes that the label probe would miss. Requires a local HF cache; skipped cleanly otherwise. Heavy imports (`transformers.AutoConfig`, `AutoTokenizer`) happen inside each probe so the module loads cheaply. """ from __future__ import annotations import json import logging import re from pathlib import Path from typing import Final from dlm.base_models.errors import GatedModelError, ProbeReport, ProbeResult from dlm.base_models.schema import BaseModelSpec _LOG = logging.getLogger(__name__) # Vendored artifact locations. _REPO_ROOT: Final[Path] = Path(__file__).resolve().parents[3] VENDOR_LLAMA_CPP_DEFAULT: Final[Path] = _REPO_ROOT / "vendor" / "llama.cpp" VENDOR_PRETOKENIZER_HASHES_DEFAULT: Final[Path] = ( _REPO_ROOT / "vendor" / "llama_cpp_pretokenizer_hashes.json" ) VENDOR_PRETOKENIZER_FINGERPRINTS_DEFAULT: Final[Path] = ( _REPO_ROOT / "vendor" / "llama_cpp_pretokenizer_fingerprints.json" ) # The canonical test string llama.cpp uses at `convert_hf_to_gguf.py:: # get_vocab_base_pre`. Tokenize this under the model's BPE tokenizer, # stringify the resulting token-id list, sha256 it — that digest is # the fingerprint llama.cpp maps to one of its pre-tokenizer types. # Keep verbatim; any edit here desynchronizes us from llama.cpp's # identification logic (see CLAUDE.md pitfall #5). _LLAMA_CPP_CHKTXT: Final[str] = ( "\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n" "🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ " "🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 " "កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ " "------======= нещо на Български '''''''```````\"\"\"\"......!!!!!!?????? " "I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, " "'D you like some tea? We'Ve a'lL" ) # --- individual probes -------------------------------------------------------- def probe_architecture(spec: BaseModelSpec) -> ProbeResult: """`AutoConfig.from_pretrained(hf_id, revision).architectures[0]` matches.""" try: from huggingface_hub.errors import GatedRepoError from transformers import AutoConfig except ImportError as exc: # pragma: no cover — dev env always has transformers return ProbeResult( name="architecture", passed=True, detail=f"skipped: transformers unavailable ({exc})", skipped=True, ) try: cfg = AutoConfig.from_pretrained(spec.hf_id, revision=spec.revision) except GatedRepoError as exc: raise GatedModelError(spec.hf_id, spec.license_url) from exc except Exception as exc: return ProbeResult( name="architecture", passed=False, detail=f"load failed: {type(exc).__name__}: {exc}", ) architectures = getattr(cfg, "architectures", None) if not architectures: return ProbeResult( name="architecture", passed=False, detail="config.json has no `architectures` entry", ) observed = architectures[0] if observed != spec.architecture: return ProbeResult( name="architecture", passed=False, detail=f"expected {spec.architecture!r}, got {observed!r}", ) return ProbeResult( name="architecture", passed=True, detail=f"matched {observed!r}", ) def probe_chat_template(spec: BaseModelSpec) -> ProbeResult: """Tokenizer carries a non-empty `chat_template` attribute.""" try: from huggingface_hub.errors import GatedRepoError from transformers import AutoTokenizer except ImportError as exc: # pragma: no cover return ProbeResult( name="chat_template", passed=True, detail=f"skipped: transformers unavailable ({exc})", skipped=True, ) try: tokenizer = AutoTokenizer.from_pretrained(spec.hf_id, revision=spec.revision) except GatedRepoError as exc: raise GatedModelError(spec.hf_id, spec.license_url) from exc except Exception as exc: return ProbeResult( name="chat_template", passed=False, detail=f"load failed: {type(exc).__name__}: {exc}", ) template = getattr(tokenizer, "chat_template", None) if not template: return ProbeResult( name="chat_template", passed=False, detail="tokenizer has no chat_template", ) return ProbeResult( name="chat_template", passed=True, detail=f"present ({len(template)} chars)", ) def probe_gguf_arch_supported( spec: BaseModelSpec, *, vendor_path: Path | None = None, ) -> ProbeResult: """Scan vendored ``convert_hf_to_gguf.py`` for ``@Model.register("")`` or ``@ModelBase.register(...)``. If the vendored converter submodule is absent, this probe skips. """ script = (vendor_path or VENDOR_LLAMA_CPP_DEFAULT) / "convert_hf_to_gguf.py" if not script.exists(): return ProbeResult( name="gguf_arch", passed=True, detail=f"skipped: {script} not present (vendor/llama.cpp missing)", skipped=True, ) try: source = script.read_text(encoding="utf-8", errors="replace") except OSError as exc: return ProbeResult( name="gguf_arch", passed=False, detail=f"read failed: {exc}", ) # llama.cpp's converter registers HF architecture class names via # ``@ModelBase.register("Qwen3ForCausalLM", "Qwen3Model", ...)`` (the # class was renamed from ``@Model.register`` mid-2024; we accept both # forms so this probe stays tolerant if the vendored copy is ever # pinned to an older tag). A single decorator may list *multiple* # architectures, so we capture the full parenthesized arg list and # then extract every quoted string from it. decorator_re = re.compile(r"""@(?:Model|ModelBase)\.register\(([^)]*)\)""") arg_string_re = re.compile(r"""["']([^"']+)["']""") found_archs: set[str] = set() for args in decorator_re.findall(source): found_archs.update(arg_string_re.findall(args)) # Compare against the HF architecture (what the decorator actually # registers), not the short gguf label. Historically the probe # compared ``spec.gguf_arch`` — a silent false-negative, because # llama.cpp registers ``"Qwen2ForCausalLM"`` not ``"qwen2"``; the # probe only passed for registered models, which bypass this code # path entirely. if spec.architecture in found_archs: return ProbeResult( name="gguf_arch", passed=True, detail=f"converter registers {spec.architecture!r}", ) return ProbeResult( name="gguf_arch", passed=False, detail=( f"{spec.architecture!r} not in convert_hf_to_gguf.py " f"(scanned {len(found_archs)} registrations)" ), ) def probe_pretokenizer_label( spec: BaseModelSpec, *, hashes_path: Path | None = None, ) -> ProbeResult: """Check `spec.tokenizer_pre` is a known pre-tokenizer label. The vendored table is a JSON array of label strings that llama.cpp recognizes in `get_vocab_base_pre()`. Missing table → skip. NOTE: this is a *label* probe, not a hash probe. `probe_pretokenizer_hash` is the canonical fingerprint check; this probe only checks coarse compatibility via the label. """ path = hashes_path or VENDOR_PRETOKENIZER_HASHES_DEFAULT if not path.exists(): return ProbeResult( name="pretokenizer_label", passed=True, detail=f"skipped: {path} not present (bump-llama-cpp.sh maintains it)", skipped=True, ) try: labels = set(json.loads(path.read_text(encoding="utf-8"))) except (OSError, json.JSONDecodeError) as exc: return ProbeResult( name="pretokenizer_label", passed=False, detail=f"table unreadable: {exc}", ) except TypeError as exc: return ProbeResult( name="pretokenizer_label", passed=False, detail=f"table has wrong shape (expected list[str]): {exc}", ) if spec.tokenizer_pre in labels: return ProbeResult( name="pretokenizer_label", passed=True, detail=f"{spec.tokenizer_pre!r} known to llama.cpp", ) return ProbeResult( name="pretokenizer_label", passed=False, detail=( f"{spec.tokenizer_pre!r} not in vendored label table; " "run scripts/bump-llama-cpp.sh or pick another base" ), ) def probe_pretokenizer_hash( spec: BaseModelSpec, *, fingerprints_path: Path | None = None, ) -> ProbeResult: """Compute the real llama.cpp pre-tokenizer fingerprint and compare. See CLAUDE.md pitfall #5. The label probe (above) only checks membership in a string table; llama.cpp itself identifies the pre-tokenizer by sha256-hashing the token-id sequence produced by tokenizing a stable test string (`_LLAMA_CPP_CHKTXT`). We do the same here — if the upstream tokenizer changes behavior (new revision, silently different merges), the fingerprint drifts and this probe fails loudly *before* a broken GGUF reaches Ollama. The fingerprint table at `vendor/llama_cpp_pretokenizer_fingerprints.json` is maintained by `scripts/bump-llama-cpp.sh`. Missing table or no entry for the spec's `tokenizer_pre` label → skip (the label probe still runs). Requires a local HF cache (`local_files_only=True`); skipped cleanly in CI environments without the tokenizer downloaded. """ import hashlib path = fingerprints_path or VENDOR_PRETOKENIZER_FINGERPRINTS_DEFAULT if not path.exists(): return ProbeResult( name="pretokenizer_hash", passed=True, detail=f"skipped: {path} not present (bump-llama-cpp.sh maintains it)", skipped=True, ) try: table = json.loads(path.read_text(encoding="utf-8")) except (OSError, json.JSONDecodeError) as exc: return ProbeResult( name="pretokenizer_hash", passed=False, detail=f"fingerprint table unreadable: {exc}", ) if not isinstance(table, dict): return ProbeResult( name="pretokenizer_hash", passed=False, detail="fingerprint table has wrong shape (expected {label: sha256})", ) expected = table.get(spec.tokenizer_pre) if not isinstance(expected, str): return ProbeResult( name="pretokenizer_hash", passed=True, detail=( f"skipped: no fingerprint recorded for {spec.tokenizer_pre!r}; " "run scripts/bump-llama-cpp.sh to refresh the table" ), skipped=True, ) try: from huggingface_hub.errors import GatedRepoError from transformers import AutoTokenizer except ImportError as exc: # pragma: no cover — dev env always has transformers return ProbeResult( name="pretokenizer_hash", passed=True, detail=f"skipped: transformers unavailable ({exc})", skipped=True, ) try: tok = AutoTokenizer.from_pretrained( spec.hf_id, revision=spec.revision, local_files_only=True ) except GatedRepoError as exc: raise GatedModelError(spec.hf_id, spec.license_url) from exc except Exception as exc: # Not a probe *failure* — tokenizer simply isn't cached locally. # Online refresh-registry runs will exercise the real check. return ProbeResult( name="pretokenizer_hash", passed=True, detail=f"skipped: cannot load tokenizer offline ({type(exc).__name__})", skipped=True, ) try: tokens = tok.encode(_LLAMA_CPP_CHKTXT) except Exception as exc: return ProbeResult( name="pretokenizer_hash", passed=False, detail=f"tokenizer.encode failed on chktxt: {type(exc).__name__}: {exc}", ) digest = hashlib.sha256(str(tokens).encode()).hexdigest() if digest != expected: return ProbeResult( name="pretokenizer_hash", passed=False, detail=( f"pre-tokenizer drifted for {spec.tokenizer_pre!r}: " f"expected {expected[:12]}…, got {digest[:12]}…. " "Upstream may have changed tokenization; re-pin revision " "or run scripts/bump-llama-cpp.sh to refresh the fingerprint." ), ) return ProbeResult( name="pretokenizer_hash", passed=True, detail=f"fingerprint matches {spec.tokenizer_pre!r} ({digest[:12]}…)", ) def probe_vl_image_token(spec: BaseModelSpec) -> ProbeResult: """Verify the processor exposes the spec's image-placeholder token. For `modality="vision-language"` bases the preprocessor plan pins `image_token` (e.g. `""`). `AutoProcessor.from_pretrained` must expose it as a known additional-special token — otherwise mixed-row collation can't expand the placeholder into the model's fixed `num_image_tokens` slots and training silently runs on text-only rows. Non-VL bases skip this probe cleanly. """ if spec.modality != "vision-language" or spec.vl_preprocessor_plan is None: return ProbeResult( name="vl_image_token", passed=True, detail="skipped: spec is not a vision-language base", skipped=True, ) try: from huggingface_hub.errors import GatedRepoError from dlm.base_models._typed_shims import load_auto_processor except ImportError as exc: # pragma: no cover return ProbeResult( name="vl_image_token", passed=True, detail=f"skipped: transformers unavailable ({exc})", skipped=True, ) try: processor = load_auto_processor(spec.hf_id, revision=spec.revision) except GatedRepoError as exc: raise GatedModelError(spec.hf_id, spec.license_url) from exc except Exception as exc: return ProbeResult( name="vl_image_token", passed=False, detail=f"processor load failed: {type(exc).__name__}: {exc}", ) # AutoProcessor wraps a tokenizer on `.tokenizer`. The image # placeholder must tokenize to a *single* known token — otherwise # the collator can't locate the insertion points deterministically. placeholder = spec.vl_preprocessor_plan.image_token tokenizer = getattr(processor, "tokenizer", None) if tokenizer is None: return ProbeResult( name="vl_image_token", passed=False, detail="processor has no `.tokenizer` attribute", ) try: token_ids = tokenizer.encode(placeholder, add_special_tokens=False) except Exception as exc: return ProbeResult( name="vl_image_token", passed=False, detail=f"tokenizer rejected placeholder {placeholder!r}: {exc}", ) if len(token_ids) != 1: return ProbeResult( name="vl_image_token", passed=False, detail=( f"placeholder {placeholder!r} tokenized to {len(token_ids)} tokens (expected 1)" ), ) return ProbeResult( name="vl_image_token", passed=True, detail=f"placeholder {placeholder!r} resolves to token id {token_ids[0]}", ) def probe_audio_token(spec: BaseModelSpec) -> ProbeResult: """Verify the processor exposes the spec's audio-placeholder token. Parallel to `probe_vl_image_token` — for `modality="audio-language"` bases the preprocessor plan pins `audio_token` (e.g. `"<|AUDIO|>"`). `AutoProcessor.from_pretrained` must expose it as a single known token; otherwise the custom audio collator can't locate the insertion point when expanding the placeholder into the model's fixed audio-token window. Non-audio bases skip this probe cleanly. """ if spec.modality != "audio-language" or spec.audio_preprocessor_plan is None: return ProbeResult( name="audio_token", passed=True, detail="skipped: spec is not an audio-language base", skipped=True, ) try: from huggingface_hub.errors import GatedRepoError from dlm.base_models._typed_shims import load_auto_processor except ImportError as exc: # pragma: no cover return ProbeResult( name="audio_token", passed=True, detail=f"skipped: transformers unavailable ({exc})", skipped=True, ) try: processor = load_auto_processor(spec.hf_id, revision=spec.revision) except GatedRepoError as exc: raise GatedModelError(spec.hf_id, spec.license_url) from exc except Exception as exc: return ProbeResult( name="audio_token", passed=False, detail=f"processor load failed: {type(exc).__name__}: {exc}", ) placeholder = spec.audio_preprocessor_plan.audio_token tokenizer = getattr(processor, "tokenizer", None) if tokenizer is None: return ProbeResult( name="audio_token", passed=False, detail="processor has no `.tokenizer` attribute", ) try: token_ids = tokenizer.encode(placeholder, add_special_tokens=False) except Exception as exc: return ProbeResult( name="audio_token", passed=False, detail=f"tokenizer rejected placeholder {placeholder!r}: {exc}", ) if len(token_ids) != 1: return ProbeResult( name="audio_token", passed=False, detail=( f"placeholder {placeholder!r} tokenized to {len(token_ids)} tokens (expected 1)" ), ) return ProbeResult( name="audio_token", passed=True, detail=f"placeholder {placeholder!r} resolves to token id {token_ids[0]}", ) # --- aggregate --------------------------------------------------------------- def run_all(spec: BaseModelSpec, *, skip_export_probes: bool = False) -> ProbeReport: """Run every probe; aggregate into a `ProbeReport`. `GatedModelError` from an individual probe propagates immediately — it's not a "probe failure" in the registry-drift sense; it's an acceptance-flow signal. `skip_export_probes=True` drops the three llama.cpp / GGUF-conversion checks (`gguf_arch_supported`, `pretokenizer_label`, `pretokenizer_hash`). Users opt into this when they want training + HF inference on a base whose architecture ships faster than our vendored llama.cpp can absorb (e.g. brand-new Qwen3 on a llama.cpp pin from last month). They forfeit `dlm export` to Ollama until the vendored copy catches up. VL bases auto-opt-out of export probes because current GGUF export does not support them. """ from dlm.modality import modality_for dispatch = modality_for(spec) core: tuple[ProbeResult, ...] = (probe_architecture(spec),) if dispatch.accepts_images: core = (*core, probe_vl_image_token(spec)) elif dispatch.accepts_audio: core = (*core, probe_audio_token(spec)) else: core = (*core, probe_chat_template(spec)) # Media bases (VL + audio) bypass the llama.cpp-converter probes. # The export path refuses GGUF cleanly for both and emits an HF # snapshot instead. is_media = dispatch.requires_processor if skip_export_probes or is_media: return ProbeReport(hf_id=spec.hf_id, results=core) results = ( *core, probe_gguf_arch_supported(spec), probe_pretokenizer_label(spec), probe_pretokenizer_hash(spec), ) return ProbeReport(hf_id=spec.hf_id, results=results)