"""Cross-check adapter embedding rows against base GGUF rows. The pad-fallback path sets `modules_to_save=["embed_tokens","lm_head"]` so the LoRA adapter carries its own trained embedding / lm-head rows. At export time the base GGUF's corresponding rows must match byte-for-byte, or the adapter's added-token embeddings end up multiplied against uninitialized base rows — the "gibberish on `<|pad|>`" failure mode. Contract: - Runs only when `adapter_config.json::modules_to_save` includes either `embed_tokens` or `lm_head`. Otherwise nothing to compare. - Hashes the added-special-token rows from both sides (PEFT safetensors, base GGUF) and compares. - Skips cleanly when the adapter tokenizer has no added specials (nothing changed; base rows are authoritative). - Skips cleanly when the base GGUF's embedding is block-quantized (re-quantize-after-the-fact pipeline; row-level check impossible) with an explanatory `PreflightError.probe="embedding_row_sha"` detail. Kept informational rather than failing until we see a real user hit it. The per-architecture tensor-name map is small — llama-family / chatml / phi3 / mistral all use the same convention (`token_embd.weight` / `output.weight`), so one entry covers our v1 registry. Future archs can extend `_ARCH_TENSOR_NAMES` without touching the assertion logic. """ from __future__ import annotations import hashlib import json from dataclasses import dataclass from pathlib import Path from typing import Any, Final from dlm.export.errors import PreflightError from dlm.export.gguf_tensors import _SCALAR_BYTES, load_tensor_index @dataclass(frozen=True) class _TensorMap: """Maps a dlm logical module name to GGUF + PEFT name fragments.""" gguf_name: str safetensors_suffix: str # dlm-internal logical name → (GGUF tensor name, PEFT safetensors key suffix). # `modules_to_save` layers are written under # `base_model.model..modules_to_save.default.weight` by PEFT. # The prefix varies per arch but the trailing suffix is stable; # we scan the file's key index for the suffix rather than hard-coding # the full path. _DEFAULT_TENSOR_MAP: Final[dict[str, _TensorMap]] = { "embed_tokens": _TensorMap( gguf_name="token_embd.weight", safetensors_suffix="embed_tokens.modules_to_save.default.weight", ), "lm_head": _TensorMap( gguf_name="output.weight", safetensors_suffix="lm_head.modules_to_save.default.weight", ), } def assert_embedding_rows_match( adapter_dir: Path, base_gguf: Path, ) -> None: """Verify added-token rows agree between adapter safetensors and base GGUF. Skip conditions (no-raise): - `adapter_config.json` missing or unreadable — export preflight's `check_adapter_config` already owns that error path. - `modules_to_save` absent / doesn't include embed_tokens or lm_head — adapter doesn't own any embedding rows. - Tokenizer config has no added special tokens — nothing to check; base rows are authoritative. Raises `PreflightError(probe="embedding_row_sha")` on a real mismatch or on a file-level corruption (missing safetensors, absent GGUF tensor, dtype unsupported). """ cfg_path = adapter_dir / "adapter_config.json" if not cfg_path.is_file(): return try: cfg = json.loads(cfg_path.read_text(encoding="utf-8")) except (OSError, json.JSONDecodeError): # Export preflight owns the "unreadable adapter config" # error path; we silently opt out here rather than double-report. 
def assert_embedding_rows_match(
    adapter_dir: Path,
    base_gguf: Path,
) -> None:
    """Verify added-token rows agree between adapter safetensors and base GGUF.

    Skip conditions (no-raise):

    - `adapter_config.json` missing or unreadable — export preflight's
      `check_adapter_config` already owns that error path.
    - `modules_to_save` absent or doesn't include embed_tokens / lm_head —
      the adapter doesn't own any embedding rows.
    - Tokenizer config has no added special tokens — nothing to check;
      base rows are authoritative.

    Raises `PreflightError(probe="embedding_row_sha")` on a real mismatch
    or on file-level corruption (missing safetensors, absent GGUF tensor,
    unsupported dtype).
    """
    cfg_path = adapter_dir / "adapter_config.json"
    if not cfg_path.is_file():
        return
    try:
        cfg = json.loads(cfg_path.read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError):
        # Export preflight owns the "unreadable adapter config" error
        # path; we silently opt out here rather than double-report.
        return

    saved_modules = cfg.get("modules_to_save") or []
    if not isinstance(saved_modules, list):
        return
    applicable = [m for m in saved_modules if m in _DEFAULT_TENSOR_MAP]
    if not applicable:
        return

    added_token_ids = _added_special_token_ids(adapter_dir)
    if not added_token_ids:
        return

    # Load the base GGUF's tensor index and the adapter safetensors once;
    # both are reused across modules.
    index = load_tensor_index(base_gguf)
    safetensors_data = _load_adapter_safetensors(adapter_dir)

    mismatches: list[str] = []
    for module in applicable:
        tmap = _DEFAULT_TENSOR_MAP[module]
        adapter_tensor = _find_module_tensor(
            safetensors_data, tmap.safetensors_suffix
        )
        if adapter_tensor is None:
            # modules_to_save declared but the tensor isn't in the file —
            # surface as a real error; the adapter is malformed.
            raise PreflightError(
                probe="embedding_row_sha",
                detail=(
                    f"adapter declares modules_to_save={module!r} but "
                    f"{tmap.safetensors_suffix} is absent from adapter "
                    f"safetensors in {adapter_dir}"
                ),
            )

        gguf_entry = index.find(tmap.gguf_name)
        if gguf_entry is None:
            raise PreflightError(
                probe="embedding_row_sha",
                detail=(
                    f"base GGUF {base_gguf.name} is missing tensor "
                    f"{tmap.gguf_name!r}; cannot verify adapter "
                    f"modules_to_save={module!r}"
                ),
            )
        if gguf_entry.dtype not in _SCALAR_BYTES:
            # The base was quantized to a block-quantized type (the user
            # ran the flow with a non-default quant that touches the
            # embedding), so we can't read rows. Surface as a preflight
            # error so the operator knows the check didn't run rather
            # than silently passing.
            raise PreflightError(
                probe="embedding_row_sha",
                detail=(
                    f"{tmap.gguf_name!r} in {base_gguf.name} is "
                    f"block-quantized (ggml dtype {gguf_entry.dtype}); "
                    "re-export with embedding tensors left at F16 or "
                    "disable the embedding_rows checker explicitly."
                ),
            )

        adapter_rows = _as_row_list(adapter_tensor)
        for tid in added_token_ids:
            if tid < 0 or tid >= len(adapter_rows):
                # The added token id is out of the adapter-tensor vocab
                # range: either the tokenizer added tokens without
                # resizing embed_tokens, or the safetensors shape is
                # stale.
                raise PreflightError(
                    probe="embedding_row_sha",
                    detail=(
                        f"added token id {tid} is out of range for "
                        f"{module} (adapter has {len(adapter_rows)} rows)"
                    ),
                )
            adapter_sha = hashlib.sha256(adapter_rows[tid]).hexdigest()
            # `index.row_bytes` raises `PreflightError` on dtype mismatch
            # / out-of-range — those bubble up naturally; no
            # catch-and-rethrow.
            base_row = index.row_bytes(tmap.gguf_name, tid)
            base_sha = hashlib.sha256(base_row).hexdigest()
            if adapter_sha != base_sha:
                mismatches.append(
                    f"{module}[{tid}]: adapter={adapter_sha[:12]}… "
                    f"base={base_sha[:12]}…"
                )

    if mismatches:
        raise PreflightError(
            probe="embedding_row_sha",
            detail=(
                "adapter embedding rows disagree with base GGUF for "
                f"{len(mismatches)} added token(s): "
                f"{'; '.join(mismatches)}. The base was regenerated "
                "against a different tokenizer; re-run `dlm export` with "
                "a fresh base conversion."
            ),
        )
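# Sketch of a call site; the runner shape, `run_dir`, and `report` below
# are hypothetical, and we assume `PreflightError` exposes its `probe` /
# `detail` kwargs as attributes:
#
#     try:
#         assert_embedding_rows_match(run_dir / "adapter", run_dir / "base.gguf")
#     except PreflightError as err:
#         report(check=err.probe, detail=err.detail)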
# --- internals ----------------------------------------------------------------


def _added_special_token_ids(adapter_dir: Path) -> list[int]:
    """Return the sorted list of added-special-token ids, or `[]`."""
    cfg_path = adapter_dir / "tokenizer_config.json"
    if not cfg_path.is_file():
        return []
    try:
        cfg = json.loads(cfg_path.read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError):
        return []
    added = cfg.get("added_tokens_decoder") or {}
    if not isinstance(added, dict):
        return []
    ids: list[int] = []
    for key, entry in added.items():
        if not isinstance(entry, dict):
            continue
        if entry.get("special") is not True:
            continue
        try:
            tid = int(key)
        except (TypeError, ValueError):
            continue
        ids.append(tid)
    return sorted(set(ids))


def _load_adapter_safetensors(adapter_dir: Path) -> dict[str, Any]:
    """Materialize modules_to_save tensors from `adapter_model.safetensors`.

    Returns a dict of {key: numpy array} holding only the modules_to_save
    tensors, loaded eagerly. They're the biggest tensors in a
    modules_to_save adapter by far, but each is still only one embedding
    matrix's worth, bounded by `vocab_size * hidden * dtype_bytes`
    (~100 MB worst-case for our launch registry).
    """
    path = adapter_dir / "adapter_model.safetensors"
    if not path.is_file():
        raise PreflightError(
            probe="embedding_row_sha",
            detail=(
                f"adapter_model.safetensors not found in {adapter_dir}; "
                "PEFT writes this on save_pretrained — the adapter may "
                "have been interrupted mid-save"
            ),
        )

    from safetensors import safe_open

    # Derive the suffixes from the map so future arch entries are picked
    # up here automatically; LoRA A/B matrices are smaller but numerous,
    # so we skip everything else.
    wanted = tuple(t.safetensors_suffix for t in _DEFAULT_TENSOR_MAP.values())
    materialized: dict[str, Any] = {}
    try:
        with safe_open(str(path), framework="numpy") as handle:  # type: ignore[no-untyped-call]
            for key in handle.keys():  # noqa: SIM118 — safetensors API
                if key.endswith(wanted):
                    materialized[key] = handle.get_tensor(key)
    except OSError as exc:
        raise PreflightError(
            probe="embedding_row_sha",
            detail=f"cannot read adapter safetensors at {path}: {exc}",
        ) from exc
    return materialized


def _find_module_tensor(safetensors_data: dict[str, Any], suffix: str) -> Any:
    """Return the safetensors entry whose key ends with `suffix`, or None.

    PEFT prefixes the key with `base_model.model.` or
    `base_model.model.model.` depending on whether the base model has a
    `model` submodule; matching on the suffix sidesteps that variation.
    """
    for key, tensor in safetensors_data.items():
        if key.endswith(suffix):
            return tensor
    return None
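# The two prefix variants the suffix match covers, shown here as full keys
# for embed_tokens (lm_head is analogous):
#
#     base_model.model.embed_tokens.modules_to_save.default.weight
#     base_model.model.model.embed_tokens.modules_to_save.default.weight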
def _as_row_list(tensor: Any) -> list[bytes]:
    """Turn a numpy-like tensor into a per-row `bytes` list.

    The PEFT-saved embedding is (vocab_size, hidden). We slice row i to the
    bytes of that single row's elements, then sha256 compares row-by-row
    without materializing the whole matrix twice.

    We don't convert dtypes here; the comparison is byte-level on both
    sides, and `convert_hf_to_gguf.py` writes the embedding in the tensor's
    native dtype (F16 → F16, F32 → F32). If the adapter was saved in BF16
    but the base GGUF ended up as F16, the bytes differ even for
    mathematically-equal values — that's a real mismatch we want to flag
    (silent dtype conversions break inference).
    """
    import numpy as np

    arr = np.asarray(tensor)
    if arr.ndim < 2:
        # embed_tokens / lm_head are always 2D; a 1D tensor means a shape
        # mismatch. Return empty so the caller's bounds check surfaces the
        # problem.
        return []
    rows: list[bytes] = []
    for i in range(arr.shape[0]):
        # `ascontiguousarray` guarantees a C-contiguous row before
        # `tobytes`, so the bytes reflect the row's stored dtype exactly.
        row = np.ascontiguousarray(arr[i])
        rows.append(row.tobytes())
    return rows
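# Byte-level strictness illustrated (a standalone sketch, not executed by
# the checker): the same mathematical row hashed at two dtypes never
# matches, which is exactly the dtype-drift signal `_as_row_list` preserves.
#
#     import hashlib
#     import numpy as np
#
#     row_f16 = np.ones(8, dtype=np.float16)
#     row_f32 = row_f16.astype(np.float32)
#     assert (
#         hashlib.sha256(row_f16.tobytes()).hexdigest()
#         != hashlib.sha256(row_f32.tobytes()).hexdigest()
#     )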