"""Single-file VL GGUF emitter for SUPPORTED vision-language bases.

Entered only when `arch_probe.probe_gguf_arch` returns
`SupportLevel.SUPPORTED` (today: `Qwen2VLForConditionalGeneration` at
the pinned llama.cpp tag). PaliGemma and InternVL2 remain UNSUPPORTED
upstream and route to the HF-snapshot fallback via the dispatcher.

**Merged-only.** `convert_lora_to_gguf.py` doesn't recognize the
`model.language_model.*` layer-naming convention that VL adapters
carry, so we refuse non-merged plans and let the dispatcher fall back
to HF-snapshot. The renderer emits `FROM ./base.<quant>.gguf` with no
`ADAPTER` line as a consequence. See `render_vl_modelfile` docstring.
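
A merged-only Modelfile therefore reduces to roughly this shape
(illustrative sketch; the exact directive set is owned by
`render_vl_modelfile`, and `<quant>` stands for the plan's quant):

    FROM ./base.<quant>.gguf
    # ...remaining directives from the renderer; no ADAPTER line...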

**Single-file vs mmproj.** Upstream at tag `b8816` writes a single
GGUF for Qwen2-VL — the ViT is dropped and Ollama's preprocessor runs
the vision path on its own. Manifest records `mmproj_path=None`; a
future tag that changes this (split emission with an `mmproj-*.gguf`
sidecar) would add a new field without breaking the single-file
contract.

**No imatrix.** The replay corpus is text-only; imatrix calibration
would mis-weight vision-adjacent layers on any future VL base that
bakes projector tensors into the GGUF. Quant proceeds without
`--imatrix`.
"""

from __future__ import annotations

import json
import logging
import tempfile
from collections.abc import Callable, Sequence
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any

from dlm.export import base_gguf, merge, preflight
from dlm.export.arch_probe import ArchProbeResult, SupportLevel
from dlm.export.errors import ExportError, VlGgufUnsupportedError
from dlm.export.manifest import (
    ExportManifest,
    build_artifact,
    compute_sha256,
    save_export_manifest,
    utc_now,
)
from dlm.export.ollama.vl_modelfile import VlModelfileContext, render_vl_modelfile
from dlm.export.plan import ExportPlan
from dlm.export.precision_safety import require_dequantize_or_refuse
from dlm.export.quantize import run_checked
from dlm.io.atomic import write_text as atomic_write_text

if TYPE_CHECKING:
    from dlm.base_models import BaseModelSpec
    from dlm.store.paths import StorePath

_LOG = logging.getLogger(__name__)

# Injection seam matching `runner.py` — a callable that runs subprocess
# args and returns a `CompletedProcess`-shaped result. Unit tests
# substitute a recorder; production passes through to `run_checked`.
SubprocessRunner = Callable[[Sequence[str]], Any]
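#
# A test-side recorder might look like this (sketch; the real doubles
# live in the test suite, and `subprocess.CompletedProcess` is the
# stdlib type):
#
#     calls: list[list[str]] = []
#
#     def recorder(args: Sequence[str]) -> subprocess.CompletedProcess[str]:
#         calls.append(list(args))
#         return subprocess.CompletedProcess(list(args), returncode=0)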


@dataclass(frozen=True)
class VlGgufResult:
    """Return value of `run_vl_gguf_export` — what the CLI prints."""

    export_dir: Path
    manifest_path: Path
    modelfile_path: Path
    gguf_path: Path
    mmproj_path: Path | None  # reserved for future split-emission archs
    quant: str
    llama_cpp_tag: str | None
    artifacts: list[Path]


def _assert_supported(verdict: ArchProbeResult, plan: ExportPlan) -> None:
    """Gate: refuse anything the emitter doesn't claim to handle.

    Three preconditions, each with a distinct error message so the
    dispatcher's fallback banner names the actual reason:

    1. `verdict.support is SupportLevel.SUPPORTED` — otherwise upstream
       `convert_hf_to_gguf.py` would either fail outright (UNSUPPORTED)
       or drop critical tensors (PARTIAL → would ship an
       under-converted GGUF silently).
    2. `plan.merged is True` — LoRA-to-GGUF for VL archs isn't
       plumbed upstream at our tag; merged-only is the safe shape.
    3. `plan.imatrix == "off"` — the replay corpus is text-only;
       importance-matrix calibration would mis-weight vision-adjacent
       layers once a future arch bakes them into the GGUF.
    """
    if verdict.support is not SupportLevel.SUPPORTED:
        raise VlGgufUnsupportedError(
            f"arch {verdict.arch_class!r} verdict={verdict.support.value!r} "
            f"at llama.cpp tag={verdict.llama_cpp_tag!r}; single-file VL GGUF "
            "emission requires SUPPORTED. Fallback: HF-snapshot."
        )
    if not plan.merged:
        raise VlGgufUnsupportedError(
            "VL GGUF emission is merged-only at this upstream tag (LoRA-to-GGUF "
            "for VL archs isn't supported by convert_lora_to_gguf.py). Pass "
            "--merged, or fall back to HF-snapshot."
        )
    if plan.imatrix != "off":
        raise VlGgufUnsupportedError(
            "VL GGUF emission refuses imatrix calibration: the replay corpus "
            "is text-only and would mis-weight vision-adjacent quant stats. "
            "Pass --no-imatrix (or omit --imatrix) to proceed."
        )


def _resolve_adapter(
    store: StorePath,
    *,
    adapter_name: str | None,
) -> tuple[Path, int]:
    """Return (adapter_path, version) with a typed refusal on empty stores.

    Mirrors the dispatch logic in `runner._resolve_adapter_for_export`
    but scoped to the VL path so the VL module stays self-contained.
    Callers receive a `VlGgufUnsupportedError` (not a generic
    `ExportError`) so the dispatcher's fallback banner fires correctly.
    """
    if adapter_name is None:
        resolved = store.resolve_current_adapter()
        pointer = store.adapter_current_pointer
    else:
        resolved = store.resolve_current_adapter_for(adapter_name)
        pointer = store.adapter_current_pointer_for(adapter_name)
    if resolved is None or not resolved.exists():
        raise VlGgufUnsupportedError(
            f"no current adapter under {pointer}; run `dlm train` before exporting."
        )
    version = _version_from_dir_name(resolved)
    return resolved, version


def _version_from_dir_name(path: Path) -> int:
    """Parse a `vNN` adapter dir name into its version; default to 1."""
    stem = path.name
    if not stem.startswith("v") or not stem[1:].isdigit():
        return 1
    return int(stem[1:])


def run_vl_gguf_export(
    store: StorePath,
    spec: BaseModelSpec,
    plan: ExportPlan,
    *,
    verdict: ArchProbeResult,
    cached_base_dir: Path,
    adapter_name: str | None = None,
    system_prompt: str | None = None,
    source_dlm_path: Path | None = None,
    dlm_version: str = "dev",
    training_sequence_len: int | None = None,
    subprocess_runner: SubprocessRunner | None = None,
    merge_runner: Callable[..., None] | None = None,
    llama_cpp_root_override: Path | None = None,
) -> VlGgufResult:
    """Orchestrate merge → convert → quantize → Modelfile for a VL base.

    `subprocess_runner` and `merge_runner` are injection seams: the
    production path wires in `run_checked` + `merge.perform_vl_merge`;
    unit tests substitute recorders. Every arg after `plan` is
    keyword-only — the production call is verbose but unambiguous.

    Returns a `VlGgufResult`; raises `VlGgufUnsupportedError` or
    `ExportError` on any precondition or subprocess failure. The
    dispatcher catches and falls back to HF-snapshot.
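
    A test-side call might look like this (sketch; the names here are
    illustrative doubles, not definitions from this codebase):

        result = run_vl_gguf_export(
            fake_store,
            fake_spec,
            plan,
            verdict=supported_verdict,
            cached_base_dir=tmp_base_dir,
            subprocess_runner=recorder,
            merge_runner=fake_merge,
        )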
    """
    _assert_supported(verdict, plan)

    adapter_path, adapter_version = _resolve_adapter(store, adapter_name=adapter_name)

    preflight.check_adapter_config(adapter_path, spec)
    preflight.check_tokenizer_vocab(adapter_path)
    preflight.check_chat_template(adapter_path, required=False)
    preflight.check_vl_target_modules_lm_only(adapter_path)
    require_dequantize_or_refuse(plan, adapter_path)

    export_dir = store.exports / f"vl-gguf-{plan.quant}"
    export_dir.mkdir(parents=True, exist_ok=True)

    base_gguf_name = f"base.{plan.quant}.gguf"
    gguf_path = export_dir / base_gguf_name
    modelfile_path = export_dir / "Modelfile"

    run = subprocess_runner if subprocess_runner is not None else _default_runner
    do_merge = merge_runner if merge_runner is not None else merge.perform_vl_merge

    # Merge → fp16 HF dir → GGUF → quantized GGUF. Temp dir holds the
    # merged HF snapshot + fp16 GGUF — both are multi-GB artifacts we
    # don't need after the quantized GGUF lands in `export_dir`.
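    # At the pinned tag the two subprocess calls below reduce to roughly
    # (sketch; `base_gguf` owns the exact argv, paths are placeholders):
    #
    #     python convert_hf_to_gguf.py <merged_hf> --outfile <fp16.gguf> --outtype f16
    #     llama-quantize <fp16.gguf> <out.gguf> <QUANT>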
    with tempfile.TemporaryDirectory(prefix="dlm-vl-gguf-") as tmp_s:
        tmp = Path(tmp_s)
        merged_hf = tmp / "merged"
        fp16_gguf = tmp / f"base.{plan.quant}.f16.gguf"

        do_merge(adapter_path, merged_hf, cached_base_dir=cached_base_dir)

        run(
            base_gguf.build_convert_hf_args(
                merged_hf,
                out_fp16=fp16_gguf,
                script_override=llama_cpp_root_override,
            )
        )
        run(
            base_gguf.build_quantize_args(
                fp16_gguf,
                out_quant=gguf_path,
                quant=plan.quant,
                bin_override=llama_cpp_root_override,
            )
        )

    if not gguf_path.exists():
        raise ExportError(
            f"VL GGUF emission: expected {gguf_path} after llama-quantize; "
            "subprocess succeeded but the file is missing (check disk space + "
            "vendored build)."
        )

    modelfile_body = render_vl_modelfile(
        VlModelfileContext(
            spec=spec,
            plan=plan,
            adapter_dir=adapter_path,
            base_gguf_name=base_gguf_name,
            adapter_gguf_name=None,  # merged-only path, no ADAPTER directive
            dlm_id=store.root.name,
            adapter_version=adapter_version,
            system_prompt=system_prompt,
            source_dlm_path=source_dlm_path,
            dlm_version=dlm_version,
            training_sequence_len=training_sequence_len,
        )
    )
    atomic_write_text(modelfile_path, modelfile_body)

    artifacts_files = [gguf_path, modelfile_path]
    manifest = ExportManifest(
        created_at=utc_now(),
        created_by=dlm_version,
        base_model_hf_id=spec.hf_id,
        base_model_revision=spec.revision,
        quant=plan.quant,
        merged=plan.merged,
        ollama_name=None,
        llama_cpp_tag=verdict.llama_cpp_tag,
        adapter_version=adapter_version,
        artifacts=[build_artifact(export_dir, p) for p in artifacts_files],
    )
    manifest_path = save_export_manifest(export_dir, manifest)

    # Attach a compact `vl_gguf.json` sidecar capturing the arch verdict
    # + the mmproj contract explicitly. The main manifest carries the
    # llama.cpp tag but not the SUPPORTED/PARTIAL verdict string.
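    # Sidecar shape (illustrative; see `_write_vl_sidecar` below):
    #   {"schema": 1, "arch_verdict": {...}, "mmproj_path": null,
    #    "gguf_sha256": "<sha256 of the quantized GGUF>"}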
    _write_vl_sidecar(export_dir, verdict=verdict, gguf_sha=compute_sha256(gguf_path))

    return VlGgufResult(
        export_dir=export_dir,
        manifest_path=manifest_path,
        modelfile_path=modelfile_path,
        gguf_path=gguf_path,
        mmproj_path=None,
        quant=plan.quant,
        llama_cpp_tag=verdict.llama_cpp_tag,
        artifacts=artifacts_files,
    )


def _default_runner(args: Sequence[str]) -> Any:
    """Production subprocess runner — `run_checked` from the quantize module.

    Pulled out so `run_vl_gguf_export`'s default is easy to override in
    tests via the `subprocess_runner` kwarg.
    """
    return run_checked(list(args), timeout=60 * 60)


def _write_vl_sidecar(
    export_dir: Path,
    *,
    verdict: ArchProbeResult,
    gguf_sha: str,
) -> None:
    """Persist the arch verdict + mmproj contract alongside the manifest."""
    sidecar = {
        "schema": 1,
        "arch_verdict": {
            "architecture": verdict.arch_class,
            "support": verdict.support.value,
            "llama_cpp_tag": verdict.llama_cpp_tag,
            "reason": verdict.reason,
        },
        "mmproj_path": None,
        "gguf_sha256": gguf_sha,
    }
    atomic_write_text(
        export_dir / "vl_gguf.json",
        json.dumps(sidecar, indent=2, sort_keys=True) + "\n",
    )