1 """Single-file VL GGUF emitter for SUPPORTED vision-language bases.
2
3 Entered only when `arch_probe.probe_gguf_arch` returns
4 `SupportLevel.SUPPORTED` (today: `Qwen2VLForConditionalGeneration` at
5 the pinned llama.cpp tag). PaliGemma and InternVL2 remain UNSUPPORTED
6 upstream and route to the HF-snapshot fallback via the dispatcher.
7
8 **Merged-only.** `convert_lora_to_gguf.py` doesn't recognize the
9 `model.language_model.*` layer-naming convention that VL adapters
10 carry, so we refuse non-merged plans and let the dispatcher fall back
11 to HF-snapshot. The renderer emits `FROM ./base.<quant>.gguf` with no
12 `ADAPTER` line as a consequence. See `render_vl_modelfile` docstring.
13
14 **Single-file vs mmproj.** Upstream at tag `b8816` writes a single
15 GGUF for Qwen2-VL — the ViT is dropped and Ollama's preprocessor runs
16 the vision path on its own. Manifest records `mmproj_path=None`; a
17 future tag that changes this (split emission with an `mmproj-*.gguf`
18 sidecar) would add a new field without breaking the single-file
19 contract.
20
21 **No imatrix.** The replay corpus is text-only; imatrix calibration
22 would mis-weight vision-adjacent layers on any future VL base that
23 bakes projector tensors into the GGUF. Quant proceeds without
24 `--imatrix`.
25 """

from __future__ import annotations

import json
import logging
import tempfile
from collections.abc import Callable, Sequence
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any

from dlm.export import base_gguf, merge, preflight
from dlm.export.arch_probe import ArchProbeResult, SupportLevel
from dlm.export.errors import ExportError, VlGgufUnsupportedError
from dlm.export.manifest import (
    ExportManifest,
    build_artifact,
    compute_sha256,
    save_export_manifest,
    utc_now,
)
from dlm.export.ollama.vl_modelfile import VlModelfileContext, render_vl_modelfile
from dlm.export.plan import ExportPlan
from dlm.export.precision_safety import require_dequantize_or_refuse
from dlm.export.quantize import run_checked
from dlm.io.atomic import write_text as atomic_write_text

if TYPE_CHECKING:
    from dlm.base_models import BaseModelSpec
    from dlm.store.paths import StorePath

_LOG = logging.getLogger(__name__)

# Injection seam matching `runner.py` — a callable that runs subprocess
# args and returns a `CompletedProcess`-shaped result. Unit tests
# substitute a recorder; production passes through to `run_checked`.
SubprocessRunner = Callable[[Sequence[str]], Any]
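
# A minimal test-seam sketch (hypothetical test code, not part of this
# module's API): a recorder satisfying `SubprocessRunner` that captures
# argv lists instead of spawning processes.
#
#     import subprocess
#
#     calls: list[list[str]] = []
#
#     def recorder(args: Sequence[str]) -> Any:
#         calls.append(list(args))
#         return subprocess.CompletedProcess(list(args), returncode=0)
#
#     result = run_vl_gguf_export(..., subprocess_runner=recorder)
#     assert calls  # convert + quantize argv lists were recorded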


@dataclass(frozen=True)
class VlGgufResult:
    """Return value of `run_vl_gguf_export` — what the CLI prints."""

    export_dir: Path
    manifest_path: Path
    modelfile_path: Path
    gguf_path: Path
    mmproj_path: Path | None  # reserved for future split-emission archs
    quant: str
    llama_cpp_tag: str | None
    artifacts: list[Path]


def _assert_supported(verdict: ArchProbeResult, plan: ExportPlan) -> None:
    """Gate: refuse anything the emitter doesn't claim to handle.

    Three preconditions, each with a distinct error message so the
    dispatcher's fallback banner names the actual reason:

    1. `verdict.support is SupportLevel.SUPPORTED` — otherwise upstream
       `convert_hf_to_gguf.py` would either fail outright (UNSUPPORTED)
       or drop critical tensors (PARTIAL → would ship an
       under-converted GGUF silently).
    2. `plan.merged is True` — LoRA-to-GGUF for VL archs isn't
       plumbed upstream at our tag; merged-only is the safe shape.
    3. `plan.imatrix == "off"` — the replay corpus is text-only;
       importance-matrix calibration would mis-weight vision-adjacent
       layers once a future arch bakes them into the GGUF.
    """
    if verdict.support is not SupportLevel.SUPPORTED:
        raise VlGgufUnsupportedError(
            f"arch {verdict.arch_class!r} verdict={verdict.support.value!r} "
            f"at llama.cpp tag={verdict.llama_cpp_tag!r}; single-file VL GGUF "
            "emission requires SUPPORTED. Fallback: HF-snapshot."
        )
    if not plan.merged:
        raise VlGgufUnsupportedError(
            "VL GGUF emission is merged-only at this upstream tag (LoRA-to-GGUF "
            "for VL archs isn't supported by convert_lora_to_gguf.py). Pass "
            "--merged, or fall back to HF-snapshot."
        )
    if plan.imatrix != "off":
        raise VlGgufUnsupportedError(
            "VL GGUF emission refuses imatrix calibration: the replay corpus "
            "is text-only and would mis-weight vision-adjacent quant stats. "
            "Pass --no-imatrix (or omit --imatrix) to proceed."
        )

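# Dispatcher contract, as an illustrative sketch (the real dispatcher
# lives elsewhere and `run_hf_snapshot_export` is a hypothetical name):
# every refusal above raises `VlGgufUnsupportedError`, so one except
# clause routes to the HF-snapshot fallback.
#
#     try:
#         result = run_vl_gguf_export(store, spec, plan, verdict=verdict, ...)
#     except VlGgufUnsupportedError as exc:
#         _LOG.warning("VL GGUF refused: %s; falling back to HF-snapshot", exc)
#         result = run_hf_snapshot_export(store, spec, plan)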

def _resolve_adapter(
    store: StorePath,
    *,
    adapter_name: str | None,
) -> tuple[Path, int]:
    """Return (adapter_path, version) with a typed refusal on empty stores.

    Mirrors the dispatch logic in `runner._resolve_adapter_for_export`
    but is scoped to the VL path so the VL module stays self-contained.
    Callers receive a `VlGgufUnsupportedError` (not a generic
    `ExportError`) so the dispatcher's fallback banner fires correctly.
    """
    if adapter_name is None:
        resolved = store.resolve_current_adapter()
        pointer = store.adapter_current_pointer
    else:
        resolved = store.resolve_current_adapter_for(adapter_name)
        pointer = store.adapter_current_pointer_for(adapter_name)
    if resolved is None or not resolved.exists():
        raise VlGgufUnsupportedError(
            f"no current adapter under {pointer}; run `dlm train` before exporting."
        )
    version = _version_from_dir_name(resolved)
    return resolved, version


def _version_from_dir_name(path: Path) -> int:
    """Parse a `v<digits>` directory name into a version, defaulting to 1."""
    stem = path.name
    if not stem.startswith("v") or not stem[1:].isdigit():
        return 1
    return int(stem[1:])

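# Behavior sketch (hypothetical store paths, shown for orientation):
#
#     _version_from_dir_name(Path("store/adapters/default/v3"))    # -> 3
#     _version_from_dir_name(Path("store/adapters/default/best"))  # -> 1 (fallback)
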
147
148 def run_vl_gguf_export(
149 store: StorePath,
150 spec: BaseModelSpec,
151 plan: ExportPlan,
152 *,
153 verdict: ArchProbeResult,
154 cached_base_dir: Path,
155 adapter_name: str | None = None,
156 system_prompt: str | None = None,
157 source_dlm_path: Path | None = None,
158 dlm_version: str = "dev",
159 training_sequence_len: int | None = None,
160 subprocess_runner: SubprocessRunner | None = None,
161 merge_runner: Callable[..., None] | None = None,
162 llama_cpp_root_override: Path | None = None,
163 ) -> VlGgufResult:
164 """Orchestrate merge → convert → quantize → Modelfile for a VL base.
165
166 `subprocess_runner` and `merge_runner` are injection seams: the
167 production path wires in `run_checked` + `merge.perform_vl_merge`;
168 unit tests substitute recorders. Every arg after `plan` is
169 keyword-only — the production call is verbose but unambiguous.
170
171 Returns a `VlGgufResult`; raises `VlGgufUnsupportedError` or
172 `ExportError` on any precondition or subprocess failure. The
173 dispatcher catches and falls back to HF-snapshot.
174 """
175 _assert_supported(verdict, plan)
176
177 adapter_path, adapter_version = _resolve_adapter(store, adapter_name=adapter_name)
178
179 preflight.check_adapter_config(adapter_path, spec)
180 preflight.check_tokenizer_vocab(adapter_path)
181 preflight.check_chat_template(adapter_path, required=False)
182 preflight.check_vl_target_modules_lm_only(adapter_path)
183 require_dequantize_or_refuse(plan, adapter_path)
184
185 export_dir = store.exports / f"vl-gguf-{plan.quant}"
186 export_dir.mkdir(parents=True, exist_ok=True)
187
188 base_gguf_name = f"base.{plan.quant}.gguf"
189 gguf_path = export_dir / base_gguf_name
190 modelfile_path = export_dir / "Modelfile"
191
192 run = subprocess_runner if subprocess_runner is not None else _default_runner
193 do_merge = merge_runner if merge_runner is not None else merge.perform_vl_merge
194
195 # Merge → fp16 HF dir → GGUF → quantized GGUF. Temp dir holds the
196 # merged HF snapshot + fp16 GGUF — both are multi-GB artifacts we
197 # don't need after the quantized GGUF lands in `export_dir`.
198 with tempfile.TemporaryDirectory(prefix="dlm-vl-gguf-") as tmp_s:
199 tmp = Path(tmp_s)
200 merged_hf = tmp / "merged"
201 fp16_gguf = tmp / f"base.{plan.quant}.f16.gguf"
202
203 do_merge(adapter_path, merged_hf, cached_base_dir=cached_base_dir)
204
205 run(
206 base_gguf.build_convert_hf_args(
207 merged_hf,
208 out_fp16=fp16_gguf,
209 script_override=llama_cpp_root_override,
210 )
211 )
212 run(
213 base_gguf.build_quantize_args(
214 fp16_gguf,
215 out_quant=gguf_path,
216 quant=plan.quant,
217 bin_override=llama_cpp_root_override,
218 )
219 )
220
221 if not gguf_path.exists():
222 raise ExportError(
223 f"VL GGUF emission: expected {gguf_path} after llama-quantize; "
224 "subprocess succeeded but the file is missing (check disk space + "
225 "vendored build)."
226 )
227
228 modelfile_body = render_vl_modelfile(
229 VlModelfileContext(
230 spec=spec,
231 plan=plan,
232 adapter_dir=adapter_path,
233 base_gguf_name=base_gguf_name,
234 adapter_gguf_name=None, # merged-only path, no ADAPTER directive
235 dlm_id=store.root.name,
236 adapter_version=adapter_version,
237 system_prompt=system_prompt,
238 source_dlm_path=source_dlm_path,
239 dlm_version=dlm_version,
240 training_sequence_len=training_sequence_len,
241 )
242 )
243 atomic_write_text(modelfile_path, modelfile_body)
244
245 artifacts_files = [gguf_path, modelfile_path]
246 manifest = ExportManifest(
247 created_at=utc_now(),
248 created_by=dlm_version,
249 base_model_hf_id=spec.hf_id,
250 base_model_revision=spec.revision,
251 quant=plan.quant,
252 merged=plan.merged,
253 ollama_name=None,
254 llama_cpp_tag=verdict.llama_cpp_tag,
255 adapter_version=adapter_version,
256 artifacts=[build_artifact(export_dir, p) for p in artifacts_files],
257 )
258 manifest_path = save_export_manifest(export_dir, manifest)
259
260 # Attach a compact `vl_gguf.json` sidecar capturing the arch verdict
261 # + the mmproj contract explicitly. The main manifest carries the
262 # llama.cpp tag but not the SUPPORTED/PARTIAL verdict string.
263 _write_vl_sidecar(export_dir, verdict=verdict, gguf_sha=compute_sha256(gguf_path))
264
265 return VlGgufResult(
266 export_dir=export_dir,
267 manifest_path=manifest_path,
268 modelfile_path=modelfile_path,
269 gguf_path=gguf_path,
270 mmproj_path=None,
271 quant=plan.quant,
272 llama_cpp_tag=verdict.llama_cpp_tag,
273 artifacts=artifacts_files,
274 )
275
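# Production wiring sketch (illustrative values; the defaults already
# resolve to `run_checked` via `_default_runner` and to
# `merge.perform_vl_merge`):
#
#     result = run_vl_gguf_export(
#         store,
#         spec,
#         plan,
#         verdict=verdict,
#         cached_base_dir=cached_base_dir,
#         system_prompt=None,
#         dlm_version="dev",
#     )
#     print(result.gguf_path, result.llama_cpp_tag)
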

def _default_runner(args: Sequence[str]) -> Any:
    """Production subprocess runner — `run_checked` from the quantize module.

    Pulled out so `run_vl_gguf_export`'s default is easy to override in
    tests via the `subprocess_runner` kwarg.
    """
    return run_checked(list(args), timeout=60 * 60)


def _write_vl_sidecar(
    export_dir: Path,
    *,
    verdict: ArchProbeResult,
    gguf_sha: str,
) -> None:
    """Persist the arch verdict + mmproj contract alongside the manifest."""
    sidecar = {
        "schema": 1,
        "arch_verdict": {
            "architecture": verdict.arch_class,
            "support": verdict.support.value,
            "llama_cpp_tag": verdict.llama_cpp_tag,
            "reason": verdict.reason,
        },
        "mmproj_path": None,
        "gguf_sha256": gguf_sha,
    }
    atomic_write_text(
        export_dir / "vl_gguf.json",
        json.dumps(sidecar, indent=2, sort_keys=True) + "\n",
    )
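
# Example `vl_gguf.json` payload (illustrative values; the real verdict
# string, tag, reason, and digest come from the probe and the emitted
# file). Keys appear sorted because the writer passes `sort_keys=True`:
#
#     {
#       "arch_verdict": {
#         "architecture": "Qwen2VLForConditionalGeneration",
#         "llama_cpp_tag": "b8816",
#         "reason": "...",
#         "support": "supported"
#       },
#       "gguf_sha256": "<hex digest>",
#       "mmproj_path": null,
#       "schema": 1
#     }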