"""Single-file VL GGUF emitter for SUPPORTED vision-language bases.

Entered only when `arch_probe.probe_gguf_arch` returns
`SupportLevel.SUPPORTED` (today: `Qwen2VLForConditionalGeneration` at
the pinned llama.cpp tag). PaliGemma and InternVL2 remain UNSUPPORTED
upstream and route to the HF-snapshot fallback via the dispatcher.

**Merged-only.** `convert_lora_to_gguf.py` doesn't recognize the
`model.language_model.*` layer-naming convention that VL adapters
carry, so we refuse non-merged plans and let the dispatcher fall back
to HF-snapshot. The renderer emits `FROM ./base.<quant>.gguf` with no
`ADAPTER` line as a consequence. See `render_vl_modelfile` docstring.
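
A merged-only Modelfile therefore reduces to roughly this shape
(illustrative sketch; the exact directive set is owned by
`render_vl_modelfile`, and `<quant>` stands for the plan's quant):

    FROM ./base.<quant>.gguf
    # ...remaining directives from the renderer; no ADAPTER line...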

**Single-file vs mmproj.** Upstream at tag `b8816` writes a single
GGUF for Qwen2-VL — the ViT is dropped and Ollama's preprocessor runs
the vision path on its own. Manifest records `mmproj_path=None`; a
future tag that changes this (split emission with an `mmproj-*.gguf`
sidecar) would add a new field without breaking the single-file
contract.

**No imatrix.** The replay corpus is text-only; imatrix calibration
would mis-weight vision-adjacent layers on any future VL base that
bakes projector tensors into the GGUF. Quant proceeds without
`--imatrix`.
"""

from __future__ import annotations

import json
import logging
import tempfile
from collections.abc import Callable, Sequence
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any

from dlm.export import base_gguf, merge, preflight
from dlm.export.arch_probe import ArchProbeResult, SupportLevel
from dlm.export.errors import ExportError, VlGgufUnsupportedError
from dlm.export.manifest import (
    ExportManifest,
    build_artifact,
    compute_sha256,
    save_export_manifest,
    utc_now,
)
from dlm.export.ollama.vl_modelfile import VlModelfileContext, render_vl_modelfile
from dlm.export.plan import ExportPlan
from dlm.export.precision_safety import require_dequantize_or_refuse
from dlm.export.quantize import run_checked
from dlm.io.atomic import write_text as atomic_write_text

if TYPE_CHECKING:
    from dlm.base_models import BaseModelSpec
    from dlm.store.paths import StorePath

_LOG = logging.getLogger(__name__)

# Injection seam matching `runner.py` — a callable that runs subprocess
# args and returns a `CompletedProcess`-shaped result. Unit tests
# substitute a recorder; production passes through to `run_checked`.
SubprocessRunner = Callable[[Sequence[str]], Any]
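#
# A test-side recorder might look like this (sketch; the real doubles
# live in the test suite, and `subprocess.CompletedProcess` is the
# stdlib type):
#
#     calls: list[list[str]] = []
#
#     def recorder(args: Sequence[str]) -> subprocess.CompletedProcess[str]:
#         calls.append(list(args))
#         return subprocess.CompletedProcess(list(args), returncode=0)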


@dataclass(frozen=True)
class VlGgufResult:
    """Return value of `run_vl_gguf_export` — what the CLI prints."""

    export_dir: Path
    manifest_path: Path
    modelfile_path: Path
    gguf_path: Path
    mmproj_path: Path | None  # reserved for future split-emission archs
    quant: str
    llama_cpp_tag: str | None
    artifacts: list[Path]


def _assert_supported(verdict: ArchProbeResult, plan: ExportPlan) -> None:
    """Gate: refuse anything the emitter doesn't claim to handle.

    Three preconditions, each with a distinct error message so the
    dispatcher's fallback banner names the actual reason:

    1. `verdict.support is SupportLevel.SUPPORTED` — otherwise upstream
       `convert_hf_to_gguf.py` would either fail outright (UNSUPPORTED)
       or drop critical tensors (PARTIAL → would ship an
       under-converted GGUF silently).
    2. `plan.merged is True` — LoRA-to-GGUF for VL archs isn't
       plumbed upstream at our tag; merged-only is the safe shape.
    3. `plan.imatrix == "off"` — the replay corpus is text-only;
       importance-matrix calibration would mis-weight vision-adjacent
       layers once a future arch bakes them into the GGUF.
    """
    if verdict.support is not SupportLevel.SUPPORTED:
        raise VlGgufUnsupportedError(
            f"arch {verdict.arch_class!r} verdict={verdict.support.value!r} "
            f"at llama.cpp tag={verdict.llama_cpp_tag!r}; single-file VL GGUF "
            "emission requires SUPPORTED. Fallback: HF-snapshot."
        )
    if not plan.merged:
        raise VlGgufUnsupportedError(
            "VL GGUF emission is merged-only at this upstream tag (LoRA-to-GGUF "
            "for VL archs isn't supported by convert_lora_to_gguf.py). Pass "
            "--merged, or fall back to HF-snapshot."
        )
    if plan.imatrix != "off":
        raise VlGgufUnsupportedError(
            "VL GGUF emission refuses imatrix calibration: the replay corpus "
            "is text-only and would mis-weight vision-adjacent quant stats. "
            "Pass --no-imatrix (or omit --imatrix) to proceed."
        )


def _resolve_adapter(
    store: StorePath,
    *,
    adapter_name: str | None,
) -> tuple[Path, int]:
    """Return (adapter_path, version) with a typed refusal on empty stores.

    Mirrors the dispatch logic in `runner._resolve_adapter_for_export`
    but scoped to the VL path so the VL module stays self-contained.
    Callers receive a `VlGgufUnsupportedError` (not a generic
    `ExportError`) so the dispatcher's fallback banner fires correctly.
    """
    if adapter_name is None:
        resolved = store.resolve_current_adapter()
        pointer = store.adapter_current_pointer
    else:
        resolved = store.resolve_current_adapter_for(adapter_name)
        pointer = store.adapter_current_pointer_for(adapter_name)
    if resolved is None or not resolved.exists():
        raise VlGgufUnsupportedError(
            f"no current adapter under {pointer}; run `dlm train` before exporting."
        )
    version = _version_from_dir_name(resolved)
    return resolved, version


def _version_from_dir_name(path: Path) -> int:
    """Parse a `vNN` adapter dir name into its version; default to 1."""
    stem = path.name
    if not stem.startswith("v") or not stem[1:].isdigit():
        return 1
    return int(stem[1:])


def run_vl_gguf_export(
    store: StorePath,
    spec: BaseModelSpec,
    plan: ExportPlan,
    *,
    verdict: ArchProbeResult,
    cached_base_dir: Path,
    adapter_name: str | None = None,
    system_prompt: str | None = None,
    source_dlm_path: Path | None = None,
    dlm_version: str = "dev",
    training_sequence_len: int | None = None,
    subprocess_runner: SubprocessRunner | None = None,
    merge_runner: Callable[..., None] | None = None,
    llama_cpp_root_override: Path | None = None,
) -> VlGgufResult:
    """Orchestrate merge → convert → quantize → Modelfile for a VL base.

    `subprocess_runner` and `merge_runner` are injection seams: the
    production path wires in `run_checked` + `merge.perform_vl_merge`;
    unit tests substitute recorders. Every arg after `plan` is
    keyword-only — the production call is verbose but unambiguous.

    Returns a `VlGgufResult`; raises `VlGgufUnsupportedError` or
    `ExportError` on any precondition or subprocess failure. The
    dispatcher catches and falls back to HF-snapshot.
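
    A test-side call might look like this (sketch; the names here are
    illustrative doubles, not definitions from this codebase):

        result = run_vl_gguf_export(
            fake_store,
            fake_spec,
            plan,
            verdict=supported_verdict,
            cached_base_dir=tmp_base_dir,
            subprocess_runner=recorder,
            merge_runner=fake_merge,
        )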
    """
    _assert_supported(verdict, plan)

    adapter_path, adapter_version = _resolve_adapter(store, adapter_name=adapter_name)

    preflight.check_adapter_config(adapter_path, spec)
    preflight.check_tokenizer_vocab(adapter_path)
    preflight.check_chat_template(adapter_path, required=False)
    preflight.check_vl_target_modules_lm_only(adapter_path)
    require_dequantize_or_refuse(plan, adapter_path)

    export_dir = store.exports / f"vl-gguf-{plan.quant}"
    export_dir.mkdir(parents=True, exist_ok=True)

    base_gguf_name = f"base.{plan.quant}.gguf"
    gguf_path = export_dir / base_gguf_name
    modelfile_path = export_dir / "Modelfile"

    run = subprocess_runner if subprocess_runner is not None else _default_runner
    do_merge = merge_runner if merge_runner is not None else merge.perform_vl_merge

    # Merge → fp16 HF dir → GGUF → quantized GGUF. Temp dir holds the
    # merged HF snapshot + fp16 GGUF — both are multi-GB artifacts we
    # don't need after the quantized GGUF lands in `export_dir`.
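    # At the pinned tag the two subprocess calls below reduce to roughly
    # (sketch; `base_gguf` owns the exact argv, paths are placeholders):
    #
    #     python convert_hf_to_gguf.py <merged_hf> --outfile <fp16.gguf> --outtype f16
    #     llama-quantize <fp16.gguf> <out.gguf> <QUANT>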
    with tempfile.TemporaryDirectory(prefix="dlm-vl-gguf-") as tmp_s:
        tmp = Path(tmp_s)
        merged_hf = tmp / "merged"
        fp16_gguf = tmp / f"base.{plan.quant}.f16.gguf"

        do_merge(adapter_path, merged_hf, cached_base_dir=cached_base_dir)

        run(
            base_gguf.build_convert_hf_args(
                merged_hf,
                out_fp16=fp16_gguf,
                script_override=llama_cpp_root_override,
            )
        )
        run(
            base_gguf.build_quantize_args(
                fp16_gguf,
                out_quant=gguf_path,
                quant=plan.quant,
                bin_override=llama_cpp_root_override,
            )
        )

    if not gguf_path.exists():
        raise ExportError(
            f"VL GGUF emission: expected {gguf_path} after llama-quantize; "
            "subprocess succeeded but the file is missing (check disk space + "
            "vendored build)."
        )

    modelfile_body = render_vl_modelfile(
        VlModelfileContext(
            spec=spec,
            plan=plan,
            adapter_dir=adapter_path,
            base_gguf_name=base_gguf_name,
            adapter_gguf_name=None,  # merged-only path, no ADAPTER directive
            dlm_id=store.root.name,
            adapter_version=adapter_version,
            system_prompt=system_prompt,
            source_dlm_path=source_dlm_path,
            dlm_version=dlm_version,
            training_sequence_len=training_sequence_len,
        )
    )
    atomic_write_text(modelfile_path, modelfile_body)

    artifacts_files = [gguf_path, modelfile_path]
    manifest = ExportManifest(
        created_at=utc_now(),
        created_by=dlm_version,
        base_model_hf_id=spec.hf_id,
        base_model_revision=spec.revision,
        quant=plan.quant,
        merged=plan.merged,
        ollama_name=None,
        llama_cpp_tag=verdict.llama_cpp_tag,
        adapter_version=adapter_version,
        artifacts=[build_artifact(export_dir, p) for p in artifacts_files],
    )
    manifest_path = save_export_manifest(export_dir, manifest)

    # Attach a compact `vl_gguf.json` sidecar capturing the arch verdict
    # + the mmproj contract explicitly. The main manifest carries the
    # llama.cpp tag but not the SUPPORTED/PARTIAL verdict string.
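    # Sidecar shape (illustrative; see `_write_vl_sidecar` below):
    #   {"schema": 1, "arch_verdict": {...}, "mmproj_path": null,
    #    "gguf_sha256": "<sha256 of the quantized GGUF>"}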
    _write_vl_sidecar(export_dir, verdict=verdict, gguf_sha=compute_sha256(gguf_path))

    return VlGgufResult(
        export_dir=export_dir,
        manifest_path=manifest_path,
        modelfile_path=modelfile_path,
        gguf_path=gguf_path,
        mmproj_path=None,
        quant=plan.quant,
        llama_cpp_tag=verdict.llama_cpp_tag,
        artifacts=artifacts_files,
    )


def _default_runner(args: Sequence[str]) -> Any:
    """Production subprocess runner — `run_checked` from the quantize module.

    Pulled out so `run_vl_gguf_export`'s default is easy to override in
    tests via the `subprocess_runner` kwarg.
    """
    return run_checked(list(args), timeout=60 * 60)


def _write_vl_sidecar(
    export_dir: Path,
    *,
    verdict: ArchProbeResult,
    gguf_sha: str,
) -> None:
    """Persist the arch verdict + mmproj contract alongside the manifest."""
    sidecar = {
        "schema": 1,
        "arch_verdict": {
            "architecture": verdict.arch_class,
            "support": verdict.support.value,
            "llama_cpp_tag": verdict.llama_cpp_tag,
            "reason": verdict.reason,
        },
        "mmproj_path": None,
        "gguf_sha256": gguf_sha,
    }
    atomic_write_text(
        export_dir / "vl_gguf.json",
        json.dumps(sidecar, indent=2, sort_keys=True) + "\n",
    )