Python · 11923 bytes Raw Blame History
1 """HF-snapshot export for vision-language bases.
2
3 GGUF conversion for VL architectures is in flux upstream
4 (`convert_hf_to_gguf.py` doesn't register PaliGemma / Qwen2-VL /
5 InternVL2 consistently yet), so this path refuses to emit GGUF and
6 writes an HF snapshot instead: a self-contained directory that a
7 downstream user can load with `AutoProcessor.from_pretrained` +
8 `AutoModelForImageTextToText.from_pretrained` + `PeftModel.from_pretrained`.
9
10 Layout under `exports/hf-snapshot/`:
11
12 adapter/ # PEFT adapter dir (copy of the current store adapter)
13 processor/ # processor config + tokenizer files
14 snapshot_manifest.json # see VlSnapshotManifest
15 README.md # how-to-load instructions for the recipient
16
17 The BASE model weights are NOT copied — they're pinned by `hf_id`
18 + `revision` in the manifest. Recipients download the base once,
19 keyed on that revision. Bundling the base would balloon pack size
20 to 6 GB+ and conflicts with Gemma / Llama licensing for most VL
21 bases (they are `redistributable=False`).
22
23 The `export_target` field on the manifest is the load-bearing flag:
24 a future GGUF path will write `export_target="gguf"` instead.
25 """
26
27 from __future__ import annotations
28
29 import json
30 import shutil
31 from dataclasses import dataclass
32 from datetime import UTC, datetime
33 from pathlib import Path
34 from typing import TYPE_CHECKING, Literal
35
36 from pydantic import BaseModel, ConfigDict, Field
37
38 from dlm.export.errors import ExportError, ExportManifestError
39 from dlm.export.manifest import ExportArtifact, build_artifact, compute_sha256
40 from dlm.io.atomic import write_text
41
42 if TYPE_CHECKING:
43 from dlm.base_models import BaseModelSpec
44 from dlm.store.paths import StorePath
45
46 VL_SNAPSHOT_SUBDIR = "hf-snapshot"
47 SNAPSHOT_MANIFEST_FILENAME = "snapshot_manifest.json"
48 SNAPSHOT_README_FILENAME = "README.md"
49
50
class VlSnapshotManifest(BaseModel):
    """Self-describing record of one HF-snapshot export.

    Parallel to `ExportManifest` but scoped to the VL path. No
    `quant` / `llama_cpp_tag` — the snapshot doesn't run llama.cpp.
    `export_target` is the discriminator: a future GGUF path adds a
    `"gguf"` branch to the same file when upstream converter support
    lands.
    """

    # frozen=True: instances are immutable once built; extra="forbid":
    # a manifest written by a newer dlm with unknown keys fails loudly
    # at validation time instead of silently dropping fields.
    model_config = ConfigDict(extra="forbid", frozen=True)

    # Load-bearing discriminator (see module docstring): a future GGUF
    # path writes export_target="gguf" into the same file.
    export_target: Literal["hf_snapshot"] = "hf_snapshot"
    created_at: datetime
    created_by: str = Field(..., description="dlm version that wrote this manifest.")
    # Base weights are pinned, not bundled: recipients re-download
    # hf_id @ revision themselves (licensing — see module docstring).
    base_model_hf_id: str
    base_model_revision: str
    base_model_architecture: str
    modality: Literal["vision-language"] = "vision-language"
    # Preprocessor facts echoed from the spec's vl_preprocessor_plan so
    # the snapshot is self-describing without the dlm registry.
    image_token: str
    num_image_tokens: int
    target_size: tuple[int, int]
    # Parsed from the adapter dir's vNNNN suffix; degrades to 1 for
    # ephemeral dirs (see _version_from_dir_name), hence ge=1.
    adapter_version: int = Field(..., ge=1)
    adapter_name: str | None = None  # None = the default (unnamed) adapter slot
    rationale: str = Field(
        default=(
            "Vision-language architectures in llama.cpp are in flux; "
            "this build emits an HF-snapshot fallback. A future release "
            "will add a GGUF path when upstream converter support "
            "stabilizes."
        ),
    )
    # Relative-path + sha256 record for each exported file, excluding the
    # manifest and README themselves (populated by run_vl_snapshot_export;
    # consumed by verify_artifacts).
    artifacts: list[ExportArtifact] = Field(default_factory=list)
84
85
@dataclass(frozen=True)
class VlSnapshotResult:
    """Return value of `run_vl_snapshot_export` — what the CLI prints."""

    export_dir: Path  # exports/hf-snapshot/ root
    manifest_path: Path  # export_dir / snapshot_manifest.json
    readme_path: Path  # export_dir / README.md
    adapter_dir: Path  # export_dir / "adapter" (copy of the store adapter)
    processor_dir: Path  # export_dir / "processor"; may not exist when processor=None
    artifacts: list[Path]  # absolute paths of the files recorded in the manifest
96
97
def run_vl_snapshot_export(
    store: StorePath,
    spec: BaseModelSpec,
    *,
    adapter_name: str | None = None,
    adapter_path_override: Path | None = None,
    dlm_version: str = "dlm-0",
    processor: object | None = None,
) -> VlSnapshotResult:
    """Emit a VL HF-snapshot export under `exports/hf-snapshot/`.

    Resolves the adapter dir, copies it into the export directory,
    saves the processor (if supplied) under `processor/`, writes the
    manifest + README, and returns the layout paths.

    `processor=None` lets callers skip the processor save (tests, dry
    runs). Production paths pass an `AutoProcessor` loaded via
    `dlm.train.loader.load_processor`.

    Raises:
        ExportError: if `spec` is not a vision-language base, lacks a
            `vl_preprocessor_plan`, or no adapter can be resolved.
    """
    if spec.modality != "vision-language":
        raise ExportError(
            f"run_vl_snapshot_export: {spec.key!r} is modality={spec.modality!r}; "
            "only vision-language bases go through the HF-snapshot path"
        )
    if spec.vl_preprocessor_plan is None:
        raise ExportError(
            f"run_vl_snapshot_export: {spec.key!r} has modality='vision-language' "
            "but no vl_preprocessor_plan (this is a schema bug — file an issue)"
        )

    adapter_path, adapter_version = _resolve_adapter_for_export(
        store=store,
        adapter_name=adapter_name,
        adapter_path_override=adapter_path_override,
    )

    export_dir = store.exports / VL_SNAPSHOT_SUBDIR
    export_dir.mkdir(parents=True, exist_ok=True)

    adapter_out = export_dir / "adapter"
    _copy_adapter_dir(adapter_path, adapter_out)

    processor_out = export_dir / "processor"
    if processor is not None:
        # Mirror the adapter overwrite semantics (_copy_adapter_dir):
        # wipe any processor dir left by a previous export so repeat
        # runs never mix files from two processor versions.
        if processor_out.exists():
            shutil.rmtree(processor_out)
        processor_out.mkdir(parents=True, exist_ok=True)
        # HF processors implement `save_pretrained(dir)`. The method
        # writes tokenizer + image_processor configs + the processor
        # config itself — everything a recipient needs to re-hydrate.
        save = getattr(processor, "save_pretrained", None)
        if callable(save):
            save(str(processor_out))

    # Record every exported file except the TOP-LEVEL manifest + README
    # (they're written after this scan, so they can't self-reference).
    # Compare paths relative to export_dir, not bare names: the adapter
    # dir may itself contain a README.md (PEFT writes one), and that
    # file must still be hashed and covered by verify_artifacts.
    top_level_meta = {SNAPSHOT_MANIFEST_FILENAME, SNAPSHOT_README_FILENAME}
    artifacts: list[Path] = [
        path
        for path in sorted(export_dir.rglob("*"))
        if path.is_file()
        and path.relative_to(export_dir).as_posix() not in top_level_meta
    ]

    manifest = VlSnapshotManifest(
        created_at=_utc_now(),
        created_by=dlm_version,
        base_model_hf_id=spec.hf_id,
        base_model_revision=spec.revision,
        base_model_architecture=spec.architecture,
        image_token=spec.vl_preprocessor_plan.image_token,
        num_image_tokens=spec.vl_preprocessor_plan.num_image_tokens,
        target_size=spec.vl_preprocessor_plan.target_size,
        adapter_version=adapter_version,
        adapter_name=adapter_name,
        artifacts=[build_artifact(export_dir, p) for p in artifacts],
    )
    manifest_path = _save_manifest(export_dir, manifest)
    readme_path = _write_readme(export_dir, spec=spec, manifest=manifest)

    return VlSnapshotResult(
        export_dir=export_dir,
        manifest_path=manifest_path,
        readme_path=readme_path,
        adapter_dir=adapter_out,
        processor_dir=processor_out,
        artifacts=artifacts,
    )
182
183
184 # --- internals ---------------------------------------------------------------
185
186
def _resolve_adapter_for_export(
    *,
    store: StorePath,
    adapter_name: str | None,
    adapter_path_override: Path | None,
) -> tuple[Path, int]:
    """Return (adapter_dir, version) for the export.

    Same dispatch as `run_export` without the GGUF-specific concerns;
    lives in its own function so unit tests can hit the refusal
    messages without running the whole pipeline.

    Raises:
        ExportError: if the override path is missing, or no current
            adapter pointer resolves to an existing directory.
    """
    # Explicit override wins outright — no store lookup at all.
    if adapter_path_override is not None:
        if adapter_path_override.exists():
            return adapter_path_override, _version_from_dir_name(adapter_path_override)
        raise ExportError(f"adapter_path_override {adapter_path_override} does not exist")

    # Named adapter vs. the default (unnamed) slot.
    if adapter_name is not None:
        resolved = store.resolve_current_adapter_for(adapter_name)
        pointer = store.adapter_current_pointer_for(adapter_name)
    else:
        resolved = store.resolve_current_adapter()
        pointer = store.adapter_current_pointer

    if resolved is not None and resolved.exists():
        return resolved, _version_from_dir_name(resolved)
    raise ExportError(f"no current adapter under {pointer}; run `dlm train` before exporting.")
214
215
216 def _version_from_dir_name(path: Path) -> int:
217 """Parse the `vNNNN` suffix on an adapter version directory name."""
218 stem = path.name
219 if not stem.startswith("v") or not stem[1:].isdigit():
220 # Ephemeral merged-adapter dir won't match vNNNN; degrade to 1
221 # rather than refuse — the export still works, the version is
222 # just cosmetic in the manifest.
223 return 1
224 return int(stem[1:])
225
226
227 def _copy_adapter_dir(src: Path, dst: Path) -> None:
228 """Copy the PEFT adapter directory into the export tree.
229
230 `shutil.copytree` with `dirs_exist_ok=False` so repeat exports
231 don't silently mix versions. Callers that want to overwrite
232 delete `exports/hf-snapshot/` first.
233 """
234 if dst.exists():
235 shutil.rmtree(dst)
236 shutil.copytree(src, dst)
237
238
def _save_manifest(export_dir: Path, manifest: VlSnapshotManifest) -> Path:
    """Serialize the manifest as stable, diff-friendly JSON and write it.

    Keys are sorted and the file ends with a newline so repeat exports
    of identical content produce byte-identical files.
    """
    out_path = export_dir / SNAPSHOT_MANIFEST_FILENAME
    serialized = json.dumps(manifest.model_dump(mode="json"), sort_keys=True, indent=2)
    write_text(out_path, serialized + "\n")
    return out_path
245
246
def _write_readme(
    export_dir: Path,
    *,
    spec: BaseModelSpec,
    manifest: VlSnapshotManifest,
) -> Path:
    """Write a human-readable load-instruction file.

    Recipients of the snapshot directory use this to understand what's
    in the tarball without opening the manifest JSON. Kept terse; the
    manifest is the authoritative record.
    """
    path = export_dir / SNAPSHOT_README_FILENAME
    # Single- and double-quoted f-string pieces alternate below only so
    # literal double quotes can appear inside the emitted code example.
    # The revision is truncated to 12 chars for display in the heading;
    # the load snippet (and the manifest) carry the full pin.
    body = (
        f"# HF-snapshot export\n"
        f"\n"
        f"Target: **{spec.key}** ({spec.hf_id} @ {spec.revision[:12]}…)\n"
        f"Adapter version: v{manifest.adapter_version:04d}"
        f"{f' ({manifest.adapter_name})' if manifest.adapter_name else ''}\n"
        f"\n"
        f"## Load this snapshot\n"
        f"\n"
        f"```python\n"
        f"from transformers import AutoModelForImageTextToText, AutoProcessor\n"
        f"from peft import PeftModel\n"
        f"\n"
        f"base = AutoModelForImageTextToText.from_pretrained(\n"
        f'    "{spec.hf_id}", revision="{spec.revision}",\n'
        f")\n"
        f'model = PeftModel.from_pretrained(base, "./adapter")\n'
        f'processor = AutoProcessor.from_pretrained("./processor")\n'
        f"```\n"
        f"\n"
        f"## Why HF snapshot (not GGUF)\n"
        f"\n"
        f"Vision-language converter support in `llama.cpp` is in flux.\n"
        f"A future release adds GGUF export when upstream stabilizes.\n"
    )
    write_text(path, body)
    return path
287
288
289 def _utc_now() -> datetime:
290 return datetime.now(UTC).replace(tzinfo=None, microsecond=0)
291
292
def load_vl_snapshot_manifest(export_dir: Path) -> VlSnapshotManifest:
    """Read + validate `<export_dir>/snapshot_manifest.json`.

    Raises:
        ExportManifestError: if the file is absent, unreadable,
            not valid JSON, or fails schema validation.
    """
    manifest_path = export_dir / SNAPSHOT_MANIFEST_FILENAME
    if not manifest_path.exists():
        raise ExportManifestError(f"missing {manifest_path}")

    # Read + JSON-decode in one guarded step; both failure modes map to
    # the same "cannot parse" refusal.
    try:
        payload = json.loads(manifest_path.read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError) as exc:
        raise ExportManifestError(f"cannot parse {manifest_path}: {exc}") from exc

    # Broad catch is deliberate: any pydantic validation failure becomes
    # the package's own manifest error, with the cause chained.
    try:
        return VlSnapshotManifest.model_validate(payload)
    except Exception as exc:
        raise ExportManifestError(f"{manifest_path} has invalid shape: {exc}") from exc
306
307
def verify_artifacts(export_dir: Path, manifest: VlSnapshotManifest) -> None:
    """Re-hash each declared artifact and raise on mismatch.

    Used by a downstream `dlm verify` pass to make sure the snapshot
    wasn't truncated in transit. Cheap — the snapshot is adapter-size,
    not base-size.

    Raises:
        ExportManifestError: on a missing file or a sha256 mismatch.
    """
    for declared in manifest.artifacts:
        candidate = export_dir / declared.path
        if not candidate.exists():
            raise ExportManifestError(f"missing declared artifact: {candidate}")
        digest = compute_sha256(candidate)
        if digest == declared.sha256:
            continue
        raise ExportManifestError(
            f"sha256 mismatch for {declared.path}: "
            f"manifest={declared.sha256[:12]}… disk={digest[:12]}…"
        )