"""HF-snapshot export for vision-language bases.

GGUF conversion for VL architectures is in flux upstream
(`convert_hf_to_gguf.py` doesn't register PaliGemma / Qwen2-VL /
InternVL2 consistently yet), so this path refuses to emit GGUF and
writes an HF snapshot instead: a self-contained directory that a
downstream user can load with `AutoProcessor.from_pretrained` +
`AutoModelForImageTextToText.from_pretrained` + `PeftModel.from_pretrained`.

Layout under `exports/hf-snapshot/`:

    adapter/                # PEFT adapter dir (copy of the current store adapter)
    processor/              # processor config + tokenizer files
    snapshot_manifest.json  # see VlSnapshotManifest
    README.md               # how-to-load instructions for the recipient

The BASE model weights are NOT copied — they're pinned by `hf_id`
+ `revision` in the manifest. Recipients download the base once,
keyed on that revision. Bundling the base would balloon pack size
to 6 GB+ and conflicts with Gemma / Llama licensing for most VL
bases (they are `redistributable=False`).

The `export_target` field on the manifest is the load-bearing flag:
a future GGUF path will write `export_target="gguf"` instead.
"""
| 26 |
|
| 27 |
from __future__ import annotations |
| 28 |
|
| 29 |
import json |
| 30 |
import shutil |
| 31 |
from dataclasses import dataclass |
| 32 |
from datetime import UTC, datetime |
| 33 |
from pathlib import Path |
| 34 |
from typing import TYPE_CHECKING, Literal |
| 35 |
|
| 36 |
from pydantic import BaseModel, ConfigDict, Field |
| 37 |
|
| 38 |
from dlm.export.errors import ExportError, ExportManifestError |
| 39 |
from dlm.export.manifest import ExportArtifact, build_artifact, compute_sha256 |
| 40 |
from dlm.io.atomic import write_text |
| 41 |
|
| 42 |
if TYPE_CHECKING: |
| 43 |
from dlm.base_models import BaseModelSpec |
| 44 |
from dlm.store.paths import StorePath |
| 45 |
|
| 46 |
# Subdirectory of `store.exports` that holds the whole snapshot layout.
VL_SNAPSHOT_SUBDIR = "hf-snapshot"
# Machine-readable export record at the root of the export dir (VlSnapshotManifest).
SNAPSHOT_MANIFEST_FILENAME = "snapshot_manifest.json"
# Human-readable load instructions written next to the manifest.
SNAPSHOT_README_FILENAME = "README.md"
| 49 |
|
| 50 |
|
| 51 |
class VlSnapshotManifest(BaseModel):
    """Self-describing record of one HF-snapshot export.

    Parallel to `ExportManifest` but scoped to the VL path. No
    `quant` / `llama_cpp_tag` — the snapshot doesn't run llama.cpp.
    `export_target` is the discriminator: a future GGUF path adds a
    `"gguf"` branch to the same file when upstream converter support
    lands.
    """

    # extra="forbid": unknown keys in a loaded manifest are a hard
    # validation error; frozen=True: instances are immutable.
    model_config = ConfigDict(extra="forbid", frozen=True)

    # Discriminator for the export flavor; a future GGUF path writes "gguf".
    export_target: Literal["hf_snapshot"] = "hf_snapshot"
    # Wall-clock creation time (naive UTC, see `_utc_now`).
    created_at: datetime
    created_by: str = Field(..., description="dlm version that wrote this manifest.")
    # Base-model pin: weights are NOT bundled — recipients download the
    # base keyed on hf_id + revision.
    base_model_hf_id: str
    base_model_revision: str
    base_model_architecture: str
    modality: Literal["vision-language"] = "vision-language"
    # Preprocessor facts copied from the spec's `vl_preprocessor_plan`.
    image_token: str
    num_image_tokens: int
    target_size: tuple[int, int]
    # Adapter provenance; version is cosmetic 1 for non-`vNNNN` dirs.
    adapter_version: int = Field(..., ge=1)
    adapter_name: str | None = None
    rationale: str = Field(
        default=(
            "Vision-language architectures in llama.cpp are in flux; "
            "this build emits an HF-snapshot fallback. A future release "
            "will add a GGUF path when upstream converter support "
            "stabilizes."
        ),
    )
    # Hash-addressed inventory of every file in the export tree
    # (excluding the manifest and README themselves).
    artifacts: list[ExportArtifact] = Field(default_factory=list)
| 84 |
|
| 85 |
|
| 86 |
@dataclass(frozen=True)
class VlSnapshotResult:
    """Return value of `run_vl_snapshot_export` — what the CLI prints."""

    # Root of the export tree: `<store.exports>/hf-snapshot/`.
    export_dir: Path
    # `snapshot_manifest.json` under export_dir.
    manifest_path: Path
    # `README.md` under export_dir.
    readme_path: Path
    # Copy of the PEFT adapter dir (`adapter/`).
    adapter_dir: Path
    # `processor/` path — NOTE: reported even when no processor was
    # saved (processor=None skips the mkdir + save).
    processor_dir: Path
    # Every file included in the manifest's artifact inventory.
    artifacts: list[Path]
| 96 |
|
| 97 |
|
| 98 |
def run_vl_snapshot_export(
    store: StorePath,
    spec: BaseModelSpec,
    *,
    adapter_name: str | None = None,
    adapter_path_override: Path | None = None,
    dlm_version: str = "dlm-0",
    processor: object | None = None,
) -> VlSnapshotResult:
    """Emit a VL HF-snapshot export under `exports/hf-snapshot/`.

    Resolves the adapter dir, copies it into the export directory,
    saves the processor (if supplied) under `processor/`, writes the
    manifest + README, and returns the layout paths.

    `processor=None` lets callers skip the processor save (tests, dry
    runs). Production paths pass an `AutoProcessor` loaded via
    `dlm.train.loader.load_processor`.

    Raises:
        ExportError: if `spec` is not vision-language, lacks a
            `vl_preprocessor_plan`, or no adapter can be resolved.
    """
    if spec.modality != "vision-language":
        raise ExportError(
            f"run_vl_snapshot_export: {spec.key!r} is modality={spec.modality!r}; "
            "only vision-language bases go through the HF-snapshot path"
        )
    if spec.vl_preprocessor_plan is None:
        raise ExportError(
            f"run_vl_snapshot_export: {spec.key!r} has modality='vision-language' "
            "but no vl_preprocessor_plan (this is a schema bug — file an issue)"
        )

    adapter_path, adapter_version = _resolve_adapter_for_export(
        store=store,
        adapter_name=adapter_name,
        adapter_path_override=adapter_path_override,
    )

    export_dir = store.exports / VL_SNAPSHOT_SUBDIR
    export_dir.mkdir(parents=True, exist_ok=True)

    adapter_out = export_dir / "adapter"
    _copy_adapter_dir(adapter_path, adapter_out)

    processor_out = export_dir / "processor"
    if processor is not None:
        processor_out.mkdir(parents=True, exist_ok=True)
        # HF processors implement `save_pretrained(dir)`. The method
        # writes tokenizer + image_processor configs + the processor
        # config itself — everything a recipient needs to re-hydrate.
        save = getattr(processor, "save_pretrained", None)
        if callable(save):
            save(str(processor_out))

    # Inventory every file for the manifest, excluding only the
    # TOP-LEVEL manifest/README (stale copies from a previous export).
    # Excluding by bare `path.name` would also wrongly drop nested
    # files — e.g. the README.md that PEFT writes inside `adapter/` —
    # from the integrity manifest, so compare full paths instead.
    excluded = {
        export_dir / SNAPSHOT_MANIFEST_FILENAME,
        export_dir / SNAPSHOT_README_FILENAME,
    }
    artifacts: list[Path] = [
        path
        for path in sorted(export_dir.rglob("*"))
        if path.is_file() and path not in excluded
    ]

    manifest = VlSnapshotManifest(
        created_at=_utc_now(),
        created_by=dlm_version,
        base_model_hf_id=spec.hf_id,
        base_model_revision=spec.revision,
        base_model_architecture=spec.architecture,
        image_token=spec.vl_preprocessor_plan.image_token,
        num_image_tokens=spec.vl_preprocessor_plan.num_image_tokens,
        target_size=spec.vl_preprocessor_plan.target_size,
        adapter_version=adapter_version,
        adapter_name=adapter_name,
        artifacts=[build_artifact(export_dir, p) for p in artifacts],
    )
    manifest_path = _save_manifest(export_dir, manifest)
    readme_path = _write_readme(export_dir, spec=spec, manifest=manifest)

    return VlSnapshotResult(
        export_dir=export_dir,
        manifest_path=manifest_path,
        readme_path=readme_path,
        adapter_dir=adapter_out,
        processor_dir=processor_out,
        artifacts=artifacts,
    )
| 182 |
|
| 183 |
|
| 184 |
# --- internals --------------------------------------------------------------- |
| 185 |
|
| 186 |
|
| 187 |
def _resolve_adapter_for_export(
    *,
    store: StorePath,
    adapter_name: str | None,
    adapter_path_override: Path | None,
) -> tuple[Path, int]:
    """Return (adapter_dir, version) for the export.

    Mirrors the dispatch logic in `run_export` but minus the GGUF-
    specific concerns. Extracted so unit tests can exercise the
    refusal messages without touching the rest of the pipeline.
    """
    # An explicit path override wins outright — no store lookup at all.
    if adapter_path_override is not None:
        if adapter_path_override.exists():
            return adapter_path_override, _version_from_dir_name(adapter_path_override)
        raise ExportError(f"adapter_path_override {adapter_path_override} does not exist")

    # Otherwise follow the store's "current" pointer: the default one,
    # or the named adapter's.
    if adapter_name is not None:
        candidate = store.resolve_current_adapter_for(adapter_name)
        pointer = store.adapter_current_pointer_for(adapter_name)
    else:
        candidate = store.resolve_current_adapter()
        pointer = store.adapter_current_pointer

    if candidate is None or not candidate.exists():
        raise ExportError(f"no current adapter under {pointer}; run `dlm train` before exporting.")
    return candidate, _version_from_dir_name(candidate)
| 214 |
|
| 215 |
|
| 216 |
def _version_from_dir_name(path: Path) -> int: |
| 217 |
"""Parse the `vNNNN` suffix on an adapter version directory name.""" |
| 218 |
stem = path.name |
| 219 |
if not stem.startswith("v") or not stem[1:].isdigit(): |
| 220 |
# Ephemeral merged-adapter dir won't match vNNNN; degrade to 1 |
| 221 |
# rather than refuse — the export still works, the version is |
| 222 |
# just cosmetic in the manifest. |
| 223 |
return 1 |
| 224 |
return int(stem[1:]) |
| 225 |
|
| 226 |
|
| 227 |
def _copy_adapter_dir(src: Path, dst: Path) -> None:
    """Copy the PEFT adapter directory into the export tree.

    Overwrites: an existing `dst` from a previous export is removed
    with `shutil.rmtree` before the fresh `copytree`, so repeat
    exports never mix files from two different adapter versions.
    (The old docstring claimed a `dirs_exist_ok=False` refusal; the
    code has always replaced the directory instead.)
    """
    if dst.exists():
        shutil.rmtree(dst)
    shutil.copytree(src, dst)
| 237 |
|
| 238 |
|
| 239 |
def _save_manifest(export_dir: Path, manifest: VlSnapshotManifest) -> Path:
    """Serialize the manifest to `<export_dir>/snapshot_manifest.json`.

    Keys are sorted and the file ends with a newline so repeated
    exports produce byte-stable output. Returns the written path.
    """
    target = export_dir / SNAPSHOT_MANIFEST_FILENAME
    serialized = json.dumps(manifest.model_dump(mode="json"), sort_keys=True, indent=2)
    write_text(target, serialized + "\n")
    return target
| 245 |
|
| 246 |
|
| 247 |
def _write_readme(
    export_dir: Path,
    *,
    spec: BaseModelSpec,
    manifest: VlSnapshotManifest,
) -> Path:
    """Write a human-readable load-instruction file.

    Recipients of the snapshot directory use this to understand what's
    in the tarball without opening the manifest JSON. Kept terse; the
    manifest is the authoritative record.

    Returns the path of the README that was written.
    """
    path = export_dir / SNAPSHOT_README_FILENAME
    # Markdown body. The revision is truncated to 12 chars only in the
    # human-readable header; the load snippet pins the FULL revision so
    # the recipient's base download matches the adapter's training base.
    body = (
        f"# HF-snapshot export\n"
        f"\n"
        f"Target: **{spec.key}** ({spec.hf_id} @ {spec.revision[:12]}…)\n"
        f"Adapter version: v{manifest.adapter_version:04d}"
        f"{f' ({manifest.adapter_name})' if manifest.adapter_name else ''}\n"
        f"\n"
        f"## Load this snapshot\n"
        f"\n"
        f"```python\n"
        f"from transformers import AutoModelForImageTextToText, AutoProcessor\n"
        f"from peft import PeftModel\n"
        f"\n"
        f"base = AutoModelForImageTextToText.from_pretrained(\n"
        f' "{spec.hf_id}", revision="{spec.revision}",\n'
        f")\n"
        f'model = PeftModel.from_pretrained(base, "./adapter")\n'
        f'processor = AutoProcessor.from_pretrained("./processor")\n'
        f"```\n"
        f"\n"
        f"## Why HF snapshot (not GGUF)\n"
        f"\n"
        f"Vision-language converter support in `llama.cpp` is in flux.\n"
        f"A future release adds GGUF export when upstream stabilizes.\n"
    )
    write_text(path, body)
    return path
| 287 |
|
| 288 |
|
| 289 |
def _utc_now() -> datetime: |
| 290 |
return datetime.now(UTC).replace(tzinfo=None, microsecond=0) |
| 291 |
|
| 292 |
|
| 293 |
def load_vl_snapshot_manifest(export_dir: Path) -> VlSnapshotManifest:
    """Read + validate `<export_dir>/snapshot_manifest.json`.

    Raises ExportManifestError if the file is absent, unreadable,
    not JSON, or fails schema validation.
    """
    manifest_file = export_dir / SNAPSHOT_MANIFEST_FILENAME
    if not manifest_file.exists():
        raise ExportManifestError(f"missing {manifest_file}")
    try:
        raw = manifest_file.read_text(encoding="utf-8")
        payload = json.loads(raw)
    except (OSError, json.JSONDecodeError) as exc:
        raise ExportManifestError(f"cannot parse {manifest_file}: {exc}") from exc
    try:
        return VlSnapshotManifest.model_validate(payload)
    except Exception as exc:  # boundary wrap: pydantic errors -> project error type
        raise ExportManifestError(f"{manifest_file} has invalid shape: {exc}") from exc
| 306 |
|
| 307 |
|
| 308 |
def verify_artifacts(export_dir: Path, manifest: VlSnapshotManifest) -> None:
    """Re-hash each declared artifact and raise on mismatch.

    Used by a downstream `dlm verify` pass to make sure the snapshot
    wasn't truncated in transit. Cheap — the snapshot is adapter-size,
    not base-size.
    """
    for declared in manifest.artifacts:
        target = export_dir / declared.path
        if not target.exists():
            raise ExportManifestError(f"missing declared artifact: {target}")
        digest = compute_sha256(target)
        if digest == declared.sha256:
            continue
        raise ExportManifestError(
            f"sha256 mismatch for {declared.path}: "
            f"manifest={declared.sha256[:12]}… disk={digest[:12]}…"
        )