1 """Curated launch registry of supported base models.
2
3 Every entry pins an exact HuggingFace commit SHA. Refreshed by
4 `scripts/refresh-registry.py`; weekly CI opens a PR on drift.
5
6 Notes on individual entries:
7
8 - `qwen2.5-3b` ships under the Qwen Research License (free for entities
9 with <100M MAU). We record it as `license_spdx="Other"` and surface
10 the URL via `license_url`; it remains `redistributable=True` because
11 the license permits bundling + redistribution with attribution.
12 **Caveat:** the boolean `redistributable` field does not express the
13 MAU threshold or attribution requirement. A
14 `redistributable_conditions: str | None` field on `BaseModelSpec`
15 plus a pack-time attestation checkbox would encode this properly —
16 deferred follow-up work. Until then, users at the scale threshold
17 must consult the license text themselves.
18 - Llama-3.2 models are gated on HuggingFace. Llama-3.3 8B currently
19 needs a mirror-backed fetch path because Meta exposes it through the
20 Llama API but not a first-party HF repo. DLM still keeps the same
21 acceptance + non-redistribution policy surface for the whole Llama
22 family (`requires_acceptance=True`, `redistributable=False`) —
23 enforced by the pack gate and share-protocol refusal.
24 - SmolLM2 / SmolLM3 and Phi-3.5-mini are permissive (Apache-2.0 / MIT).
25 - `size_gb_fp16` is approximate; the hardware doctor uses it to seed
26 VRAM estimates, which then get refined by runtime checks.
27 """

from __future__ import annotations

from typing import Final

from dlm.base_models.schema import AudioPreprocessorPlan, BaseModelSpec, VlPreprocessorPlan

_ENTRIES: tuple[BaseModelSpec, ...] = (
    BaseModelSpec(
        key="qwen2.5-0.5b",
        hf_id="Qwen/Qwen2.5-0.5B-Instruct",
        revision="7ae557604adf67be50417f59c2c2f167def9a775",
        architecture="Qwen2ForCausalLM",
        params=500_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="chatml",
        gguf_arch="qwen2",
        tokenizer_pre="qwen2",
        license_spdx="Apache-2.0",
        license_url="https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct/blob/main/LICENSE",
        requires_acceptance=False,
        redistributable=True,
        size_gb_fp16=1.0,
        context_length=32_768,
        recommended_seq_len=2048,
    ),
    BaseModelSpec(
        key="qwen2.5-1.5b",
        hf_id="Qwen/Qwen2.5-1.5B-Instruct",
        revision="989aa7980e4cf806f80c7fef2b1adb7bc71aa306",
        architecture="Qwen2ForCausalLM",
        params=1_500_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="chatml",
        gguf_arch="qwen2",
        tokenizer_pre="qwen2",
        license_spdx="Apache-2.0",
        license_url="https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct/blob/main/LICENSE",
        requires_acceptance=False,
        redistributable=True,
        size_gb_fp16=3.1,
        context_length=32_768,
        recommended_seq_len=2048,
    ),
    BaseModelSpec(
        key="qwen2.5-3b",
        hf_id="Qwen/Qwen2.5-3B-Instruct",
        revision="aa8e72537993ba99e69dfaafa59ed015b17504d1",
        architecture="Qwen2ForCausalLM",
        params=3_000_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="chatml",
        gguf_arch="qwen2",
        tokenizer_pre="qwen2",
        license_spdx="Other",
        license_url="https://huggingface.co/Qwen/Qwen2.5-3B-Instruct/blob/main/LICENSE",
        requires_acceptance=False,
        redistributable=True,
        size_gb_fp16=6.2,
        context_length=32_768,
        recommended_seq_len=2048,
    ),
    BaseModelSpec(
        key="qwen2.5-coder-1.5b",
        hf_id="Qwen/Qwen2.5-Coder-1.5B-Instruct",
        revision="2e1fd397ee46e1388853d2af2c993145b0f1098a",
        architecture="Qwen2ForCausalLM",
        params=1_500_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="chatml",
        gguf_arch="qwen2",
        tokenizer_pre="qwen2",
        license_spdx="Apache-2.0",
        license_url="https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct/blob/main/LICENSE",
        requires_acceptance=False,
        redistributable=True,
        size_gb_fp16=3.1,
        context_length=32_768,
        recommended_seq_len=2048,
    ),
    BaseModelSpec(
        key="qwen3-1.7b",
        hf_id="Qwen/Qwen3-1.7B",
        revision="70d244cc86ccca08cf5af4e1e306ecf908b1ad5e",
        architecture="Qwen3ForCausalLM",
        params=1_700_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="chatml",
        gguf_arch="qwen3",
        tokenizer_pre="qwen2",
        license_spdx="Apache-2.0",
        license_url="https://huggingface.co/Qwen/Qwen3-1.7B/blob/main/LICENSE",
        requires_acceptance=False,
        redistributable=True,
        size_gb_fp16=3.4,
        context_length=32_768,
        recommended_seq_len=2048,
        reasoning_tuned=True,
    ),
    BaseModelSpec(
        key="qwen3-1.7b-thinking",
        hf_id="Qwen/Qwen3-1.7B",
        revision="70d244cc86ccca08cf5af4e1e306ecf908b1ad5e",
        architecture="Qwen3ForCausalLM",
        params=1_700_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="qwen3thinking",
        gguf_arch="qwen3",
        tokenizer_pre="qwen2",
        license_spdx="Apache-2.0",
        license_url="https://huggingface.co/Qwen/Qwen3-1.7B/blob/main/LICENSE",
        requires_acceptance=False,
        redistributable=True,
        size_gb_fp16=3.4,
        context_length=32_768,
        recommended_seq_len=2048,
        reasoning_tuned=True,
    ),
    BaseModelSpec(
        key="qwen3-4b",
        hf_id="Qwen/Qwen3-4B",
        revision="1cfa9a7208912126459214e8b04321603b3df60c",
        architecture="Qwen3ForCausalLM",
        params=4_000_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="chatml",
        gguf_arch="qwen3",
        tokenizer_pre="qwen2",
        license_spdx="Apache-2.0",
        license_url="https://huggingface.co/Qwen/Qwen3-4B/blob/main/LICENSE",
        requires_acceptance=False,
        redistributable=True,
        size_gb_fp16=8.0,
        context_length=32_768,
        recommended_seq_len=2048,
        reasoning_tuned=True,
    ),
    BaseModelSpec(
        key="qwen3-8b",
        hf_id="Qwen/Qwen3-8B",
        revision="b968826d9c46dd6066d109eabc6255188de91218",
        architecture="Qwen3ForCausalLM",
        params=8_000_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="chatml",
        gguf_arch="qwen3",
        tokenizer_pre="qwen2",
        license_spdx="Apache-2.0",
        license_url="https://huggingface.co/Qwen/Qwen3-8B/blob/main/LICENSE",
        requires_acceptance=False,
        redistributable=True,
        size_gb_fp16=16.0,
        context_length=32_768,
        recommended_seq_len=2048,
        reasoning_tuned=True,
    ),
    BaseModelSpec(
        key="llama-3.2-1b",
        hf_id="meta-llama/Llama-3.2-1B-Instruct",
        revision="9213176726f574b556790deb65791e0c5aa438b6",
        architecture="LlamaForCausalLM",
        params=1_000_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="llama3",
        gguf_arch="llama",
        tokenizer_pre="llama-bpe",
        license_spdx="Other",
        license_url="https://www.llama.com/llama3_2/license/",
        requires_acceptance=True,
        redistributable=False,
        size_gb_fp16=2.5,
        context_length=131_072,
        recommended_seq_len=4096,
    ),
    BaseModelSpec(
        key="llama-3.2-3b",
        hf_id="meta-llama/Llama-3.2-3B-Instruct",
        revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
        architecture="LlamaForCausalLM",
        params=3_000_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="llama3",
        gguf_arch="llama",
        tokenizer_pre="llama-bpe",
        license_spdx="Other",
        license_url="https://www.llama.com/llama3_2/license/",
        requires_acceptance=True,
        redistributable=False,
        size_gb_fp16=6.5,
        context_length=131_072,
        recommended_seq_len=4096,
    ),
    BaseModelSpec(
        key="llama-3.3-8b-instruct",
        # Meta's first-party LlamaCon announcement explicitly says the
        # Llama API can fine-tune "o novo modelo Llama 3.3 8B" ("the
        # new Llama 3.3 8B model"), but there is still no first-party
        # HF repo. DLM therefore fetches weights from the community
        # mirror below, while refresh-registry separately probes Meta's
        # newsroom article for provenance (see the sketch after this
        # entry).
        hf_id="allura-forge/Llama-3.3-8B-Instruct",
        revision="df95224cf87c32d9f4958dd284a07ded620aa4fc",
        architecture="LlamaForCausalLM",
        params=8_000_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="llama3",
        gguf_arch="llama",
        tokenizer_pre="llama-bpe",
        license_spdx="Other",
        license_url="https://llama.meta.com/llama3/license",
        requires_acceptance=True,
        redistributable=False,
        size_gb_fp16=16.5,
        context_length=131_072,
        context_length_effective=8_192,
        recommended_seq_len=4096,
        refresh_check_hf_gating=False,
        provenance_url=(
            "https://about.fb.com/br/news/2025/04/tudo-o-que-anunciamos-no-nosso-primeiro-llamacon/"
        ),
        provenance_match_text="novo modelo Llama 3.3 8B",
    ),
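    # Sketch of that provenance probe (illustrative; the real check lives
    # in scripts/refresh-registry.py, and the variable names here are
    # assumptions):
    #
    #     page = httpx.get(spec.provenance_url, follow_redirects=True).text
    #     if spec.provenance_match_text not in page:
    #         ...  # drift: weekly CI opens a PR for a maintainer to review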
    BaseModelSpec(
        key="smollm3-3b",
        hf_id="HuggingFaceTB/SmolLM3-3B",
        revision="a07cc9a04f16550a088caea529712d1d335b0ac1",
        architecture="SmolLM3ForCausalLM",
        params=3_000_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="smollm3",
        gguf_arch="llama",
        tokenizer_pre="smollm",
        license_spdx="Apache-2.0",
        license_url="https://huggingface.co/HuggingFaceTB/SmolLM3-3B",
        requires_acceptance=False,
        redistributable=True,
        size_gb_fp16=6.2,
        context_length=65_536,
        recommended_seq_len=4096,
        reasoning_tuned=True,
    ),
    BaseModelSpec(
        key="olmo-2-7b-instruct",
        hf_id="allenai/OLMo-2-1124-7B-Instruct",
        revision="470b1fba1ae01581f270116362ee4aa1b97f4c84",
        architecture="Olmo2ForCausalLM",
        params=7_000_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="olmo2",
        gguf_arch="olmo2",
        tokenizer_pre="superbpe",
        license_spdx="Apache-2.0",
        license_url="https://huggingface.co/allenai/OLMo-2-1124-7B-Instruct",
        requires_acceptance=False,
        redistributable=True,
        size_gb_fp16=14.6,
        context_length=4096,
        recommended_seq_len=2048,
    ),
    BaseModelSpec(
        key="gemma-2-2b-it",
        hf_id="google/gemma-2-2b-it",
        revision="299a8560bedf22ed1c72a8a11e7dce4a7f9f51f8",
        architecture="Gemma2ForCausalLM",
        params=2_600_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="gemma2",
        gguf_arch="gemma2",
        tokenizer_pre="gemma",
        license_spdx="Gemma",
        license_url="https://ai.google.dev/gemma/terms",
        requires_acceptance=True,
        redistributable=False,
        size_gb_fp16=5.2,
        context_length=8192,
        recommended_seq_len=2048,
    ),
    BaseModelSpec(
        key="gemma-2-9b-it",
        hf_id="google/gemma-2-9b-it",
        revision="11c9b309abf73637e4b6f9a3fa1e92e615547819",
        architecture="Gemma2ForCausalLM",
        params=9_000_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="gemma2",
        gguf_arch="gemma2",
        tokenizer_pre="gemma",
        license_spdx="Gemma",
        license_url="https://ai.google.dev/gemma/terms",
        requires_acceptance=True,
        redistributable=False,
        size_gb_fp16=18.0,
        context_length=8192,
        recommended_seq_len=2048,
    ),
    BaseModelSpec(
        key="smollm2-135m",
        hf_id="HuggingFaceTB/SmolLM2-135M-Instruct",
        revision="12fd25f77366fa6b3b4b768ec3050bf629380bac",
        architecture="LlamaForCausalLM",
        params=135_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="chatml",
        gguf_arch="llama",
        tokenizer_pre="smollm",
        license_spdx="Apache-2.0",
        license_url="https://huggingface.co/HuggingFaceTB/SmolLM2-135M-Instruct",
        requires_acceptance=False,
        redistributable=True,
        size_gb_fp16=0.27,
        context_length=8_192,
        recommended_seq_len=1024,
    ),
    BaseModelSpec(
        key="smollm2-360m",
        hf_id="HuggingFaceTB/SmolLM2-360M-Instruct",
        revision="a10cc1512eabd3dde888204e902eca88bddb4951",
        architecture="LlamaForCausalLM",
        params=360_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="chatml",
        gguf_arch="llama",
        tokenizer_pre="smollm",
        license_spdx="Apache-2.0",
        license_url="https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct",
        requires_acceptance=False,
        redistributable=True,
        size_gb_fp16=0.72,
        context_length=8_192,
        recommended_seq_len=1024,
    ),
    BaseModelSpec(
        key="smollm2-1.7b",
        hf_id="HuggingFaceTB/SmolLM2-1.7B-Instruct",
        revision="31b70e2e869a7173562077fd711b654946d38674",
        architecture="LlamaForCausalLM",
        params=1_700_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="chatml",
        gguf_arch="llama",
        tokenizer_pre="smollm",
        license_spdx="Apache-2.0",
        license_url="https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct",
        requires_acceptance=False,
        redistributable=True,
        size_gb_fp16=3.4,
        context_length=8_192,
        recommended_seq_len=2048,
    ),
    BaseModelSpec(
        key="phi-3.5-mini",
        hf_id="microsoft/Phi-3.5-mini-instruct",
        revision="2fe192450127e6a83f7441aef6e3ca586c338b77",
        architecture="Phi3ForCausalLM",
        params=3_800_000_000,
        target_modules=["qkv_proj", "o_proj", "gate_up_proj", "down_proj"],
        template="phi3",
        gguf_arch="phi3",
        tokenizer_pre="phi-2",
        license_spdx="MIT",
        license_url="https://huggingface.co/microsoft/Phi-3.5-mini-instruct/blob/main/LICENSE",
        requires_acceptance=False,
        redistributable=True,
        size_gb_fp16=7.6,
        context_length=131_072,
        recommended_seq_len=2048,
    ),
    BaseModelSpec(
        key="phi-4-mini-reasoning",
        hf_id="microsoft/Phi-4-mini-reasoning",
        revision="0e3b1e2d02ee478a3743abe3f629e9c0cb722e0a",
        architecture="Phi3ForCausalLM",
        params=3_800_000_000,
        target_modules=["qkv_proj", "o_proj", "gate_up_proj", "down_proj"],
        template="phi4mini",
        gguf_arch="phi3",
        tokenizer_pre="phi-2",
        license_spdx="MIT",
        license_url="https://huggingface.co/microsoft/Phi-4-mini-reasoning/blob/main/LICENSE",
        requires_acceptance=False,
        redistributable=True,
        size_gb_fp16=7.6,
        context_length=131_072,
        recommended_seq_len=2048,
        reasoning_tuned=True,
    ),
    # Mixtral-8x7B-Instruct-v0.1: Apache-2.0 sparse MoE base.
    #
    # HF exposes this as `MixtralForCausalLM`, but the current vendored
    # llama.cpp converter routes it through the Llama path rather than a
    # distinct Mixtral architecture class. We therefore keep
    # `gguf_arch="llama"` while marking the modality as `text-moe` so
    # DLM's gate substrate can detect the sparse-MoE family explicitly.
    BaseModelSpec(
        key="mixtral-8x7b-instruct",
        hf_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
        revision="eba92302a2861cdc0098cc54bc9f17cb2c47eb61",
        architecture="MixtralForCausalLM",
        params=46_700_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="mistral",
        gguf_arch="llama",
        tokenizer_pre="llama-bpe",
        license_spdx="Apache-2.0",
        license_url="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1",
        requires_acceptance=False,
        redistributable=True,
        size_gb_fp16=93.4,
        context_length=32_768,
        recommended_seq_len=2048,
        modality="text-moe",
    ),
    # Mistral Small 3.1 24B Instruct: Apache-2.0 multimodal base with
    # native vision support and 128k context.
    #
    # An earlier draft treated this as text-only; the live HF config
    # is `Mistral3ForConditionalGeneration` with both text and
    # vision towers, so we register it as vision-language. The current
    # processor config pins `[IMG]` as the image placeholder and a
    # longest edge of 1540 px. DLM's current `VlPreprocessorPlan`
    # abstraction is fixed-size only, so we conservatively pin
    # 1540×1540 here until dynamic ranges land.
    BaseModelSpec(
        key="mistral-small-3.1-24b-instruct",
        hf_id="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        revision="68faf511d618ef198fef186659617cfd2eb8e33a",
        architecture="Mistral3ForConditionalGeneration",
        params=24_000_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="mistral",
        gguf_arch="mistral3",
        tokenizer_pre="tekken",
        license_spdx="Apache-2.0",
        license_url="https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503",
        requires_acceptance=False,
        redistributable=True,
        size_gb_fp16=48.0,
        context_length=131_072,
        recommended_seq_len=4096,
        modality="vision-language",
        vl_preprocessor_plan=VlPreprocessorPlan(
            target_size=(1540, 1540),
            resize_policy="fixed",
            image_token="[IMG]",
            num_image_tokens=3025,
        ),
    ),
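    # The 3025 image tokens above are consistent with a 55×55 token grid
    # over the pinned 1540 px edge (1540 / 28 px per token = 55; 55 * 55
    # = 3025). That 28 px stride is inferred from the numbers here, not a
    # documented constant.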
    # --- Vision-language bases ----------------------------------------------
    # PaliGemma-3B-mix-224: Google's instruction-tuned VL base built on
    # Gemma-2B + SigLIP-So400m. Gated under the Gemma license; cannot
    # redistribute inside a `.dlm.pack` (same pattern as Llama-3.2).
    # Training targets Gemma's transformer blocks. The vision tower is
    # trained jointly only when modules_to_save expands to
    # ["embed_tokens", "lm_head"]; the current entry keeps
    # modules_to_save empty, so only the LLM-side LoRA adapters move and
    # the vision tower stays frozen (see the sketch after this entry).
    #
    # `gguf_arch` / `tokenizer_pre` are set to tags the current vendored
    # llama.cpp doesn't recognize; the export probes surface
    # UNSUPPORTED and refuse GGUF conversion until GGUF support lands.
    # HF-snapshot export (`dlm export --hf-snapshot`) still works.
    BaseModelSpec(
        key="paligemma-3b-mix-224",
        hf_id="google/paligemma-3b-mix-224",
        revision="d1d8734c9c3ad0ccfeea4afc270faa356c2ba515",
        architecture="PaliGemmaForConditionalGeneration",
        params=2_900_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="paligemma",
        gguf_arch="paligemma",
        tokenizer_pre="gemma",
        license_spdx="Other",
        license_url="https://ai.google.dev/gemma/terms",
        requires_acceptance=True,
        redistributable=False,
        size_gb_fp16=6.5,
        context_length=8_192,
        recommended_seq_len=2048,
        modality="vision-language",
        vl_preprocessor_plan=VlPreprocessorPlan(
            target_size=(224, 224),
            resize_policy="fixed",
            image_token="<image>",
            num_image_tokens=256,
        ),
    ),
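    # Sketch of the modules_to_save expansion mentioned above (hypothetical;
    # DLM builds its adapter config elsewhere, this is peft-style pseudocode):
    #
    #     LoraConfig(
    #         target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    #         modules_to_save=["embed_tokens", "lm_head"],  # joint training
    #     )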
    # Qwen2-VL-2B-Instruct: Alibaba's Apache-2.0 VL base with dynamic-
    # resolution support in native HF. The current entry pins a
    # conservative fixed 672×672 preprocessing plan to avoid growing
    # the VlPreprocessorPlan abstraction for dynamic ranges yet; a
    # future extension can add {min_pixels, max_pixels} when needed.
    #
    # 672×672 with Qwen2-VL's 28-pixel patch-merger grid yields 24×24 =
    # 576 vision tokens per image. `<|image_pad|>` is the runtime
    # placeholder the processor expands into that window.
    #
    # Apache-2.0 (redistributable, no acceptance). `AutoModelForImageTextToText`
    # handles this arch natively since transformers ≥4.45, the same path
    # PaliGemma loads through.
    BaseModelSpec(
        key="qwen2-vl-2b-instruct",
        hf_id="Qwen/Qwen2-VL-2B-Instruct",
        revision="895c3a49bc3fa70a340399125c650a463535e71c",
        architecture="Qwen2VLForConditionalGeneration",
        params=2_200_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="qwen2-vl",
        gguf_arch="qwen2-vl",
        tokenizer_pre="qwen2",
        license_spdx="Apache-2.0",
        license_url="https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/blob/main/LICENSE",
        requires_acceptance=False,
        redistributable=True,
        size_gb_fp16=4.5,
        context_length=32_768,
        recommended_seq_len=2048,
        modality="vision-language",
        vl_preprocessor_plan=VlPreprocessorPlan(
            target_size=(672, 672),
            resize_policy="fixed",
            image_token="<|image_pad|>",
            num_image_tokens=576,
        ),
    ),
    # InternVL2-2B: OpenGVLab's MIT-licensed 2B VL model. Uses fixed
    # 448×448 input (32×32 patch grid with 2×2 pixel-shuffle → 256
    # vision tokens per image).
    #
    # **Security surface: trust_remote_code=True.** InternVL2's HF
    # integration is `InternVLChatModel`, a custom class defined in
    # `modeling_internvl_chat.py` inside the model repo, not in
    # transformers. Loading it requires executing that repo's code.
    # The loader sets `trust_remote_code=True` when this spec is
    # picked (`trust_remote_code` field below), so selecting this base
    # as `base_model: internvl2-2b` in a .dlm is the user's informed
    # acknowledgment that remote code runs at load time (see the sketch
    # after this entry). The cookbook + vl-memory.md flag this too.
    BaseModelSpec(
        key="internvl2-2b",
        hf_id="OpenGVLab/InternVL2-2B",
        revision="e4f6747bd20f139e637642c6a058c6bd00b36919",
        architecture="InternVLChatModel",
        params=2_200_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="internvl2",
        gguf_arch="internvl2",
        tokenizer_pre="internvl2",
        license_spdx="MIT",
        license_url="https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/LICENSE",
        requires_acceptance=False,
        redistributable=True,
        trust_remote_code=True,
        size_gb_fp16=4.4,
        context_length=8_192,
        recommended_seq_len=2048,
        modality="vision-language",
        vl_preprocessor_plan=VlPreprocessorPlan(
            target_size=(448, 448),
            resize_policy="fixed",
            image_token="<IMG_CONTEXT>",
            num_image_tokens=256,
        ),
    ),
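    # What that flag means at load time (illustrative; the real loader
    # lives in DLM's runtime and this exact call shape is an assumption):
    #
    #     from transformers import AutoModel
    #     AutoModel.from_pretrained(
    #         spec.hf_id, revision=spec.revision, trust_remote_code=True,
    #     )  # executes modeling_internvl_chat.py from the pinned revision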
    BaseModelSpec(
        key="internvl3-2b",
        hf_id="OpenGVLab/InternVL3-2B",
        revision="899155015275a9b7338c7f4677e19c784e0e5a21",
        architecture="InternVLChatModel",
        params=2_000_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="internvl2",
        gguf_arch="internvl3",
        tokenizer_pre="internvl3",
        license_spdx="Apache-2.0",
        license_url="https://huggingface.co/OpenGVLab/InternVL3-2B",
        requires_acceptance=False,
        redistributable=True,
        trust_remote_code=True,
        size_gb_fp16=4.0,
        context_length=32_768,
        recommended_seq_len=2048,
        modality="vision-language",
        vl_preprocessor_plan=VlPreprocessorPlan(
            target_size=(448, 448),
            resize_policy="dynamic",
            image_token="<image>",
            num_image_tokens=256,
        ),
    ),
    # --- Audio-language bases -----------------------------------------------
    # Qwen2-Audio-7B-Instruct: Alibaba's open audio-text model. Uses
    # the Qwen2 LLM backbone + a dedicated audio encoder. Apache-2.0
    # and currently ungated on HF, so the registry keeps it open and
    # redistributable like the other permissive Qwen rows.
    #
    # The 16 kHz pin + 30 s max length match the training-time
    # defaults documented in the Qwen2-Audio card. Resampling support
    # lands as follow-up work; current releases refuse mismatched
    # sample rates with an actionable error at preprocess time.
    #
    # If the pinned SHA is a stale placeholder, the weekly
    # `scripts/refresh-registry.py --check` run surfaces the drift and
    # a maintainer pastes in the real SHA.
    BaseModelSpec(
        key="qwen2-audio-7b-instruct",
        hf_id="Qwen/Qwen2-Audio-7B-Instruct",
        revision="0a095220c30b7b31434169c3086508ef3ea5bf0a",
        architecture="Qwen2AudioForConditionalGeneration",
        params=8_400_000_000,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        template="qwen2-audio",
        gguf_arch="qwen2-audio",
        tokenizer_pre="qwen2",
        license_spdx="Apache-2.0",
        license_url="https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct",
        requires_acceptance=False,
        redistributable=True,
        size_gb_fp16=15.5,
        context_length=8_192,
        recommended_seq_len=2048,
        modality="audio-language",
        audio_preprocessor_plan=AudioPreprocessorPlan(
            sample_rate=16_000,
            max_length_seconds=30.0,
            audio_token="<|AUDIO|>",
            num_audio_tokens=750,
        ),
    ),
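    # 750 audio tokens matches 30 s at 25 tokens/s (consistent with a
    # Whisper-style 50 frames/s encoder followed by 2x pooling); this is
    # inferred from the plan above, not a documented constant.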
)


BASE_MODELS: Final[dict[str, BaseModelSpec]] = {entry.key: entry for entry in _ENTRIES}


def known_keys() -> tuple[str, ...]:
    """Stable ordering for use in error messages / CLI listings."""
    return tuple(BASE_MODELS.keys())
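

# Illustration of how `size_gb_fp16` seeds a VRAM estimate (see the module
# docstring). Hypothetical arithmetic: the real hardware doctor lives
# elsewhere and refines the seed with runtime checks, and the 1.2 headroom
# factor here is an assumption for illustration only:
#
#     spec = BASE_MODELS["qwen2.5-1.5b"]
#     seed_gb = spec.size_gb_fp16 * 1.2  # 3.1 GB weights + ~20% overhead ≈ 3.7 GB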