Add Mixtral MoE registry entry
- SHA: 49601e0e277143e8784f7498eb94852ee6db40d9
- Parents: 7635122
- Tree: 7c4275b
| Status | File | + | - |
|---|---|---|---|
| M | src/dlm/base_models/registry.py | 29 | 0 |
| M | src/dlm/base_models/resolver.py | 2 | 1 |
| M | tests/unit/base_models/test_registry.py | 1 | 0 |
| M | tests/unit/base_models/test_registry_2026.py | 27 | 0 |
| M | tests/unit/base_models/test_resolver_hf_escape.py | 2 | 0 |
src/dlm/base_models/registry.py (modified)

```diff
@@ -401,6 +401,35 @@ _ENTRIES: tuple[BaseModelSpec, ...] = (
         recommended_seq_len=2048,
         reasoning_tuned=True,
     ),
+    # Mixtral-8x7B-Instruct-v0.1 — Apache-2.0 sparse MoE base.
+    #
+    # HF exposes this as `MixtralForCausalLM`, but the current vendored
+    # llama.cpp converter routes it through the Llama path rather than a
+    # distinct Mixtral architecture class. We therefore keep
+    # `gguf_arch="llama"` while marking the modality as `text-moe` so
+    # DLM's gate substrate can detect the sparse-MoE family explicitly.
+    BaseModelSpec(
+        key="mixtral-8x7b-instruct",
+        hf_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
+        # Placeholder SHA: format-valid, not a real HF commit. The
+        # weekly `scripts/refresh-registry.py --check` run surfaces
+        # drift and prints the live value for manual review.
+        revision="bc0deffedcba0987654321abc2d3e4f5a6b7c8d9",
+        architecture="MixtralForCausalLM",
+        params=46_700_000_000,
+        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+        template="mistral",
+        gguf_arch="llama",
+        tokenizer_pre="llama-bpe",
+        license_spdx="Apache-2.0",
+        license_url="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1",
+        requires_acceptance=False,
+        redistributable=True,
+        size_gb_fp16=93.4,
+        context_length=32_768,
+        recommended_seq_len=2048,
+        modality="text-moe",
+    ),
     # Mistral Small 3.1 24B Instruct — Apache-2.0 multimodal base with
     # native vision support and 128k context.
     #
```
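The `modality="text-moe"` marker is what lets downstream code distinguish Mixtral's sparse-MoE routing from the dense Mistral entries that share the same `gguf_arch`. Below is a minimal sketch of how a consumer could branch on that flag; the import path and the `is_sparse_moe` helper are illustrative assumptions, not part of this change.

```python
# Hypothetical consumer of the new registry entry. BASE_MODELS and the spec
# fields appear in the diff above; the helper and import path are assumed.
from dlm.base_models.registry import BASE_MODELS  # assumed import path


def is_sparse_moe(key: str) -> bool:
    """True when a registry entry is tagged as a sparse-MoE text model."""
    spec = BASE_MODELS[key]
    return spec.modality == "text-moe"


# The Mixtral key added in this commit should report as sparse MoE.
assert is_sparse_moe("mixtral-8x7b-instruct")
```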
src/dlm/base_models/resolver.py (modified)

```diff
@@ -238,6 +238,7 @@ def _infer_gguf_arch(architecture: str) -> str:
         "Qwen2ForCausalLM": "qwen2",
         "Qwen3ForCausalLM": "qwen3",
         "MistralForCausalLM": "llama",
+        "MixtralForCausalLM": "llama",
         "Phi3ForCausalLM": "phi3",
         "GemmaForCausalLM": "gemma",
         "Gemma2ForCausalLM": "gemma2",
@@ -260,7 +261,7 @@ def _infer_template(hf_id: str, architecture: str) -> TemplateDialect:
         return "phi4mini"
     if architecture.startswith("Phi"):
         return "phi3"
-    if architecture.startswith("Mistral"):
+    if architecture.startswith(("Mistral", "Mixtral")):
         return "mistral"
     return "chatml"
 
```
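Both resolver hunks are small, so the surrounding functions are easy to misread from the hunk fragments alone. The sketch below compresses them into a self-contained form; the dict name, the fallback value, and the omitted branches (for example the `phi4mini` special case on `hf_id`) are simplifications, not the file's actual layout.

```python
# Compressed sketch of the two resolver helpers after this change. Only the
# Mixtral-related lines are verbatim from the diff; names and fallbacks are assumed.
_ARCH_TO_GGUF = {
    "Qwen2ForCausalLM": "qwen2",
    "Qwen3ForCausalLM": "qwen3",
    "MistralForCausalLM": "llama",
    "MixtralForCausalLM": "llama",  # new: MoE checkpoints convert via the Llama path
    "Phi3ForCausalLM": "phi3",
}


def _infer_gguf_arch(architecture: str) -> str:
    # The real function may handle unknown architectures differently;
    # this sketch simply falls back to "llama".
    return _ARCH_TO_GGUF.get(architecture, "llama")


def _infer_template(hf_id: str, architecture: str) -> str:
    if architecture.startswith("Phi"):
        return "phi3"
    # Mixtral reuses Mistral's chat template, hence the tuple prefix check.
    if architecture.startswith(("Mistral", "Mixtral")):
        return "mistral"
    return "chatml"
```

The tuple form of `str.startswith` returns True when any listed prefix matches, so one condition now covers both the dense Mistral and sparse Mixtral families.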
tests/unit/base_models/test_registry.py (modified)

```diff
@@ -75,6 +75,7 @@ class TestLicenseFields:
         "qwen3-1.7b",
         "qwen3-4b",
         "qwen3-8b",
+        "mixtral-8x7b-instruct",
         "smollm3-3b",
         "olmo-2-7b-instruct",
         "smollm2-135m",
```
tests/unit/base_models/test_registry_2026.py (modified)

```diff
@@ -183,3 +183,30 @@ class TestPhi4MiniReasoningRegistryEntry:
         assert spec.size_gb_fp16 == pytest.approx(7.6)
         assert spec.context_length == 131_072
         assert spec.recommended_seq_len == 2048
+
+
+class TestMixtralRegistryEntry:
+    def test_entry_present(self) -> None:
+        assert "mixtral-8x7b-instruct" in BASE_MODELS
+
+    def test_entry_is_open_sparse_moe(self) -> None:
+        spec = BASE_MODELS["mixtral-8x7b-instruct"]
+        assert spec.license_spdx == "Apache-2.0"
+        assert spec.requires_acceptance is False
+        assert spec.redistributable is True
+        assert spec.modality == "text-moe"
+
+    def test_entry_matches_live_family_shape(self) -> None:
+        spec = BASE_MODELS["mixtral-8x7b-instruct"]
+        assert spec.hf_id == "mistralai/Mixtral-8x7B-Instruct-v0.1"
+        assert spec.architecture == "MixtralForCausalLM"
+        assert spec.template == "mistral"
+        assert spec.gguf_arch == "llama"
+        assert spec.tokenizer_pre == "llama-bpe"
+
+    def test_entry_uses_total_parameter_and_context_hints(self) -> None:
+        spec = BASE_MODELS["mixtral-8x7b-instruct"]
+        assert spec.params == 46_700_000_000
+        assert spec.size_gb_fp16 == pytest.approx(93.4)
+        assert spec.context_length == 32_768
+        assert spec.recommended_seq_len == 2048
```
tests/unit/base_models/test_resolver_hf_escape.py (modified)

```diff
@@ -61,6 +61,7 @@ class TestInferGgufArch:
         ("Qwen2ForCausalLM", "qwen2"),
         ("Qwen3ForCausalLM", "qwen3"),
         ("MistralForCausalLM", "llama"),
+        ("MixtralForCausalLM", "llama"),
         ("Phi3ForCausalLM", "phi3"),
         ("GemmaForCausalLM", "gemma"),
         ("Gemma2ForCausalLM", "gemma2"),
@@ -86,6 +87,7 @@ class TestInferTemplate:
         ("microsoft/Phi-4-mini-reasoning", "Phi3ForCausalLM", "phi4mini"),
         ("microsoft/Phi-3.5-mini-instruct", "Phi3ForCausalLM", "phi3"),
         ("mistralai/Mistral-7B-Instruct", "MistralForCausalLM", "mistral"),
+        ("mistralai/Mixtral-8x7B-Instruct-v0.1", "MixtralForCausalLM", "mistral"),
         ("Qwen/Qwen2.5-1.5B-Instruct", "Qwen2ForCausalLM", "chatml"),
     ],
 )
```
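For reference, the new tuples plug into the existing parametrized tests roughly as sketched below; the parameter names, decorator layout, and import path are assumptions about the file's existing structure, not something this commit changes.

```python
# Plausible shape of the parametrized test that consumes the new tuple; the
# real test_resolver_hf_escape.py may name its parameters differently.
import pytest

from dlm.base_models.resolver import _infer_template  # assumed import path


class TestInferTemplate:
    @pytest.mark.parametrize(
        ("hf_id", "architecture", "expected"),
        [
            ("mistralai/Mistral-7B-Instruct", "MistralForCausalLM", "mistral"),
            ("mistralai/Mixtral-8x7B-Instruct-v0.1", "MixtralForCausalLM", "mistral"),
        ],
    )
    def test_template_for_architecture(
        self, hf_id: str, architecture: str, expected: str
    ) -> None:
        # Each tuple from the parametrize list above is checked independently.
        assert _infer_template(hf_id, architecture) == expected
```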