tenseleyflow/documentlanguagemodel / 49601e0


Add Mixtral MoE registry entry

Authored by espadonne
SHA: 49601e0e277143e8784f7498eb94852ee6db40d9
Parents: 7635122
Tree: 7c4275b

5 changed files

Status  File  +  -
M src/dlm/base_models/registry.py 29 0
M src/dlm/base_models/resolver.py 2 1
M tests/unit/base_models/test_registry.py 1 0
M tests/unit/base_models/test_registry_2026.py 27 0
M tests/unit/base_models/test_resolver_hf_escape.py 2 0
src/dlm/base_models/registry.py (modified)

@@ -401,6 +401,35 @@ _ENTRIES: tuple[BaseModelSpec, ...] = (
         recommended_seq_len=2048,
         reasoning_tuned=True,
     ),
+    # Mixtral-8x7B-Instruct-v0.1 — Apache-2.0 sparse MoE base.
+    #
+    # HF exposes this as `MixtralForCausalLM`, but the current vendored
+    # llama.cpp converter routes it through the Llama path rather than a
+    # distinct Mixtral architecture class. We therefore keep
+    # `gguf_arch="llama"` while marking the modality as `text-moe` so
+    # DLM's gate substrate can detect the sparse-MoE family explicitly.
+    BaseModelSpec(
+        key="mixtral-8x7b-instruct",
+        hf_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
+        # Placeholder SHA: format-valid, not a real HF commit. The
+        # weekly `scripts/refresh-registry.py --check` run surfaces
+        # drift and prints the live value for manual review.
+        revision="bc0deffedcba0987654321abc2d3e4f5a6b7c8d9",
+        architecture="MixtralForCausalLM",
+        params=46_700_000_000,
+        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+        template="mistral",
+        gguf_arch="llama",
+        tokenizer_pre="llama-bpe",
+        license_spdx="Apache-2.0",
+        license_url="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1",
+        requires_acceptance=False,
+        redistributable=True,
+        size_gb_fp16=93.4,
+        context_length=32_768,
+        recommended_seq_len=2048,
+        modality="text-moe",
+    ),
     # Mistral Small 3.1 24B Instruct — Apache-2.0 multimodal base with
     # native vision support and 128k context.
     #
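The pinned `revision` above is explicitly a placeholder, and the in-code comment defers to a weekly `scripts/refresh-registry.py --check` run to surface drift. That script is not part of this commit, so the following is only a minimal sketch of what such a check could look like, assuming the `huggingface_hub` client; the function name and reporting format are illustrative, not the repository's actual code.

# Hypothetical drift check: compare a pinned registry revision against
# the live HEAD commit of the model repo on the Hugging Face Hub.
from huggingface_hub import HfApi

def check_revision_drift(hf_id: str, pinned_revision: str) -> str | None:
    """Return the live commit SHA when it differs from the pinned one."""
    live_sha = HfApi().model_info(hf_id).sha  # HEAD of the default branch
    return live_sha if live_sha != pinned_revision else None

# The new entry pins a placeholder SHA, so this would report drift:
live = check_revision_drift(
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "bc0deffedcba0987654321abc2d3e4f5a6b7c8d9",
)
if live:
    print(f"mixtral-8x7b-instruct: pinned revision is stale; live value {live}")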
src/dlm/base_models/resolver.py (modified)

@@ -238,6 +238,7 @@ def _infer_gguf_arch(architecture: str) -> str:
         "Qwen2ForCausalLM": "qwen2",
         "Qwen3ForCausalLM": "qwen3",
         "MistralForCausalLM": "llama",
+        "MixtralForCausalLM": "llama",
         "Phi3ForCausalLM": "phi3",
         "GemmaForCausalLM": "gemma",
         "Gemma2ForCausalLM": "gemma2",
@@ -260,7 +261,7 @@ def _infer_template(hf_id: str, architecture: str) -> TemplateDialect:
         return "phi4mini"
     if architecture.startswith("Phi"):
         return "phi3"
-    if architecture.startswith("Mistral"):
+    if architecture.startswith(("Mistral", "Mixtral")):
         return "mistral"
     return "chatml"
 
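Both hunks edit small pure functions whose relevant contents are fully visible above, so the post-patch behavior can be sketched directly. The reconstruction below abbreviates the mapping to the entries shown in the hunks; the `TemplateDialect` alias, the elided `phi4mini` branch, and the dictionary fallback are assumptions.

# Abbreviated reconstruction of the two patched resolver helpers.
TemplateDialect = str  # assumed alias; the real one is likely a Literal

def _infer_gguf_arch(architecture: str) -> str:
    # Mistral and Mixtral checkpoints both convert through llama.cpp's
    # Llama path, hence two keys mapping to the same "llama" arch.
    table = {
        "Qwen2ForCausalLM": "qwen2",
        "Qwen3ForCausalLM": "qwen3",
        "MistralForCausalLM": "llama",
        "MixtralForCausalLM": "llama",  # added by this commit
        "Phi3ForCausalLM": "phi3",
        "GemmaForCausalLM": "gemma",
        "Gemma2ForCausalLM": "gemma2",
    }
    return table.get(architecture, "llama")  # fallback is a guess

def _infer_template(hf_id: str, architecture: str) -> TemplateDialect:
    if architecture.startswith("Phi"):
        return "phi3"
    # str.startswith accepts a tuple and matches any member, so one
    # branch now covers MistralForCausalLM and MixtralForCausalLM.
    if architecture.startswith(("Mistral", "Mixtral")):
        return "mistral"
    return "chatml"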
tests/unit/base_models/test_registry.py (modified)

@@ -75,6 +75,7 @@ class TestLicenseFields:
             "qwen3-1.7b",
             "qwen3-4b",
             "qwen3-8b",
+            "mixtral-8x7b-instruct",
             "smollm3-3b",
             "olmo-2-7b-instruct",
             "smollm2-135m",
tests/unit/base_models/test_registry_2026.py (modified)

@@ -183,3 +183,30 @@ class TestPhi4MiniReasoningRegistryEntry:
         assert spec.size_gb_fp16 == pytest.approx(7.6)
         assert spec.context_length == 131_072
         assert spec.recommended_seq_len == 2048
+
+
+class TestMixtralRegistryEntry:
+    def test_entry_present(self) -> None:
+        assert "mixtral-8x7b-instruct" in BASE_MODELS
+
+    def test_entry_is_open_sparse_moe(self) -> None:
+        spec = BASE_MODELS["mixtral-8x7b-instruct"]
+        assert spec.license_spdx == "Apache-2.0"
+        assert spec.requires_acceptance is False
+        assert spec.redistributable is True
+        assert spec.modality == "text-moe"
+
+    def test_entry_matches_live_family_shape(self) -> None:
+        spec = BASE_MODELS["mixtral-8x7b-instruct"]
+        assert spec.hf_id == "mistralai/Mixtral-8x7B-Instruct-v0.1"
+        assert spec.architecture == "MixtralForCausalLM"
+        assert spec.template == "mistral"
+        assert spec.gguf_arch == "llama"
+        assert spec.tokenizer_pre == "llama-bpe"
+
+    def test_entry_uses_total_parameter_and_context_hints(self) -> None:
+        spec = BASE_MODELS["mixtral-8x7b-instruct"]
+        assert spec.params == 46_700_000_000
+        assert spec.size_gb_fp16 == pytest.approx(93.4)
+        assert spec.context_length == 32_768
+        assert spec.recommended_seq_len == 2048
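The last test pins the total-parameter and fp16-size hints together: `params` counts all 46.7B weights of the sparse MoE (not the roughly 13B active per token), and `size_gb_fp16` follows from it by plain arithmetic. A quick check, assuming the registry's convention is two bytes per fp16 parameter and decimal gigabytes; both conventions are inferred from the asserted values rather than stated in the diff.

# fp16 stores 2 bytes per parameter: 46.7e9 params * 2 B = 93.4e9 B,
# i.e. 93.4 decimal GB, matching size_gb_fp16=93.4 in the new entry.
params = 46_700_000_000
size_gb_fp16 = params * 2 / 1e9
assert size_gb_fp16 == 93.4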
tests/unit/base_models/test_resolver_hf_escape.py (modified)

@@ -61,6 +61,7 @@ class TestInferGgufArch:
             ("Qwen2ForCausalLM", "qwen2"),
             ("Qwen3ForCausalLM", "qwen3"),
             ("MistralForCausalLM", "llama"),
+            ("MixtralForCausalLM", "llama"),
             ("Phi3ForCausalLM", "phi3"),
             ("GemmaForCausalLM", "gemma"),
             ("Gemma2ForCausalLM", "gemma2"),
@@ -86,6 +87,7 @@ class TestInferTemplate:
             ("microsoft/Phi-4-mini-reasoning", "Phi3ForCausalLM", "phi4mini"),
             ("microsoft/Phi-3.5-mini-instruct", "Phi3ForCausalLM", "phi3"),
             ("mistralai/Mistral-7B-Instruct", "MistralForCausalLM", "mistral"),
+            ("mistralai/Mixtral-8x7B-Instruct-v0.1", "MixtralForCausalLM", "mistral"),
             ("Qwen/Qwen2.5-1.5B-Instruct", "Qwen2ForCausalLM", "chatml"),
         ],
     )
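The surrounding context lines (tuples followed by `],` and `)`) indicate these rows live in `pytest.mark.parametrize` tables. Below is a sketch of how the template table likely drives its test; the decorator shape and test body are inferred from that context, while the import path and the free-function form (the real test sits inside a class) are assumptions.

# Hypothetical harness for the template rows added above.
import pytest

from dlm.base_models.resolver import _infer_template  # assumed import path

@pytest.mark.parametrize(
    ("hf_id", "architecture", "expected"),
    [
        ("mistralai/Mistral-7B-Instruct", "MistralForCausalLM", "mistral"),
        ("mistralai/Mixtral-8x7B-Instruct-v0.1", "MixtralForCausalLM", "mistral"),
        ("Qwen/Qwen2.5-1.5B-Instruct", "Qwen2ForCausalLM", "chatml"),
    ],
)
def test_infer_template(hf_id: str, architecture: str, expected: str) -> None:
    assert _infer_template(hf_id, architecture) == expected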