Add Mixtral MoE registry entry
- SHA: 49601e0e277143e8784f7498eb94852ee6db40d9
- Parents: 7635122
- Tree: 7c4275b
| Status | File | + | - |
|---|---|---|---|
| M | src/dlm/base_models/registry.py | 29 | 0 |
| M | src/dlm/base_models/resolver.py | 2 | 1 |
| M | tests/unit/base_models/test_registry.py | 1 | 0 |
| M | tests/unit/base_models/test_registry_2026.py | 27 | 0 |
| M | tests/unit/base_models/test_resolver_hf_escape.py | 2 | 0 |
src/dlm/base_models/registry.py (modified)

```diff
@@ -401,6 +401,35 @@ _ENTRIES: tuple[BaseModelSpec, ...] = (
         recommended_seq_len=2048,
         reasoning_tuned=True,
     ),
+    # Mixtral-8x7B-Instruct-v0.1 — Apache-2.0 sparse MoE base.
+    #
+    # HF exposes this as `MixtralForCausalLM`, but the current vendored
+    # llama.cpp converter routes it through the Llama path rather than a
+    # distinct Mixtral architecture class. We therefore keep
+    # `gguf_arch="llama"` while marking the modality as `text-moe` so
+    # DLM's gate substrate can detect the sparse-MoE family explicitly.
+    BaseModelSpec(
+        key="mixtral-8x7b-instruct",
+        hf_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
+        # Placeholder SHA: format-valid, not a real HF commit. The
+        # weekly `scripts/refresh-registry.py --check` run surfaces
+        # drift and prints the live value for manual review.
+        revision="bc0deffedcba0987654321abc2d3e4f5a6b7c8d9",
+        architecture="MixtralForCausalLM",
+        params=46_700_000_000,
+        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+        template="mistral",
+        gguf_arch="llama",
+        tokenizer_pre="llama-bpe",
+        license_spdx="Apache-2.0",
+        license_url="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1",
+        requires_acceptance=False,
+        redistributable=True,
+        size_gb_fp16=93.4,
+        context_length=32_768,
+        recommended_seq_len=2048,
+        modality="text-moe",
+    ),
     # Mistral Small 3.1 24B Instruct — Apache-2.0 multimodal base with
     # native vision support and 128k context.
     #
```
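The `modality="text-moe"` marker is what lets downstream code distinguish Mixtral's sparse-MoE routing from the dense Mistral entries that share the same `gguf_arch`. Below is a minimal sketch of how a consumer could branch on that flag; the import path and the `is_sparse_moe` helper are illustrative assumptions, not part of this change.

```python
# Hypothetical consumer of the new registry entry. BASE_MODELS and the spec
# fields appear in the diff above; the helper and import path are assumed.
from dlm.base_models.registry import BASE_MODELS  # assumed import path


def is_sparse_moe(key: str) -> bool:
    """True when a registry entry is tagged as a sparse-MoE text model."""
    spec = BASE_MODELS[key]
    return spec.modality == "text-moe"


# The Mixtral key added in this commit should report as sparse MoE.
assert is_sparse_moe("mixtral-8x7b-instruct")
```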
src/dlm/base_models/resolver.py (modified)

```diff
@@ -238,6 +238,7 @@ def _infer_gguf_arch(architecture: str) -> str:
         "Qwen2ForCausalLM": "qwen2",
         "Qwen3ForCausalLM": "qwen3",
         "MistralForCausalLM": "llama",
+        "MixtralForCausalLM": "llama",
         "Phi3ForCausalLM": "phi3",
         "GemmaForCausalLM": "gemma",
         "Gemma2ForCausalLM": "gemma2",
@@ -260,7 +261,7 @@ def _infer_template(hf_id: str, architecture: str) -> TemplateDialect:
         return "phi4mini"
     if architecture.startswith("Phi"):
         return "phi3"
-    if architecture.startswith("Mistral"):
+    if architecture.startswith(("Mistral", "Mixtral")):
         return "mistral"
     return "chatml"
 
```
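Both resolver hunks are small, so the surrounding functions are easy to misread from the hunk fragments alone. The sketch below compresses them into a self-contained form; the dict name, the fallback value, and the omitted branches (for example the `phi4mini` special case on `hf_id`) are simplifications, not the file's actual layout.

```python
# Compressed sketch of the two resolver helpers after this change. Only the
# Mixtral-related lines are verbatim from the diff; names and fallbacks are assumed.
_ARCH_TO_GGUF = {
    "Qwen2ForCausalLM": "qwen2",
    "Qwen3ForCausalLM": "qwen3",
    "MistralForCausalLM": "llama",
    "MixtralForCausalLM": "llama",  # new: MoE checkpoints convert via the Llama path
    "Phi3ForCausalLM": "phi3",
}


def _infer_gguf_arch(architecture: str) -> str:
    # The real function may handle unknown architectures differently;
    # this sketch simply falls back to "llama".
    return _ARCH_TO_GGUF.get(architecture, "llama")


def _infer_template(hf_id: str, architecture: str) -> str:
    if architecture.startswith("Phi"):
        return "phi3"
    # Mixtral reuses Mistral's chat template, hence the tuple prefix check.
    if architecture.startswith(("Mistral", "Mixtral")):
        return "mistral"
    return "chatml"
```

The tuple form of `str.startswith` returns True when any listed prefix matches, so one condition now covers both the dense Mistral and sparse Mixtral families.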
tests/unit/base_models/test_registry.py (modified)

```diff
@@ -75,6 +75,7 @@ class TestLicenseFields:
         "qwen3-1.7b",
         "qwen3-4b",
         "qwen3-8b",
+        "mixtral-8x7b-instruct",
         "smollm3-3b",
         "olmo-2-7b-instruct",
         "smollm2-135m",
```
tests/unit/base_models/test_registry_2026.py (modified)

```diff
@@ -183,3 +183,30 @@ class TestPhi4MiniReasoningRegistryEntry:
         assert spec.size_gb_fp16 == pytest.approx(7.6)
         assert spec.context_length == 131_072
         assert spec.recommended_seq_len == 2048
+
+
+class TestMixtralRegistryEntry:
+    def test_entry_present(self) -> None:
+        assert "mixtral-8x7b-instruct" in BASE_MODELS
+
+    def test_entry_is_open_sparse_moe(self) -> None:
+        spec = BASE_MODELS["mixtral-8x7b-instruct"]
+        assert spec.license_spdx == "Apache-2.0"
+        assert spec.requires_acceptance is False
+        assert spec.redistributable is True
+        assert spec.modality == "text-moe"
+
+    def test_entry_matches_live_family_shape(self) -> None:
+        spec = BASE_MODELS["mixtral-8x7b-instruct"]
+        assert spec.hf_id == "mistralai/Mixtral-8x7B-Instruct-v0.1"
+        assert spec.architecture == "MixtralForCausalLM"
+        assert spec.template == "mistral"
+        assert spec.gguf_arch == "llama"
+        assert spec.tokenizer_pre == "llama-bpe"
+
+    def test_entry_uses_total_parameter_and_context_hints(self) -> None:
+        spec = BASE_MODELS["mixtral-8x7b-instruct"]
+        assert spec.params == 46_700_000_000
+        assert spec.size_gb_fp16 == pytest.approx(93.4)
+        assert spec.context_length == 32_768
+        assert spec.recommended_seq_len == 2048
```
tests/unit/base_models/test_resolver_hf_escape.py (modified)

```diff
@@ -61,6 +61,7 @@ class TestInferGgufArch:
         ("Qwen2ForCausalLM", "qwen2"),
         ("Qwen3ForCausalLM", "qwen3"),
         ("MistralForCausalLM", "llama"),
+        ("MixtralForCausalLM", "llama"),
         ("Phi3ForCausalLM", "phi3"),
         ("GemmaForCausalLM", "gemma"),
         ("Gemma2ForCausalLM", "gemma2"),
@@ -86,6 +87,7 @@ class TestInferTemplate:
         ("microsoft/Phi-4-mini-reasoning", "Phi3ForCausalLM", "phi4mini"),
         ("microsoft/Phi-3.5-mini-instruct", "Phi3ForCausalLM", "phi3"),
         ("mistralai/Mistral-7B-Instruct", "MistralForCausalLM", "mistral"),
+        ("mistralai/Mixtral-8x7B-Instruct-v0.1", "MixtralForCausalLM", "mistral"),
         ("Qwen/Qwen2.5-1.5B-Instruct", "Qwen2ForCausalLM", "chatml"),
     ],
 )
```
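For reference, the new tuples plug into the existing parametrized tests roughly as sketched below; the parameter names, decorator layout, and import path are assumptions about the file's existing structure, not something this commit changes.

```python
# Plausible shape of the parametrized test that consumes the new tuple; the
# real test_resolver_hf_escape.py may name its parameters differently.
import pytest

from dlm.base_models.resolver import _infer_template  # assumed import path


class TestInferTemplate:
    @pytest.mark.parametrize(
        ("hf_id", "architecture", "expected"),
        [
            ("mistralai/Mistral-7B-Instruct", "MistralForCausalLM", "mistral"),
            ("mistralai/Mixtral-8x7B-Instruct-v0.1", "MixtralForCausalLM", "mistral"),
        ],
    )
    def test_template_for_architecture(
        self, hf_id: str, architecture: str, expected: str
    ) -> None:
        # Each tuple from the parametrize list above is checked independently.
        assert _infer_template(hf_id, architecture) == expected
```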