"""Audio-base registry + preprocessor plan + audio probe (Sprint 35.2).

Mirrors `test_vl_registry.py` for the audio-language modality. Covers:

- `qwen2-audio-7b-instruct` is present and has `modality="audio-language"`.
- Its `AudioPreprocessorPlan` is pinned (16 kHz, 30 s, `<|AUDIO|>`, 750).
- License is Apache-2.0 and the current HF row is no longer gated, so
  the spec stays redistributable.
- `modality="audio-language"` without a plan rejects at validate time;
  text bases cannot carry an audio plan; VL bases cannot carry an audio
  plan; audio bases cannot carry a VL plan.
- `run_all` on an audio spec skips the llama.cpp-converter probes (no
  audio-arch support on any llama.cpp roadmap).
"""

from __future__ import annotations

import pytest
from pydantic import ValidationError

from dlm.base_models import BASE_MODELS
from dlm.base_models.probes import run_all
from dlm.base_models.schema import (
    AudioPreprocessorPlan,
    BaseModelSpec,
    VlPreprocessorPlan,
)
| 30 |
class TestQwen2AudioRegistryEntry:
    """Pinned-registry checks for the `qwen2-audio-7b-instruct` row."""

    @staticmethod
    def _spec() -> BaseModelSpec:
        # Single lookup point so every test reads the same registry row.
        return BASE_MODELS["qwen2-audio-7b-instruct"]

    def test_entry_present(self) -> None:
        assert "qwen2-audio-7b-instruct" in BASE_MODELS

    def test_modality_is_audio(self) -> None:
        assert self._spec().modality == "audio-language"

    def test_preprocessor_plan_pinned(self) -> None:
        plan = self._spec().audio_preprocessor_plan
        assert plan is not None
        # All four knobs are pinned; a drift in any one breaks reproducibility.
        assert plan.sample_rate == 16_000
        assert plan.max_length_seconds == 30.0
        assert plan.audio_token == "<|AUDIO|>"
        assert plan.num_audio_tokens == 750

    def test_no_vl_plan(self) -> None:
        assert self._spec().vl_preprocessor_plan is None

    def test_license_open_and_redistributable(self) -> None:
        spec = self._spec()
        # Identity checks on purpose: the fields must be literal booleans.
        assert spec.requires_acceptance is False
        assert spec.redistributable is True

    def test_architecture_is_audio_conditional_generation(self) -> None:
        assert self._spec().architecture == "Qwen2AudioForConditionalGeneration"

    def test_template_is_qwen2_audio(self) -> None:
        assert self._spec().template == "qwen2-audio"
| 65 |
class TestAudioPreprocessorPlan:
    """Field validation and immutability of `AudioPreprocessorPlan`."""

    # Known-good field values; each rejection test overrides exactly one.
    _VALID = {
        "sample_rate": 16_000,
        "max_length_seconds": 30.0,
        "audio_token": "<|AUDIO|>",
        "num_audio_tokens": 750,
    }

    def _make(self, **overrides: object) -> AudioPreprocessorPlan:
        # Build a plan from the valid baseline plus any per-test override.
        return AudioPreprocessorPlan(**{**self._VALID, **overrides})

    def test_rejects_non_positive_sample_rate(self) -> None:
        with pytest.raises(ValidationError):
            self._make(sample_rate=0)

    def test_rejects_non_positive_max_length(self) -> None:
        with pytest.raises(ValidationError):
            self._make(max_length_seconds=0.0)

    def test_rejects_empty_audio_token(self) -> None:
        with pytest.raises(ValidationError):
            self._make(audio_token="")

    def test_rejects_non_positive_num_audio_tokens(self) -> None:
        with pytest.raises(ValidationError):
            self._make(num_audio_tokens=0)

    def test_frozen(self) -> None:
        plan = self._make()
        # Frozen model: attribute assignment must raise, not mutate.
        with pytest.raises(ValidationError):
            plan.num_audio_tokens = 1500  # type: ignore[misc]
| 113 |
class TestSpecModalityInvariants:
    """Cross-field modality/plan rules enforced by `BaseModelSpec` validation."""

    def _base_kwargs(self) -> dict[str, object]:
        # Minimal valid payload; each test layers modality/plan fields on top.
        return dict(
            key="test-entry",
            hf_id="test/entry",
            revision="a" * 40,
            architecture="LlamaForCausalLM",
            params=1_000_000,
            target_modules=["q_proj"],
            template="chatml",
            gguf_arch="llama",
            tokenizer_pre="llama-bpe",
            license_spdx="Apache-2.0",
            redistributable=True,
            size_gb_fp16=0.5,
            context_length=4096,
            recommended_seq_len=1024,
        )

    def _audio_plan(self) -> AudioPreprocessorPlan:
        return AudioPreprocessorPlan(
            sample_rate=16_000,
            max_length_seconds=30.0,
            audio_token="<|AUDIO|>",
            num_audio_tokens=750,
        )

    def _vl_plan(self) -> VlPreprocessorPlan:
        return VlPreprocessorPlan(
            target_size=(224, 224),
            image_token="<image>",
            num_image_tokens=256,
        )

    def test_audio_without_plan_rejected(self) -> None:
        kwargs = self._base_kwargs()
        kwargs["modality"] = "audio-language"
        with pytest.raises(ValidationError, match="requires an audio_preprocessor_plan"):
            BaseModelSpec(**kwargs)  # type: ignore[arg-type]

    def test_text_with_audio_plan_rejected(self) -> None:
        kwargs = self._base_kwargs()
        kwargs["modality"] = "text"
        kwargs["audio_preprocessor_plan"] = self._audio_plan()
        with pytest.raises(ValidationError, match="only valid with"):
            BaseModelSpec(**kwargs)  # type: ignore[arg-type]

    def test_vl_with_audio_plan_rejected(self) -> None:
        kwargs = self._base_kwargs()
        kwargs["modality"] = "vision-language"
        kwargs["vl_preprocessor_plan"] = self._vl_plan()
        kwargs["audio_preprocessor_plan"] = self._audio_plan()
        with pytest.raises(ValidationError, match="audio_preprocessor_plan is invalid"):
            BaseModelSpec(**kwargs)  # type: ignore[arg-type]

    def test_audio_with_vl_plan_rejected(self) -> None:
        kwargs = self._base_kwargs()
        kwargs["modality"] = "audio-language"
        kwargs["audio_preprocessor_plan"] = self._audio_plan()
        kwargs["vl_preprocessor_plan"] = self._vl_plan()
        with pytest.raises(ValidationError, match="vl_preprocessor_plan is invalid"):
            BaseModelSpec(**kwargs)  # type: ignore[arg-type]
| 178 |
class TestRunAllSkipsExportProbesForAudio:
    """On audio specs `run_all` omits every llama.cpp-converter probe.

    No audio architecture is on the llama.cpp roadmap, so GGUF export
    refuses cleanly (an HF snapshot is emitted instead) and the
    dispatcher silently drops the export probes from the report.
    """

    @staticmethod
    def _probe_names() -> set[str]:
        # Run the full probe dispatch on the audio spec and collect names.
        report = run_all(BASE_MODELS["qwen2-audio-7b-instruct"])
        return {result.name for result in report.results}

    def test_audio_spec_yields_two_probes(self) -> None:
        names = self._probe_names()
        # None of the GGUF/tokenizer export probes may appear for audio.
        assert names.isdisjoint(
            {"gguf_arch", "pretokenizer_label", "pretokenizer_hash"}
        )
        # audio_token is the audio-specific probe; it may skip if
        # transformers/processor isn't cached locally.
        assert "audio_token" in names
        assert "architecture" in names
        # Chat-template probe does not apply to audio bases.
        assert "chat_template" not in names

    def test_audio_spec_skips_vl_probe(self) -> None:
        assert "vl_image_token" not in self._probe_names()