Python · 12005 bytes Raw Blame History
"""Audio cache — key stability, atomic I/O, processor fingerprint.

Mirrors `test_vl_cache.py`. Covers:

- `AudioCacheKey.as_filename` / `shard` are deterministic.
- Different sample_rate / max_length_ms / processor_sha produce
  different filenames.
- `auto_resample=True` adds a `.rs` filename suffix; the default leaves
  the filename unchanged.
- Round-trip: put → get returns byte-identical array.
- Miss on empty store, miss on corrupt file.
- `processor_sha256` is stable across repeat calls + pinned on instance.
- Different feature-extractor constants drift the sha.
- `WaveformCacheKey` / `WaveformCache` get the same key and round-trip
  checks, and the two caches keep separate on-disk trees.
"""
13
14 from __future__ import annotations
15
16 from pathlib import Path
17 from types import SimpleNamespace
18
19 import numpy as np
20 import pytest
21
22 from dlm.data.audio_cache import (
23 AudioCache,
24 AudioCacheKey,
25 WaveformCache,
26 WaveformCacheKey,
27 processor_sha256,
28 )
29
30
def _key(**overrides: object) -> AudioCacheKey:
    """Build an `AudioCacheKey` from fixed test defaults.

    Keyword arguments replace (or extend) the default field values before
    they are forwarded to the `AudioCacheKey` constructor.
    """
    fields: dict[str, object] = {
        "blob_sha": "a" * 64,
        "processor_sha": "b" * 64,
        "sample_rate": 16_000,
        "max_length_ms": 30_000,
        # Caller-supplied values win over the defaults listed above.
        **overrides,
    }
    return AudioCacheKey(**fields)  # type: ignore[arg-type]
40
41
class TestAudioCacheKey:
    """Filename, shard, and immutability checks for `AudioCacheKey`."""

    def test_filename_shape(self) -> None:
        """Default key renders the blob sha plus a 12-char processor prefix."""
        expected = f"{'a' * 64}.{'b' * 12}.16000.30000.npz"
        rendered = _key().as_filename()
        assert rendered == expected

    def test_shard_is_two_prefix(self) -> None:
        """Shard equals the first two characters of the blob sha."""
        sharded_key = _key(blob_sha="cd" + "0" * 62)
        assert sharded_key.shard() == "cd"

    def test_different_sample_rate_different_filename(self) -> None:
        """Changing only `sample_rate` changes the filename."""
        name_16k = _key(sample_rate=16_000).as_filename()
        name_48k = _key(sample_rate=48_000).as_filename()
        assert name_16k != name_48k

    def test_different_max_length_different_filename(self) -> None:
        """Changing only `max_length_ms` changes the filename."""
        name_30s = _key(max_length_ms=30_000).as_filename()
        name_60s = _key(max_length_ms=60_000).as_filename()
        assert name_30s != name_60s

    def test_different_processor_different_filename(self) -> None:
        """Changing only `processor_sha` changes the filename."""
        name_ones = _key(processor_sha="1" * 64).as_filename()
        name_twos = _key(processor_sha="2" * 64).as_filename()
        assert name_ones != name_twos

    def test_key_is_frozen(self) -> None:
        """Assigning to a key field raises `AttributeError`."""
        frozen_key = _key()
        with pytest.raises(AttributeError):
            frozen_key.blob_sha = "x" * 64  # type: ignore[misc]

    def test_auto_resample_default_false_absent_from_filename(self) -> None:
        """Default False → filename stays v11-compatible (no `.rs` suffix).

        Backward-compat guard: a cache populated before the auto_resample
        field existed must still hit on the same filename when the caller
        does not opt in.
        """
        default_name = _key().as_filename()
        assert ".rs" not in default_name

    def test_auto_resample_true_adds_suffix(self) -> None:
        """Opting in changes the filename and introduces `.rs`."""
        plain_name = _key(auto_resample=False).as_filename()
        resampled_name = _key(auto_resample=True).as_filename()
        assert plain_name != resampled_name
        assert ".rs" in resampled_name
84
85
class TestAudioCacheRoundTrip:
    """Put / get / exists / clear behavior of `AudioCache` under `tmp_path`."""

    def test_miss_on_empty(self, tmp_path: Path) -> None:
        """A lookup on a never-written cache returns None."""
        store = AudioCache(tmp_path / "audio")
        result = store.get(_key())
        assert result is None

    def test_put_then_get(self, tmp_path: Path) -> None:
        """A stored float32 array comes back equal, with dtype preserved."""
        store = AudioCache(tmp_path / "audio")
        original = np.arange(80 * 3000, dtype=np.float32).reshape(1, 80, 3000)
        store.put(_key(), original)
        # A second, equal-by-value key is built on purpose for the read.
        fetched = store.get(_key())
        assert fetched is not None
        np.testing.assert_array_equal(fetched, original)
        assert fetched.dtype == np.float32

    def test_put_creates_shard_dir(self, tmp_path: Path) -> None:
        """Writing an entry creates the two-character shard directory."""
        root = tmp_path / "audio"
        store = AudioCache(root)
        store.put(_key(blob_sha="ef" + "0" * 62), np.zeros((1,), dtype=np.float32))
        shard_dir = root / "ef"
        assert shard_dir.is_dir()

    def test_exists_flips_after_put(self, tmp_path: Path) -> None:
        """`exists` is False before the write and True after it."""
        store = AudioCache(tmp_path / "audio")
        probe = _key()
        assert store.exists(probe) is False
        store.put(probe, np.zeros((1,), dtype=np.float32))
        assert store.exists(probe) is True

    def test_corrupt_file_treated_as_miss(self, tmp_path: Path) -> None:
        """An entry overwritten with garbage bytes reads back as a miss."""
        store = AudioCache(tmp_path / "audio")
        probe = _key()
        store.put(probe, np.zeros((1,), dtype=np.float32))
        # Clobber the stored file in place with bytes that are not an npz.
        entry_path = store.path_for(probe)
        entry_path.write_bytes(b"not a real npz")
        assert store.get(probe) is None

    def test_clear_removes_tree(self, tmp_path: Path) -> None:
        """`clear` removes the cache root directory itself."""
        root = tmp_path / "audio"
        store = AudioCache(root)
        store.put(_key(), np.zeros((1,), dtype=np.float32))
        store.clear()
        assert not root.exists()
125
126
class TestProcessorSha256:
    """Stability and sensitivity checks for `processor_sha256`."""

    def _make_processor(self, **attrs: object) -> SimpleNamespace:
        """Return a stand-in processor exposing a `feature_extractor` namespace.

        Keyword arguments override (or extend) the default feature-extractor
        attributes.
        """
        fe_fields: dict[str, object] = {
            "sampling_rate": 16_000,
            "feature_size": 80,
            "n_fft": 400,
            "hop_length": 160,
            "chunk_length": 30,
            "padding_value": 0.0,
            "return_attention_mask": True,
            **attrs,
        }
        return SimpleNamespace(feature_extractor=SimpleNamespace(**fe_fields))

    def test_stable_across_calls(self) -> None:
        """Two calls on the same processor return the same digest."""
        processor = self._make_processor()
        first = processor_sha256(processor)
        second = processor_sha256(processor)
        assert first == second

    def test_pinned_on_instance(self) -> None:
        """A later attribute mutation does not change the returned digest."""
        processor = self._make_processor()
        pinned = processor_sha256(processor)
        # Change a field that would drift the sha if it were recomputed;
        # the value pinned on the instance must be returned instead, which
        # keeps repeat calls O(1).
        processor.feature_extractor.sampling_rate = 48_000
        assert processor_sha256(processor) == pinned

    def test_different_sample_rate_different_sha(self) -> None:
        """Processors differing only in `sampling_rate` hash differently."""
        sha_16k = processor_sha256(self._make_processor(sampling_rate=16_000))
        sha_48k = processor_sha256(self._make_processor(sampling_rate=48_000))
        assert sha_16k != sha_48k

    def test_different_n_fft_different_sha(self) -> None:
        """Processors differing only in `n_fft` hash differently."""
        sha_small = processor_sha256(self._make_processor(n_fft=400))
        sha_large = processor_sha256(self._make_processor(n_fft=1024))
        assert sha_small != sha_large

    def test_different_hop_length_different_sha(self) -> None:
        """Processors differing only in `hop_length` hash differently."""
        sha_short = processor_sha256(self._make_processor(hop_length=160))
        sha_long = processor_sha256(self._make_processor(hop_length=320))
        assert sha_short != sha_long

    def test_different_feature_extractor_class_different_sha(self) -> None:
        """Identical attribute values on different classes hash differently."""

        class FeA:
            sampling_rate = 16_000
            feature_size = 80
            n_fft = 400
            hop_length = 160

        class FeB:
            sampling_rate = 16_000
            feature_size = 80
            n_fft = 400
            hop_length = 160

        sha_a = processor_sha256(SimpleNamespace(feature_extractor=FeA()))
        sha_b = processor_sha256(SimpleNamespace(feature_extractor=FeB()))
        assert sha_a != sha_b

    def test_nested_feature_extractor_fields_are_readable(self) -> None:
        """Tuple- and dict-valued fields still yield a 64-character digest."""
        extractor = SimpleNamespace(
            sampling_rate=16_000,
            feature_size=(80, 2),
            n_fft=400,
            hop_length=160,
            chunk_length={"seconds": 30},
            padding_value=0.0,
            return_attention_mask=True,
        )
        digest = processor_sha256(SimpleNamespace(feature_extractor=extractor))
        assert len(digest) == 64

    def test_exotic_feature_field_stringifies_stably(self) -> None:
        """A bare `object()` field value still yields a 64-character digest."""
        extractor = SimpleNamespace(
            sampling_rate=16_000,
            feature_size=80,
            n_fft=400,
            hop_length=160,
            chunk_length=object(),
            padding_value=0.0,
            return_attention_mask=True,
        )
        digest = processor_sha256(SimpleNamespace(feature_extractor=extractor))
        assert len(digest) == 64
215
216
217 # --- WaveformCache (35.2 deferred-item follow-up) ---------------------------
218
219
def _wkey(**overrides: object) -> WaveformCacheKey:
    """Build a `WaveformCacheKey` from fixed test defaults.

    Keyword arguments replace (or extend) the default field values before
    they are forwarded to the `WaveformCacheKey` constructor.
    """
    fields: dict[str, object] = {
        "blob_sha": "a" * 64,
        "sample_rate": 16_000,
        "max_length_ms": 30_000,
        # Caller-supplied values win over the defaults listed above.
        **overrides,
    }
    return WaveformCacheKey(**fields)  # type: ignore[arg-type]
228
229
class TestWaveformCacheKey:
    """Filename, shard, and immutability checks for `WaveformCacheKey`."""

    def test_filename_shape(self) -> None:
        """Default key renders blob sha, sample rate, max length, `.wav.npz`."""
        k = _wkey()
        assert k.as_filename() == f"{'a' * 64}.16000.30000.wav.npz"

    def test_shard_is_two_prefix(self) -> None:
        """Shard equals the first two characters of the blob sha."""
        assert _wkey(blob_sha="cd" + "0" * 62).shard() == "cd"

    def test_different_sample_rate_different_filename(self) -> None:
        """Changing only `sample_rate` changes the filename."""
        assert _wkey(sample_rate=16_000).as_filename() != _wkey(sample_rate=48_000).as_filename()

    def test_different_max_length_different_filename(self) -> None:
        """Changing only `max_length_ms` changes the filename."""
        assert (
            _wkey(max_length_ms=30_000).as_filename() != _wkey(max_length_ms=60_000).as_filename()
        )

    def test_key_no_processor_sha(self) -> None:
        """Waveform cache is pre-processor; key should omit processor_sha."""
        name = _wkey().as_filename()
        # Kept from the original test. Given the filename shape pinned by
        # `test_filename_shape`, this check alone can never fail, so the
        # assertions below carry the real contract.
        assert "proc" not in name.lower()
        # The sample rate follows the blob sha directly: there is no
        # processor segment between them.
        assert name.split(".")[:2] == ["a" * 64, "16000"]
        # Distinct filenames from AudioCacheKey even for overlapping params
        # — the layout is intentionally separate.
        assert name != _key().as_filename()

    def test_key_is_frozen(self) -> None:
        """Assigning to a key field raises `AttributeError`."""
        k = _wkey()
        with pytest.raises(AttributeError):
            k.blob_sha = "x" * 64  # type: ignore[misc]

    def test_auto_resample_default_false_absent_from_filename(self) -> None:
        """Without opting in, the filename carries no `.rs` marker."""
        assert ".rs" not in _wkey().as_filename()

    def test_auto_resample_true_adds_suffix(self) -> None:
        """Opting in changes the filename and introduces `.rs`."""
        a = _wkey(auto_resample=False)
        b = _wkey(auto_resample=True)
        assert a.as_filename() != b.as_filename()
        assert ".rs" in b.as_filename()
266
267
class TestWaveformCacheRoundTrip:
    """Put / get / exists / clear behavior of `WaveformCache` under `tmp_path`."""

    def test_miss_on_empty(self, tmp_path: Path) -> None:
        """A lookup on a never-written cache returns None."""
        store = WaveformCache(tmp_path / "wav")
        result = store.get(_wkey())
        assert result is None

    def test_put_then_get(self, tmp_path: Path) -> None:
        """A stored float32 waveform comes back equal, with dtype preserved."""
        store = WaveformCache(tmp_path / "wav")
        original = np.arange(16_000 * 1, dtype=np.float32) / 16_000.0
        store.put(_wkey(), original)
        # A second, equal-by-value key is built on purpose for the read.
        fetched = store.get(_wkey())
        assert fetched is not None
        np.testing.assert_array_equal(fetched, original)
        assert fetched.dtype == np.float32

    def test_put_creates_shard_dir(self, tmp_path: Path) -> None:
        """Writing an entry creates the two-character shard directory."""
        root = tmp_path / "wav"
        store = WaveformCache(root)
        store.put(_wkey(blob_sha="ef" + "0" * 62), np.zeros((1,), dtype=np.float32))
        shard_dir = root / "ef"
        assert shard_dir.is_dir()

    def test_exists_flips_after_put(self, tmp_path: Path) -> None:
        """`exists` is False before the write and True after it."""
        store = WaveformCache(tmp_path / "wav")
        probe = _wkey()
        assert store.exists(probe) is False
        store.put(probe, np.zeros((1,), dtype=np.float32))
        assert store.exists(probe) is True

    def test_corrupt_file_treated_as_miss(self, tmp_path: Path) -> None:
        """An entry overwritten with garbage bytes reads back as a miss."""
        store = WaveformCache(tmp_path / "wav")
        probe = _wkey()
        store.put(probe, np.zeros((1,), dtype=np.float32))
        # Clobber the stored file in place with bytes that are not an npz.
        entry_path = store.path_for(probe)
        entry_path.write_bytes(b"not npz")
        assert store.get(probe) is None

    def test_clear_removes_tree(self, tmp_path: Path) -> None:
        """`clear` removes the cache root directory itself."""
        root = tmp_path / "wav"
        store = WaveformCache(root)
        store.put(_wkey(), np.zeros((1,), dtype=np.float32))
        store.clear()
        assert not root.exists()
307
308
class TestWaveformAndFeatureCachesDistinct:
    """The feature cache and the waveform cache must not share files on disk."""

    def test_separate_roots_coexist(self, tmp_path: Path) -> None:
        """Each cache, given its own root, writes `.npz` files only there."""
        feature_root = tmp_path / "audio-cache"
        waveform_root = tmp_path / "audio-waveform-cache"
        payload = np.zeros((1,), dtype=np.float32)

        feature_key = AudioCacheKey(
            blob_sha="a" * 64,
            processor_sha="b" * 64,
            sample_rate=16_000,
            max_length_ms=30_000,
        )
        AudioCache(feature_root).put(feature_key, payload)
        WaveformCache(waveform_root).put(_wkey(), np.zeros((1,), dtype=np.float32))

        # Neither cache's directory tree touches the other's.
        feature_files = set(feature_root.rglob("*.npz"))
        waveform_files = set(waveform_root.rglob("*.npz"))
        assert feature_files
        assert waveform_files
        assert feature_files.isdisjoint(waveform_files)
330 assert feat_files.isdisjoint(wave_files)