documentlanguagemodel Public

Watch 0 Fork 0 Star 0

Python · 12005 bytes Raw Blame History

  
        1
        """Audio cache — key stability, atomic I/O, processor fingerprint.
      
        2
        
        3
        Mirrors `test_vl_cache.py`. Covers:
      
        4
        
        5
        - `AudioCacheKey.as_filename` / `shard` are deterministic.
      
        6
        - Different sample_rate / max_length_ms / processor_sha produce
      
        7
          different filenames.
      
        8
        - Round-trip: put → get returns byte-identical array.
      
        9
        - Miss on empty store, miss on corrupt file.
      
        10
        - `processor_sha256` is stable across repeat calls + pinned on instance.
      
        11
        - Different feature-extractor constants drift the sha.
      
        12
        """
      
        13
        
        14
        from __future__ import annotations
      
        15
        
        16
        from pathlib import Path
      
        17
        from types import SimpleNamespace
      
        18
        
        19
        import numpy as np
      
        20
        import pytest
      
        21
        
        22
        from dlm.data.audio_cache import (
      
        23
            AudioCache,
      
        24
            AudioCacheKey,
      
        25
            WaveformCache,
      
        26
            WaveformCacheKey,
      
        27
            processor_sha256,
      
        28
        )
      
        29
        
        30
        
        31
        def _key(**overrides: object) -> AudioCacheKey:
            """Build an `AudioCacheKey` from canned field values.

            Any keyword argument replaces the canned value of the same name, or
            is passed through as an extra field, before the key is constructed.
            """
            fields = {
                "blob_sha": "a" * 64,
                "processor_sha": "b" * 64,
                "sample_rate": 16_000,
                "max_length_ms": 30_000,
                **overrides,
            }
            return AudioCacheKey(**fields)  # type: ignore[arg-type]
      
        40
        
        41
        
        42
        class TestAudioCacheKey:
            """Filename and shard derivation of `AudioCacheKey`."""

            def test_filename_shape(self) -> None:
                """Canned key renders as blob sha, 12 `b` characters, rate, length."""
                expected = ".".join(("a" * 64, "b" * 12, "16000", "30000", "npz"))
                assert _key().as_filename() == expected

            def test_shard_is_two_prefix(self) -> None:
                """A blob sha starting with `cd` lands in shard `cd`."""
                blob = "cd" + "0" * 62
                assert _key(blob_sha=blob).shard() == "cd"

            def test_different_sample_rate_different_filename(self) -> None:
                """Changing only `sample_rate` changes the filename."""
                names = {_key(sample_rate=rate).as_filename() for rate in (16_000, 48_000)}
                assert len(names) == 2

            def test_different_max_length_different_filename(self) -> None:
                """Changing only `max_length_ms` changes the filename."""
                names = {_key(max_length_ms=ms).as_filename() for ms in (30_000, 60_000)}
                assert len(names) == 2

            def test_different_processor_different_filename(self) -> None:
                """Changing only `processor_sha` changes the filename."""
                names = {_key(processor_sha=digit * 64).as_filename() for digit in "12"}
                assert len(names) == 2

            def test_key_is_frozen(self) -> None:
                """Assigning to a field of an existing key raises AttributeError."""
                frozen = _key()
                with pytest.raises(AttributeError):
                    frozen.blob_sha = "x" * 64  # type: ignore[misc]

            def test_auto_resample_default_false_absent_from_filename(self) -> None:
                """Default False → filename stays v11-compatible (no `.rs` suffix).

                Backward-compat guard: a cache populated before the
                auto_resample field landed must still hit on the same filename
                when the caller doesn't opt in.
                """
                name = _key().as_filename()
                assert ".rs" not in name

            def test_auto_resample_true_adds_suffix(self) -> None:
                """Opting in changes the filename and puts `.rs` into it."""
                plain = _key(auto_resample=False).as_filename()
                resampled = _key(auto_resample=True).as_filename()
                assert plain != resampled
                assert ".rs" in resampled
      
        84
        
        85
        
        86
        class TestAudioCacheRoundTrip:
            """put / get / exists / clear on an `AudioCache` rooted in a temp dir."""

            def test_miss_on_empty(self, tmp_path: Path) -> None:
                """A cache that was never written to returns None."""
                store = AudioCache(tmp_path / "audio")
                assert store.get(_key()) is None

            def test_put_then_get(self, tmp_path: Path) -> None:
                """A stored float32 array comes back equal, with the same dtype."""
                store = AudioCache(tmp_path / "audio")
                features = np.arange(80 * 3000, dtype=np.float32).reshape(1, 80, 3000)
                store.put(_key(), features)
                fetched = store.get(_key())
                assert fetched is not None
                np.testing.assert_array_equal(fetched, features)
                assert fetched.dtype == np.float32

            def test_put_creates_shard_dir(self, tmp_path: Path) -> None:
                """Writing a key whose blob sha starts with `ef` creates `<root>/ef`."""
                root = tmp_path / "audio"
                store = AudioCache(root)
                shard_key = _key(blob_sha="ef" + "0" * 62)
                store.put(shard_key, np.zeros((1,), dtype=np.float32))
                assert (root / "ef").is_dir()

            def test_exists_flips_after_put(self, tmp_path: Path) -> None:
                """`exists` is False before the put and True after it."""
                store = AudioCache(tmp_path / "audio")
                probe = _key()
                assert store.exists(probe) is False
                store.put(probe, np.zeros((1,), dtype=np.float32))
                assert store.exists(probe) is True

            def test_corrupt_file_treated_as_miss(self, tmp_path: Path) -> None:
                """Overwriting the entry with junk bytes makes `get` return None."""
                store = AudioCache(tmp_path / "audio")
                probe = _key()
                store.put(probe, np.zeros((1,), dtype=np.float32))
                # Clobber the file that put() just wrote.
                entry = store.path_for(probe)
                entry.write_bytes(b"not a real npz")
                assert store.get(probe) is None

            def test_clear_removes_tree(self, tmp_path: Path) -> None:
                """After `clear()` the cache root no longer exists on disk."""
                root = tmp_path / "audio"
                store = AudioCache(root)
                store.put(_key(), np.zeros((1,), dtype=np.float32))
                store.clear()
                assert not root.exists()
      
        125
        
        126
        
        127
        class TestProcessorSha256:
            """`processor_sha256` applied to stand-in processor objects."""

            def _make_processor(self, **attrs: object) -> SimpleNamespace:
                """Return a stand-in processor exposing a `feature_extractor`.

                The extractor carries canned constants; `attrs` replace or
                extend them.
                """
                extractor_fields: dict[str, object] = {
                    "sampling_rate": 16_000,
                    "feature_size": 80,
                    "n_fft": 400,
                    "hop_length": 160,
                    "chunk_length": 30,
                    "padding_value": 0.0,
                    "return_attention_mask": True,
                    **attrs,
                }
                extractor = SimpleNamespace(**extractor_fields)
                return SimpleNamespace(feature_extractor=extractor)

            def test_stable_across_calls(self) -> None:
                """Two calls on the same processor return the same value."""
                proc = self._make_processor()
                first = processor_sha256(proc)
                second = processor_sha256(proc)
                assert first == second

            def test_pinned_on_instance(self) -> None:
                """The first result sticks to the instance even if fields change."""
                proc = self._make_processor()
                pinned = processor_sha256(proc)
                # Change a field that would drift the sha on recompute; the
                # pinned value must be returned so repeat calls stay O(1).
                proc.feature_extractor.sampling_rate = 48_000
                assert processor_sha256(proc) == pinned

            def test_different_sample_rate_different_sha(self) -> None:
                """Processors differing only in `sampling_rate` hash differently."""
                base = self._make_processor(sampling_rate=16_000)
                other = self._make_processor(sampling_rate=48_000)
                assert processor_sha256(base) != processor_sha256(other)

            def test_different_n_fft_different_sha(self) -> None:
                """Processors differing only in `n_fft` hash differently."""
                base = self._make_processor(n_fft=400)
                other = self._make_processor(n_fft=1024)
                assert processor_sha256(base) != processor_sha256(other)

            def test_different_hop_length_different_sha(self) -> None:
                """Processors differing only in `hop_length` hash differently."""
                base = self._make_processor(hop_length=160)
                other = self._make_processor(hop_length=320)
                assert processor_sha256(base) != processor_sha256(other)

            def test_different_feature_extractor_class_different_sha(self) -> None:
                """Same constants on two distinct extractor classes hash differently."""

                class FeA:
                    sampling_rate = 16_000
                    feature_size = 80
                    n_fft = 400
                    hop_length = 160

                class FeB:
                    sampling_rate = 16_000
                    feature_size = 80
                    n_fft = 400
                    hop_length = 160

                sha_a = processor_sha256(SimpleNamespace(feature_extractor=FeA()))
                sha_b = processor_sha256(SimpleNamespace(feature_extractor=FeB()))
                assert sha_a != sha_b

            def test_nested_feature_extractor_fields_are_readable(self) -> None:
                """Tuple- and dict-valued extractor fields still give a 64-char sha."""
                extractor = SimpleNamespace(
                    sampling_rate=16_000,
                    feature_size=(80, 2),
                    n_fft=400,
                    hop_length=160,
                    chunk_length={"seconds": 30},
                    padding_value=0.0,
                    return_attention_mask=True,
                )
                digest = processor_sha256(SimpleNamespace(feature_extractor=extractor))
                assert len(digest) == 64

            def test_exotic_feature_field_stringifies_stably(self) -> None:
                """A bare `object()` as a field value still gives a 64-char sha."""
                extractor = SimpleNamespace(
                    sampling_rate=16_000,
                    feature_size=80,
                    n_fft=400,
                    hop_length=160,
                    chunk_length=object(),
                    padding_value=0.0,
                    return_attention_mask=True,
                )
                digest = processor_sha256(SimpleNamespace(feature_extractor=extractor))
                assert len(digest) == 64
      
        215
        
        216
        
        217
        # --- WaveformCache (35.2 deferred-item follow-up) ---------------------------
      
        218
        
        219
        
        220
        def _wkey(**overrides: object) -> WaveformCacheKey:
            """Build a `WaveformCacheKey` from canned field values.

            Any keyword argument replaces the canned value of the same name, or
            is passed through as an extra field, before the key is constructed.
            """
            fields: dict[str, object] = {
                "blob_sha": "a" * 64,
                "sample_rate": 16_000,
                "max_length_ms": 30_000,
                **overrides,
            }
            return WaveformCacheKey(**fields)  # type: ignore[arg-type]
      
        228
        
        229
        
        230
        class TestWaveformCacheKey:
            """Filename and shard derivation of `WaveformCacheKey`."""

            def test_filename_shape(self) -> None:
                """Canned key renders as blob sha, rate, length, then `wav.npz`."""
                expected = ".".join(("a" * 64, "16000", "30000", "wav", "npz"))
                assert _wkey().as_filename() == expected

            def test_shard_is_two_prefix(self) -> None:
                """A blob sha starting with `cd` lands in shard `cd`."""
                blob = "cd" + "0" * 62
                assert _wkey(blob_sha=blob).shard() == "cd"

            def test_different_sample_rate_different_filename(self) -> None:
                """Changing only `sample_rate` changes the filename."""
                names = {_wkey(sample_rate=rate).as_filename() for rate in (16_000, 48_000)}
                assert len(names) == 2

            def test_different_max_length_different_filename(self) -> None:
                """Changing only `max_length_ms` changes the filename."""
                names = {_wkey(max_length_ms=ms).as_filename() for ms in (30_000, 60_000)}
                assert len(names) == 2

            def test_key_no_processor_sha(self) -> None:
                """Waveform cache is pre-processor; key should omit processor_sha."""
                waveform_name = _wkey().as_filename()
                assert "proc" not in waveform_name.lower()
                # The substring check alone cannot fail for the canned keys (the
                # feature-cache filename contains no "proc" either), so also pin
                # that the two key types render different filenames for the same
                # blob_sha / sample_rate / max_length_ms.
                assert waveform_name != _key().as_filename()

            def test_key_is_frozen(self) -> None:
                """Assigning to a field of an existing key raises AttributeError."""
                frozen = _wkey()
                with pytest.raises(AttributeError):
                    frozen.blob_sha = "x" * 64  # type: ignore[misc]

            def test_auto_resample_default_false_absent_from_filename(self) -> None:
                """Without opting in, the filename carries no `.rs` marker."""
                name = _wkey().as_filename()
                assert ".rs" not in name

            def test_auto_resample_true_adds_suffix(self) -> None:
                """Opting in changes the filename and puts `.rs` into it."""
                plain = _wkey(auto_resample=False).as_filename()
                resampled = _wkey(auto_resample=True).as_filename()
                assert plain != resampled
                assert ".rs" in resampled
      
        266
        
        267
        
        268
        class TestWaveformCacheRoundTrip:
            """put / get / exists / clear on a `WaveformCache` rooted in a temp dir."""

            def test_miss_on_empty(self, tmp_path: Path) -> None:
                """A cache that was never written to returns None."""
                store = WaveformCache(tmp_path / "wav")
                assert store.get(_wkey()) is None

            def test_put_then_get(self, tmp_path: Path) -> None:
                """A stored float32 waveform comes back equal, with the same dtype."""
                store = WaveformCache(tmp_path / "wav")
                samples = np.arange(16_000 * 1, dtype=np.float32) / 16_000.0
                store.put(_wkey(), samples)
                fetched = store.get(_wkey())
                assert fetched is not None
                np.testing.assert_array_equal(fetched, samples)
                assert fetched.dtype == np.float32

            def test_put_creates_shard_dir(self, tmp_path: Path) -> None:
                """Writing a key whose blob sha starts with `ef` creates `<root>/ef`."""
                root = tmp_path / "wav"
                store = WaveformCache(root)
                shard_key = _wkey(blob_sha="ef" + "0" * 62)
                store.put(shard_key, np.zeros((1,), dtype=np.float32))
                assert (root / "ef").is_dir()

            def test_exists_flips_after_put(self, tmp_path: Path) -> None:
                """`exists` is False before the put and True after it."""
                store = WaveformCache(tmp_path / "wav")
                probe = _wkey()
                assert store.exists(probe) is False
                store.put(probe, np.zeros((1,), dtype=np.float32))
                assert store.exists(probe) is True

            def test_corrupt_file_treated_as_miss(self, tmp_path: Path) -> None:
                """Overwriting the entry with junk bytes makes `get` return None."""
                store = WaveformCache(tmp_path / "wav")
                probe = _wkey()
                store.put(probe, np.zeros((1,), dtype=np.float32))
                # Clobber the file that put() just wrote.
                entry = store.path_for(probe)
                entry.write_bytes(b"not npz")
                assert store.get(probe) is None

            def test_clear_removes_tree(self, tmp_path: Path) -> None:
                """After `clear()` the cache root no longer exists on disk."""
                root = tmp_path / "wav"
                store = WaveformCache(root)
                store.put(_wkey(), np.zeros((1,), dtype=np.float32))
                store.clear()
                assert not root.exists()
      
        307
        
        308
        
        309
        class TestWaveformAndFeatureCachesDistinct:
            """The two audio caches must not collide on-disk or in-memory."""

            def test_separate_roots_coexist(self, tmp_path: Path) -> None:
                """Each cache, under its own root, writes `.npz` files the other lacks."""
                feature_root = tmp_path / "audio-cache"
                waveform_root = tmp_path / "audio-waveform-cache"
                features = AudioCache(feature_root)
                waveforms = WaveformCache(waveform_root)

                feature_key = AudioCacheKey(
                    blob_sha="a" * 64,
                    processor_sha="b" * 64,
                    sample_rate=16_000,
                    max_length_ms=30_000,
                )
                placeholder = np.zeros((1,), dtype=np.float32)
                features.put(feature_key, placeholder)
                waveforms.put(_wkey(), np.zeros((1,), dtype=np.float32))

                # Neither cache's directory tree touches the other's.
                feat_files = {*feature_root.rglob("*.npz")}
                wave_files = {*waveform_root.rglob("*.npz")}
                assert feat_files
                assert wave_files
                assert feat_files.isdisjoint(wave_files)

1	"""Audio cache — key stability, atomic I/O, processor fingerprint.
2
3	Mirrors `test_vl_cache.py`. Covers:
4
5	- `AudioCacheKey.as_filename` / `shard` are deterministic.
6	- Different sample_rate / max_length_ms / processor_sha produce
7	different filenames.
8	- Round-trip: put → get returns byte-identical array.
9	- Miss on empty store, miss on corrupt file.
10	- `processor_sha256` is stable across repeat calls + pinned on instance.
11	- Different feature-extractor constants drift the sha.
12	"""
13
14	from __future__ import annotations
15
16	from pathlib import Path
17	from types import SimpleNamespace
18
19	import numpy as np
20	import pytest
21
22	from dlm.data.audio_cache import (
23	AudioCache,
24	AudioCacheKey,
25	WaveformCache,
26	WaveformCacheKey,
27	processor_sha256,
28	)
29
30
def _key(**overrides: object) -> AudioCacheKey:
    """Build an `AudioCacheKey` from canned field values.

    Any keyword argument replaces the canned value of the same name, or
    is passed through as an extra field, before the key is constructed.
    """
    fields = {
        "blob_sha": "a" * 64,
        "processor_sha": "b" * 64,
        "sample_rate": 16_000,
        "max_length_ms": 30_000,
        **overrides,
    }
    return AudioCacheKey(**fields)  # type: ignore[arg-type]
40
41
class TestAudioCacheKey:
    """Filename and shard derivation of `AudioCacheKey`."""

    def test_filename_shape(self) -> None:
        """Canned key renders as blob sha, 12 `b` characters, rate, length."""
        expected = ".".join(("a" * 64, "b" * 12, "16000", "30000", "npz"))
        assert _key().as_filename() == expected

    def test_shard_is_two_prefix(self) -> None:
        """A blob sha starting with `cd` lands in shard `cd`."""
        blob = "cd" + "0" * 62
        assert _key(blob_sha=blob).shard() == "cd"

    def test_different_sample_rate_different_filename(self) -> None:
        """Changing only `sample_rate` changes the filename."""
        names = {_key(sample_rate=rate).as_filename() for rate in (16_000, 48_000)}
        assert len(names) == 2

    def test_different_max_length_different_filename(self) -> None:
        """Changing only `max_length_ms` changes the filename."""
        names = {_key(max_length_ms=ms).as_filename() for ms in (30_000, 60_000)}
        assert len(names) == 2

    def test_different_processor_different_filename(self) -> None:
        """Changing only `processor_sha` changes the filename."""
        names = {_key(processor_sha=digit * 64).as_filename() for digit in "12"}
        assert len(names) == 2

    def test_key_is_frozen(self) -> None:
        """Assigning to a field of an existing key raises AttributeError."""
        frozen = _key()
        with pytest.raises(AttributeError):
            frozen.blob_sha = "x" * 64  # type: ignore[misc]

    def test_auto_resample_default_false_absent_from_filename(self) -> None:
        """Default False → filename stays v11-compatible (no `.rs` suffix).

        Backward-compat guard: a cache populated before the
        auto_resample field landed must still hit on the same filename
        when the caller doesn't opt in.
        """
        name = _key().as_filename()
        assert ".rs" not in name

    def test_auto_resample_true_adds_suffix(self) -> None:
        """Opting in changes the filename and puts `.rs` into it."""
        plain = _key(auto_resample=False).as_filename()
        resampled = _key(auto_resample=True).as_filename()
        assert plain != resampled
        assert ".rs" in resampled
84
85
class TestAudioCacheRoundTrip:
    """put / get / exists / clear on an `AudioCache` rooted in a temp dir."""

    def test_miss_on_empty(self, tmp_path: Path) -> None:
        """A cache that was never written to returns None."""
        store = AudioCache(tmp_path / "audio")
        assert store.get(_key()) is None

    def test_put_then_get(self, tmp_path: Path) -> None:
        """A stored float32 array comes back equal, with the same dtype."""
        store = AudioCache(tmp_path / "audio")
        features = np.arange(80 * 3000, dtype=np.float32).reshape(1, 80, 3000)
        store.put(_key(), features)
        fetched = store.get(_key())
        assert fetched is not None
        np.testing.assert_array_equal(fetched, features)
        assert fetched.dtype == np.float32

    def test_put_creates_shard_dir(self, tmp_path: Path) -> None:
        """Writing a key whose blob sha starts with `ef` creates `<root>/ef`."""
        root = tmp_path / "audio"
        store = AudioCache(root)
        shard_key = _key(blob_sha="ef" + "0" * 62)
        store.put(shard_key, np.zeros((1,), dtype=np.float32))
        assert (root / "ef").is_dir()

    def test_exists_flips_after_put(self, tmp_path: Path) -> None:
        """`exists` is False before the put and True after it."""
        store = AudioCache(tmp_path / "audio")
        probe = _key()
        assert store.exists(probe) is False
        store.put(probe, np.zeros((1,), dtype=np.float32))
        assert store.exists(probe) is True

    def test_corrupt_file_treated_as_miss(self, tmp_path: Path) -> None:
        """Overwriting the entry with junk bytes makes `get` return None."""
        store = AudioCache(tmp_path / "audio")
        probe = _key()
        store.put(probe, np.zeros((1,), dtype=np.float32))
        # Clobber the file that put() just wrote.
        entry = store.path_for(probe)
        entry.write_bytes(b"not a real npz")
        assert store.get(probe) is None

    def test_clear_removes_tree(self, tmp_path: Path) -> None:
        """After `clear()` the cache root no longer exists on disk."""
        root = tmp_path / "audio"
        store = AudioCache(root)
        store.put(_key(), np.zeros((1,), dtype=np.float32))
        store.clear()
        assert not root.exists()
125
126
class TestProcessorSha256:
    """`processor_sha256` applied to stand-in processor objects."""

    def _make_processor(self, **attrs: object) -> SimpleNamespace:
        """Return a stand-in processor exposing a `feature_extractor`.

        The extractor carries canned constants; `attrs` replace or
        extend them.
        """
        extractor_fields: dict[str, object] = {
            "sampling_rate": 16_000,
            "feature_size": 80,
            "n_fft": 400,
            "hop_length": 160,
            "chunk_length": 30,
            "padding_value": 0.0,
            "return_attention_mask": True,
            **attrs,
        }
        extractor = SimpleNamespace(**extractor_fields)
        return SimpleNamespace(feature_extractor=extractor)

    def test_stable_across_calls(self) -> None:
        """Two calls on the same processor return the same value."""
        proc = self._make_processor()
        first = processor_sha256(proc)
        second = processor_sha256(proc)
        assert first == second

    def test_pinned_on_instance(self) -> None:
        """The first result sticks to the instance even if fields change."""
        proc = self._make_processor()
        pinned = processor_sha256(proc)
        # Change a field that would drift the sha on recompute; the
        # pinned value must be returned so repeat calls stay O(1).
        proc.feature_extractor.sampling_rate = 48_000
        assert processor_sha256(proc) == pinned

    def test_different_sample_rate_different_sha(self) -> None:
        """Processors differing only in `sampling_rate` hash differently."""
        base = self._make_processor(sampling_rate=16_000)
        other = self._make_processor(sampling_rate=48_000)
        assert processor_sha256(base) != processor_sha256(other)

    def test_different_n_fft_different_sha(self) -> None:
        """Processors differing only in `n_fft` hash differently."""
        base = self._make_processor(n_fft=400)
        other = self._make_processor(n_fft=1024)
        assert processor_sha256(base) != processor_sha256(other)

    def test_different_hop_length_different_sha(self) -> None:
        """Processors differing only in `hop_length` hash differently."""
        base = self._make_processor(hop_length=160)
        other = self._make_processor(hop_length=320)
        assert processor_sha256(base) != processor_sha256(other)

    def test_different_feature_extractor_class_different_sha(self) -> None:
        """Same constants on two distinct extractor classes hash differently."""

        class FeA:
            sampling_rate = 16_000
            feature_size = 80
            n_fft = 400
            hop_length = 160

        class FeB:
            sampling_rate = 16_000
            feature_size = 80
            n_fft = 400
            hop_length = 160

        sha_a = processor_sha256(SimpleNamespace(feature_extractor=FeA()))
        sha_b = processor_sha256(SimpleNamespace(feature_extractor=FeB()))
        assert sha_a != sha_b

    def test_nested_feature_extractor_fields_are_readable(self) -> None:
        """Tuple- and dict-valued extractor fields still give a 64-char sha."""
        extractor = SimpleNamespace(
            sampling_rate=16_000,
            feature_size=(80, 2),
            n_fft=400,
            hop_length=160,
            chunk_length={"seconds": 30},
            padding_value=0.0,
            return_attention_mask=True,
        )
        digest = processor_sha256(SimpleNamespace(feature_extractor=extractor))
        assert len(digest) == 64

    def test_exotic_feature_field_stringifies_stably(self) -> None:
        """A bare `object()` as a field value still gives a 64-char sha."""
        extractor = SimpleNamespace(
            sampling_rate=16_000,
            feature_size=80,
            n_fft=400,
            hop_length=160,
            chunk_length=object(),
            padding_value=0.0,
            return_attention_mask=True,
        )
        digest = processor_sha256(SimpleNamespace(feature_extractor=extractor))
        assert len(digest) == 64
215
216
217	# --- WaveformCache (35.2 deferred-item follow-up) ---------------------------
218
219
def _wkey(**overrides: object) -> WaveformCacheKey:
    """Build a `WaveformCacheKey` from canned field values.

    Any keyword argument replaces the canned value of the same name, or
    is passed through as an extra field, before the key is constructed.
    """
    fields: dict[str, object] = {
        "blob_sha": "a" * 64,
        "sample_rate": 16_000,
        "max_length_ms": 30_000,
        **overrides,
    }
    return WaveformCacheKey(**fields)  # type: ignore[arg-type]
228
229
class TestWaveformCacheKey:
    """Filename and shard derivation of `WaveformCacheKey`."""

    def test_filename_shape(self) -> None:
        """Canned key renders as blob sha, rate, length, then `wav.npz`."""
        expected = ".".join(("a" * 64, "16000", "30000", "wav", "npz"))
        assert _wkey().as_filename() == expected

    def test_shard_is_two_prefix(self) -> None:
        """A blob sha starting with `cd` lands in shard `cd`."""
        blob = "cd" + "0" * 62
        assert _wkey(blob_sha=blob).shard() == "cd"

    def test_different_sample_rate_different_filename(self) -> None:
        """Changing only `sample_rate` changes the filename."""
        names = {_wkey(sample_rate=rate).as_filename() for rate in (16_000, 48_000)}
        assert len(names) == 2

    def test_different_max_length_different_filename(self) -> None:
        """Changing only `max_length_ms` changes the filename."""
        names = {_wkey(max_length_ms=ms).as_filename() for ms in (30_000, 60_000)}
        assert len(names) == 2

    def test_key_no_processor_sha(self) -> None:
        """Waveform cache is pre-processor; key should omit processor_sha."""
        waveform_name = _wkey().as_filename()
        assert "proc" not in waveform_name.lower()
        # The substring check alone cannot fail for the canned keys (the
        # feature-cache filename contains no "proc" either), so also pin
        # that the two key types render different filenames for the same
        # blob_sha / sample_rate / max_length_ms.
        assert waveform_name != _key().as_filename()

    def test_key_is_frozen(self) -> None:
        """Assigning to a field of an existing key raises AttributeError."""
        frozen = _wkey()
        with pytest.raises(AttributeError):
            frozen.blob_sha = "x" * 64  # type: ignore[misc]

    def test_auto_resample_default_false_absent_from_filename(self) -> None:
        """Without opting in, the filename carries no `.rs` marker."""
        name = _wkey().as_filename()
        assert ".rs" not in name

    def test_auto_resample_true_adds_suffix(self) -> None:
        """Opting in changes the filename and puts `.rs` into it."""
        plain = _wkey(auto_resample=False).as_filename()
        resampled = _wkey(auto_resample=True).as_filename()
        assert plain != resampled
        assert ".rs" in resampled
266
267
class TestWaveformCacheRoundTrip:
    """put / get / exists / clear on a `WaveformCache` rooted in a temp dir."""

    def test_miss_on_empty(self, tmp_path: Path) -> None:
        """A cache that was never written to returns None."""
        store = WaveformCache(tmp_path / "wav")
        assert store.get(_wkey()) is None

    def test_put_then_get(self, tmp_path: Path) -> None:
        """A stored float32 waveform comes back equal, with the same dtype."""
        store = WaveformCache(tmp_path / "wav")
        samples = np.arange(16_000 * 1, dtype=np.float32) / 16_000.0
        store.put(_wkey(), samples)
        fetched = store.get(_wkey())
        assert fetched is not None
        np.testing.assert_array_equal(fetched, samples)
        assert fetched.dtype == np.float32

    def test_put_creates_shard_dir(self, tmp_path: Path) -> None:
        """Writing a key whose blob sha starts with `ef` creates `<root>/ef`."""
        root = tmp_path / "wav"
        store = WaveformCache(root)
        shard_key = _wkey(blob_sha="ef" + "0" * 62)
        store.put(shard_key, np.zeros((1,), dtype=np.float32))
        assert (root / "ef").is_dir()

    def test_exists_flips_after_put(self, tmp_path: Path) -> None:
        """`exists` is False before the put and True after it."""
        store = WaveformCache(tmp_path / "wav")
        probe = _wkey()
        assert store.exists(probe) is False
        store.put(probe, np.zeros((1,), dtype=np.float32))
        assert store.exists(probe) is True

    def test_corrupt_file_treated_as_miss(self, tmp_path: Path) -> None:
        """Overwriting the entry with junk bytes makes `get` return None."""
        store = WaveformCache(tmp_path / "wav")
        probe = _wkey()
        store.put(probe, np.zeros((1,), dtype=np.float32))
        # Clobber the file that put() just wrote.
        entry = store.path_for(probe)
        entry.write_bytes(b"not npz")
        assert store.get(probe) is None

    def test_clear_removes_tree(self, tmp_path: Path) -> None:
        """After `clear()` the cache root no longer exists on disk."""
        root = tmp_path / "wav"
        store = WaveformCache(root)
        store.put(_wkey(), np.zeros((1,), dtype=np.float32))
        store.clear()
        assert not root.exists()
307
308
class TestWaveformAndFeatureCachesDistinct:
    """The two audio caches must not collide on-disk or in-memory."""

    def test_separate_roots_coexist(self, tmp_path: Path) -> None:
        """Each cache, under its own root, writes `.npz` files the other lacks."""
        feature_root = tmp_path / "audio-cache"
        waveform_root = tmp_path / "audio-waveform-cache"
        features = AudioCache(feature_root)
        waveforms = WaveformCache(waveform_root)

        feature_key = AudioCacheKey(
            blob_sha="a" * 64,
            processor_sha="b" * 64,
            sample_rate=16_000,
            max_length_ms=30_000,
        )
        placeholder = np.zeros((1,), dtype=np.float32)
        features.put(feature_key, placeholder)
        waveforms.put(_wkey(), np.zeros((1,), dtype=np.float32))

        # Neither cache's directory tree touches the other's.
        feat_files = {*feature_root.rglob("*.npz")}
        wave_files = {*waveform_root.rglob("*.npz")}
        assert feat_files
        assert wave_files
        assert feat_files.isdisjoint(wave_files)