1 """CacheKey + tokenizer_sha256 — determinism + sensitivity."""

from __future__ import annotations

import pytest

from dlm.directives.cache_key import CacheKey, tokenizer_sha256


class TestCacheKey:
    def test_as_filename_is_deterministic(self) -> None:
        key = CacheKey(
            section_id="ab12cd34ef567890",
            tokenizer_sha="a" * 64,
            sequence_len=2048,
        )
        assert key.as_filename() == "ab12cd34ef567890.aaaaaaaaaaaa.seq2048.npz"

    def test_shard_is_first_two_hex(self) -> None:
        key = CacheKey(section_id="ff12cd34ef567890", tokenizer_sha="a" * 64, sequence_len=1024)
        assert key.shard() == "ff"

    @pytest.mark.parametrize(
        ("a", "b"),
        [
            # section_id changes → different filename
            (
                CacheKey("ab12cd34ef567890", "a" * 64, 2048),
                CacheKey("ab12cd34ef567891", "a" * 64, 2048),
            ),
            # tokenizer_sha changes → different filename
            (
                CacheKey("ab12cd34ef567890", "a" * 64, 2048),
                CacheKey("ab12cd34ef567890", "b" * 64, 2048),
            ),
            # sequence_len changes → different filename
            (
                CacheKey("ab12cd34ef567890", "a" * 64, 2048),
                CacheKey("ab12cd34ef567890", "a" * 64, 1024),
            ),
        ],
    )
    def test_each_input_changes_filename(self, a: CacheKey, b: CacheKey) -> None:
        assert a.as_filename() != b.as_filename()

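
# For reference, a minimal sketch of the CacheKey shape these tests pin
# down. It is inferred from the assertions above, not copied from
# dlm.directives.cache_key, so treat it as an assumption: the 12-char sha
# truncation comes from test_as_filename_is_deterministic and the shard
# rule from test_shard_is_first_two_hex.
#
#     @dataclass(frozen=True)
#     class CacheKey:
#         section_id: str
#         tokenizer_sha: str
#         sequence_len: int
#
#         def as_filename(self) -> str:
#             sha12 = self.tokenizer_sha[:12]
#             return f"{self.section_id}.{sha12}.seq{self.sequence_len}.npz"
#
#         def shard(self) -> str:
#             return self.section_id[:2]
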
class _FakeBackendTokenizer:
    """Stand-in for `tokenizer.backend_tokenizer` exposing `to_str`."""

    def __init__(self, canonical: str) -> None:
        self._canonical = canonical

    def to_str(self) -> str:
        return self._canonical


class _BrokenBackendTokenizer:
    """Backend whose serialization always fails, to exercise the fallback."""

    def to_str(self) -> str:
        raise RuntimeError("boom")


class _FakeTokenizer:
    """Minimal stand-in exposing just the attributes tokenizer_sha256 reads."""

    def __init__(self, *, canonical: str | None = None, vocab_size: int = 32000) -> None:
        # Use an explicit None check so an empty canonical string still
        # produces a backend rather than silently falling to the legacy path.
        self.backend_tokenizer: object | None = (
            _FakeBackendTokenizer(canonical) if canonical is not None else None
        )
        self.vocab_size = vocab_size
        self.model_max_length = 2048
        self.pad_token = "<pad>"
        self.eos_token = "</s>"
        self.bos_token = "<s>"
        self.unk_token = "<unk>"
        self.cls_token = ""
        self.sep_token = ""
        self.mask_token = ""
        self.added_tokens_decoder: dict[int, str] = {}

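
# The fakes above model the two paths tokenizer_sha256 is assumed to take:
# hash backend_tokenizer.to_str() when a fast backend is available, else
# digest the legacy attributes (vocab_size, special tokens, and so on).
# The exact legacy field set is an inference from _FakeTokenizer, not a
# documented contract; see the reference sketch at the bottom of this file.
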
class TestTokenizerSha256:
    def test_fast_tokenizer_path(self) -> None:
        tok = _FakeTokenizer(canonical='{"type": "BPE", "vocab": {"a": 1}}')
        sha = tokenizer_sha256(tok)
        assert len(sha) == 64
        # Deterministic: same input → same sha
        assert tokenizer_sha256(tok) == sha

    def test_canonical_change_flips_sha(self) -> None:
        tok_a = _FakeTokenizer(canonical='{"v": 1}')
        tok_b = _FakeTokenizer(canonical='{"v": 2}')
        assert tokenizer_sha256(tok_a) != tokenizer_sha256(tok_b)

    def test_legacy_path_when_no_backend(self) -> None:
        tok = _FakeTokenizer()  # no backend_tokenizer
        sha = tokenizer_sha256(tok)
        assert len(sha) == 64

    def test_legacy_vocab_change_flips_sha(self) -> None:
        tok_a = _FakeTokenizer(vocab_size=32000)
        tok_b = _FakeTokenizer(vocab_size=64000)
        assert tokenizer_sha256(tok_a) != tokenizer_sha256(tok_b)

    def test_pinned_on_instance(self) -> None:
        tok = _FakeTokenizer(canonical='{"v": 1}')
        sha1 = tokenizer_sha256(tok)
        # Swap the canonical payload underneath; the pinned value persists.
        tok.backend_tokenizer = _FakeBackendTokenizer('{"v": 2}')
        sha2 = tokenizer_sha256(tok)
        assert sha1 == sha2

    def test_backend_to_str_failure_falls_back_to_legacy(self) -> None:
        tok = _FakeTokenizer()
        tok.backend_tokenizer = _BrokenBackendTokenizer()
        sha = tokenizer_sha256(tok)
        assert len(sha) == 64
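

# A minimal reference sketch of the contract the tests above pin down,
# assuming the behaviour they imply: prefer hashing backend_tokenizer.to_str(),
# fall back to a digest over the legacy attributes when the backend is missing
# or to_str() raises, and pin the result on the instance so later mutations do
# not change it. The helper name, the pin attribute name (_tokenizer_sha256),
# and the exact legacy field set are hypothetical; the real implementation in
# dlm.directives.cache_key is the source of truth.
def _reference_tokenizer_sha256(tok: object) -> str:
    import hashlib
    import json

    # Pinned value wins: once computed, the sha sticks to the instance.
    pinned = getattr(tok, "_tokenizer_sha256", None)
    if pinned is not None:
        return pinned

    payload: str | None = None
    backend = getattr(tok, "backend_tokenizer", None)
    if backend is not None:
        try:
            # Fast path: the canonical serialized tokenizer definition.
            payload = backend.to_str()
        except Exception:
            payload = None
    if payload is None:
        # Legacy path: digest a stable JSON view of the attributes the
        # fakes above expose.
        payload = json.dumps(
            {
                "vocab_size": getattr(tok, "vocab_size", None),
                "model_max_length": getattr(tok, "model_max_length", None),
                "pad_token": getattr(tok, "pad_token", None),
                "eos_token": getattr(tok, "eos_token", None),
                "bos_token": getattr(tok, "bos_token", None),
                "unk_token": getattr(tok, "unk_token", None),
                "cls_token": getattr(tok, "cls_token", None),
                "sep_token": getattr(tok, "sep_token", None),
                "mask_token": getattr(tok, "mask_token", None),
                "added_tokens": sorted(
                    getattr(tok, "added_tokens_decoder", {}).items()
                ),
            },
            sort_keys=True,
        )

    sha = hashlib.sha256(payload.encode("utf-8")).hexdigest()
    tok._tokenizer_sha256 = sha  # type: ignore[attr-defined]  # pin on instance
    return sha
    # Usage: _reference_tokenizer_sha256(_FakeTokenizer(canonical='{"v": 1}'))
    # returns the same 64-char hex digest on every call for that instance.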