"""CacheKey + tokenizer_sha256 — determinism + sensitivity."""

from __future__ import annotations

import pytest

from dlm.directives.cache_key import CacheKey, tokenizer_sha256
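

# A minimal sketch of the filename scheme these tests pin down. It is inferred
# from the assertions below, not imported from dlm: a 16-hex section id, the
# first 12 hex chars of the tokenizer sha, and the sequence length.
def _expected_filename(section_id: str, tokenizer_sha: str, sequence_len: int) -> str:
    return f"{section_id}.{tokenizer_sha[:12]}.seq{sequence_len}.npz"
# e.g. _expected_filename("ab12cd34ef567890", "a" * 64, 2048)
#      == "ab12cd34ef567890.aaaaaaaaaaaa.seq2048.npz"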


class TestCacheKey:
    def test_as_filename_is_deterministic(self) -> None:
        key = CacheKey(
            section_id="ab12cd34ef567890",
            tokenizer_sha="a" * 64,
            sequence_len=2048,
        )
        assert key.as_filename() == "ab12cd34ef567890.aaaaaaaaaaaa.seq2048.npz"

    def test_shard_is_first_two_hex(self) -> None:
        key = CacheKey(section_id="ff12cd34ef567890", tokenizer_sha="a" * 64, sequence_len=1024)
        assert key.shard() == "ff"
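
    # shard() and as_filename() presumably compose into a two-level on-disk
    # layout, e.g. cache/ff/ff12cd34ef567890.aaaaaaaaaaaa.seq1024.npz; this is
    # illustrative only, as the actual path joining lives outside this module.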

    @pytest.mark.parametrize(
        ("a", "b"),
        [
            # section_id changes → different filename
            (
                CacheKey("ab12cd34ef567890", "a" * 64, 2048),
                CacheKey("ab12cd34ef567891", "a" * 64, 2048),
            ),
            # tokenizer_sha changes → different filename
            (
                CacheKey("ab12cd34ef567890", "a" * 64, 2048),
                CacheKey("ab12cd34ef567890", "b" * 64, 2048),
            ),
            # sequence_len changes → different filename
            (
                CacheKey("ab12cd34ef567890", "a" * 64, 2048),
                CacheKey("ab12cd34ef567890", "a" * 64, 1024),
            ),
        ],
    )
    def test_each_input_changes_filename(self, a: CacheKey, b: CacheKey) -> None:
        assert a.as_filename() != b.as_filename()
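

# The fakes below model only the attribute surface tokenizer_sha256 is assumed
# to touch: a fast path through `backend_tokenizer.to_str()` (the interface a
# HuggingFace fast tokenizer exposes) and a legacy fallback over plain
# attributes, so no real tokenizer needs to be loaded.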
class _FakeBackendTokenizer:
    """Stand-in for `tokenizer.backend_tokenizer` with `to_str`."""

    def __init__(self, canonical: str) -> None:
        self._canonical = canonical

    def to_str(self) -> str:
        return self._canonical


class _BrokenBackendTokenizer:
    def to_str(self) -> str:
        raise RuntimeError("boom")


class _FakeTokenizer:
    """Minimal shape for tokenizer_sha256 — just enough attrs."""

    def __init__(self, *, canonical: str | None = None, vocab_size: int = 32000) -> None:
        self.backend_tokenizer: object | None = (
            _FakeBackendTokenizer(canonical) if canonical else None
        )
        self.vocab_size = vocab_size
        self.model_max_length = 2048
        self.pad_token = "<pad>"
        self.eos_token = "</s>"
        self.bos_token = "<s>"
        self.unk_token = "<unk>"
        self.cls_token = ""
        self.sep_token = ""
        self.mask_token = ""
        self.added_tokens_decoder: dict[int, str] = {}
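

# For orientation, a runnable sketch of the contract the tests below exercise.
# Everything here is assumed, not the dlm implementation: the pin attribute
# name, the legacy field set, and the exact hash material are made up; only
# the resolution order (pinned, then fast path, then legacy fallback) matters.
def _sketch_tokenizer_sha256(tok) -> str:
    import hashlib

    pinned = getattr(tok, "_pinned_sha", None)  # hypothetical pin attribute
    if pinned is not None:
        return pinned  # a previously computed sha wins over any mutation
    material: str | None = None
    backend = getattr(tok, "backend_tokenizer", None)
    if backend is not None:
        try:
            material = backend.to_str()  # fast path: canonical serialization
        except Exception:
            material = None  # broken backend falls through to legacy
    if material is None:
        # legacy fallback: fingerprint plain attributes instead
        material = repr((tok.vocab_size, tok.model_max_length, tok.pad_token))
    sha = hashlib.sha256(material.encode("utf-8")).hexdigest()
    tok._pinned_sha = sha  # pin so later mutations cannot change the key
    return sha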


class TestTokenizerSha256:
    def test_fast_tokenizer_path(self) -> None:
        tok = _FakeTokenizer(canonical='{"type": "BPE", "vocab": {"a": 1}}')
        sha = tokenizer_sha256(tok)
        assert len(sha) == 64
        # Deterministic: same input → same sha
        assert tokenizer_sha256(tok) == sha

    def test_canonical_change_flips_sha(self) -> None:
        tok_a = _FakeTokenizer(canonical='{"v": 1}')
        tok_b = _FakeTokenizer(canonical='{"v": 2}')
        assert tokenizer_sha256(tok_a) != tokenizer_sha256(tok_b)

    def test_legacy_path_when_no_backend(self) -> None:
        tok = _FakeTokenizer()  # no backend_tokenizer
        sha = tokenizer_sha256(tok)
        assert len(sha) == 64

    def test_legacy_vocab_change_flips_sha(self) -> None:
        tok_a = _FakeTokenizer(vocab_size=32000)
        tok_b = _FakeTokenizer(vocab_size=64000)
        assert tokenizer_sha256(tok_a) != tokenizer_sha256(tok_b)

    def test_pinned_on_instance(self) -> None:
        tok = _FakeTokenizer(canonical='{"v": 1}')
        sha1 = tokenizer_sha256(tok)
        # Swap canonical underneath — pinned value persists
        tok.backend_tokenizer = _FakeBackendTokenizer('{"v": 2}')
        sha2 = tokenizer_sha256(tok)
        assert sha1 == sha2

    def test_backend_to_str_failure_falls_back_to_legacy(self) -> None:
        tok = _FakeTokenizer()
        tok.backend_tokenizer = _BrokenBackendTokenizer()
        sha = tokenizer_sha256(tok)
        assert len(sha) == 64