tenseleyflow/documentlanguagemodel / f7417d8

Browse files

test(data.tokenizer_contract): BPE/SentencePiece/pad-fallback predicate matrix

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
f7417d89e8fc6f043526d0019ebe880b1b81499a
Parents
67fe71a
Tree
ca481e4

1 changed file

Status | File | + | -
A tests/unit/data/test_tokenizer_contract.py 84 0
tests/unit/data/test_tokenizer_contract.py (added)
@@ -0,0 +1,84 @@
1
+"""`tokenizer_grew` + `modules_to_save_for_growth` — canonical Sprint 12b contract."""
2
+
3
+from __future__ import annotations
4
+
5
+from dataclasses import dataclass, field
6
+
7
+import pytest
8
+
9
+from dlm.data.tokenizer_contract import modules_to_save_for_growth, tokenizer_grew
10
+
11
+
12
+@dataclass
13
+class _FakeTokenizer:
14
+    """Minimal stand-in: matches the two methods the predicate touches.
15
+
16
+    Real tokenizers (BPE / SentencePiece / Unigram) expose the same
17
+    two-method surface — `vocab_size` property + `get_added_vocab()`
18
+    returning a `dict[str, int]`. Faking it is safe.
19
+    """
20
+
21
+    vocab_size: int
22
+    added: dict[str, int] = field(default_factory=dict)
23
+
24
+    def get_added_vocab(self) -> dict[str, int]:
25
+        return dict(self.added)
26
+
27
+
28
class TestTokenizerGrew:
    def test_identical_tokenizers_not_grown(self) -> None:
        """Equal vocab size and equal added-token sets report no growth."""
        before = _FakeTokenizer(vocab_size=32000, added={"<|im_end|>": 32001})
        after = _FakeTokenizer(vocab_size=32000, added={"<|im_end|>": 32001})
        assert tokenizer_grew(before, after) is False

    def test_vocab_size_change_detected(self) -> None:
        """Sprint 07 pad-fallback path: `add_special_tokens` bumps vocab_size."""
        before = _FakeTokenizer(vocab_size=32000)
        after = _FakeTokenizer(vocab_size=32001, added={"<|pad|>": 32000})
        assert tokenizer_grew(before, after) is True

    def test_added_token_set_change_detected(self) -> None:
        """Edge case: vocab size unchanged while the added-token set differs."""
        before = _FakeTokenizer(vocab_size=32001, added={"<|a|>": 32000})
        after = _FakeTokenizer(vocab_size=32001, added={"<|b|>": 32000})
        assert tokenizer_grew(before, after) is True

    def test_bpe_like_qwen_shape(self) -> None:
        """Qwen-style BPE: large vocab plus a few added specials — no growth."""
        before = _FakeTokenizer(
            vocab_size=151936,
            added={"<|im_start|>": 151644, "<|im_end|>": 151645},
        )
        after = _FakeTokenizer(
            vocab_size=151936,
            added={"<|im_start|>": 151644, "<|im_end|>": 151645},
        )
        assert tokenizer_grew(before, after) is False

    def test_sentencepiece_like_llama_shape(self) -> None:
        """Llama-family SentencePiece: a one-token vocab bump is growth."""
        before = _FakeTokenizer(vocab_size=128000, added={"<|begin_of_text|>": 128000})
        after = _FakeTokenizer(vocab_size=128001, added={"<|begin_of_text|>": 128000})
        assert tokenizer_grew(before, after) is True

    def test_pad_fallback_case(self) -> None:
        """Canonical Sprint 07 pad-fallback flow: vocab grew by exactly one."""
        before = _FakeTokenizer(vocab_size=49152)
        after = _FakeTokenizer(vocab_size=49153, added={"<|pad|>": 49152})
        assert tokenizer_grew(before, after) is True
69
+
70
+
71
class TestModulesToSave:
    def test_grown_returns_embed_and_lm_head(self) -> None:
        """Growth requires fully saving both embedding-side modules."""
        assert modules_to_save_for_growth(True) == ["embed_tokens", "lm_head"]

    def test_unchanged_returns_empty(self) -> None:
        """No growth means nothing extra needs saving."""
        assert modules_to_save_for_growth(False) == []

    @pytest.mark.parametrize("grew", [True, False])
    def test_returns_new_list_each_call(self, grew: bool) -> None:
        """Callers mutate the returned list; calls must not share state."""
        one = modules_to_save_for_growth(grew)
        two = modules_to_save_for_growth(grew)
        one.append("extra")
        assert "extra" not in two