@@ -0,0 +1,84 @@
| 1 | +"""`tokenizer_grew` + `modules_to_save_for_growth` — canonical Sprint 12b contract.""" |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +from dataclasses import dataclass, field |
| 6 | + |
| 7 | +import pytest |
| 8 | + |
| 9 | +from dlm.data.tokenizer_contract import modules_to_save_for_growth, tokenizer_grew |
| 10 | + |
| 11 | + |
| 12 | +@dataclass |
| 13 | +class _FakeTokenizer: |
+    """Minimal stand-in: exposes the two members the predicate touches.
+
+    Real tokenizers (BPE / SentencePiece / Unigram) expose the same
+    two-member surface: a `vocab_size` attribute plus `get_added_vocab()`
+    returning a `dict[str, int]`. Faking them here is safe.
+    """
+
+    vocab_size: int
+    added: dict[str, int] = field(default_factory=dict)
+
+    def get_added_vocab(self) -> dict[str, int]:
+        return dict(self.added)
+
+
+class TestTokenizerGrew:
+    def test_identical_tokenizers_not_grown(self) -> None:
+        a = _FakeTokenizer(vocab_size=32000, added={"<|im_end|>": 32000})
+        b = _FakeTokenizer(vocab_size=32000, added={"<|im_end|>": 32000})
+        assert tokenizer_grew(a, b) is False
+
+    def test_vocab_size_change_detected(self) -> None:
+        """Sprint 07 pad-fallback path: `add_special_tokens` bumps vocab_size."""
+        base = _FakeTokenizer(vocab_size=32000)
+        final = _FakeTokenizer(vocab_size=32001, added={"<|pad|>": 32000})
+        assert tokenizer_grew(base, final) is True
+
+    def test_added_token_set_change_detected(self) -> None:
+        """Rare: vocab size identical, but the added-tokens set differs."""
+        base = _FakeTokenizer(vocab_size=32001, added={"<|a|>": 32000})
+        final = _FakeTokenizer(vocab_size=32001, added={"<|b|>": 32000})
+        assert tokenizer_grew(base, final) is True
+
+    def test_bpe_like_qwen_shape(self) -> None:
+        """Qwen-style BPE: large vocab + a handful of added specials."""
+        base = _FakeTokenizer(
+            vocab_size=151936,
+            added={"<|im_start|>": 151644, "<|im_end|>": 151645},
+        )
+        final = _FakeTokenizer(
+            vocab_size=151936,
+            added={"<|im_start|>": 151644, "<|im_end|>": 151645},
+        )
+        assert tokenizer_grew(base, final) is False
+
+    def test_bpe_like_llama3_shape(self) -> None:
+        """Llama-3-family tiktoken BPE: 128k base vocab, growth by one detected."""
+        base = _FakeTokenizer(vocab_size=128000, added={"<|begin_of_text|>": 128000})
+        final = _FakeTokenizer(vocab_size=128001, added={"<|begin_of_text|>": 128000})
+        assert tokenizer_grew(base, final) is True
+
+    def test_pad_fallback_case(self) -> None:
+        """Canonical Sprint 07 pad-fallback flow: vocab grew by exactly one."""
+        base = _FakeTokenizer(vocab_size=49152)
+        final = _FakeTokenizer(vocab_size=49153, added={"<|pad|>": 49152})
+        assert tokenizer_grew(base, final) is True
+
+
+class TestModulesToSave:
+    def test_grown_returns_embed_and_lm_head(self) -> None:
+        assert modules_to_save_for_growth(True) == ["embed_tokens", "lm_head"]
+
+    def test_unchanged_returns_empty(self) -> None:
+        assert modules_to_save_for_growth(False) == []
+
+    @pytest.mark.parametrize("grew", [True, False])
+    def test_returns_new_list_each_call(self, grew: bool) -> None:
+        """Callers mutate the returned list; must not share state."""
+        first = modules_to_save_for_growth(grew)
+        second = modules_to_save_for_growth(grew)
+        first.append("extra")
+        assert "extra" not in second
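
For reference, a minimal sketch of the predicate these tests pin down. The real implementation lives in `dlm.data.tokenizer_contract` and is not part of this diff; the body below is an assumption reconstructed from the test contract, not the shipped code.

# Hypothetical reconstruction -- dlm.data.tokenizer_contract is not in this diff.
def tokenizer_grew(base, final) -> bool:
    """True if the tokenizer's vocab grew or its added-token set changed."""
    # Two independent signals, matching the tests above: a bumped vocab_size
    # (the Sprint 07 pad-fallback path) or a differing added-vocab mapping
    # at identical vocab_size (the "rare" case).
    return (
        final.vocab_size != base.vocab_size
        or final.get_added_vocab() != base.get_added_vocab()
    )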
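
Likewise for the helper exercised by `TestModulesToSave`. The list contents and the fresh-list-per-call behavior are both pinned by the tests; this is an assumed sketch, not the shipped implementation, with `embed_tokens` and `lm_head` taken straight from the test expectations.

# Hypothetical reconstruction -- not the shipped implementation.
def modules_to_save_for_growth(grew: bool) -> list[str]:
    """Modules to fully train (e.g. via PEFT's modules_to_save) after vocab growth."""
    # Build a fresh list on every call so callers can mutate the result
    # without leaking state into later calls.
    return ["embed_tokens", "lm_head"] if grew else []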