fix(export): bound GGUF metadata string length to defend against OOM bombs (audit-04 F2)
- SHA: e7925ff1a528df22553f905702f7ae21a4aea18f
- Parents: 0e0a262
- Tree: 9940eea

| Status | File | + | - |
|---|---|---|---|
| M | src/dlm/export/tokenizer_sync.py | 14 | 0 |
| M | tests/unit/export/test_tokenizer_sync.py | 38 | 0 |
src/dlm/export/tokenizer_sync.py (modified)
@@ -38,6 +38,12 @@ if TYPE_CHECKING:
| 38 | 38 | |
| 39 | 39 | _GGUF_MAGIC: Final[bytes] = b"GGUF" |
| 40 | 40 | |
| 41 | +# Upper bound on a GGUF metadata string — chosen wildly larger than any | |
| 42 | +# credible real value (tokens are ≤ a few hundred bytes; chat templates | |
| 43 | +# run tens of KB at most) but small enough to reject a crafted GGUF that | |
| 44 | +# claims a multi-GB string and drives `f.read(length)` into OOM. | |
| 45 | +_MAX_STRING_BYTES: Final[int] = 16 * 1024 * 1024 | |
| 46 | + | |
| 41 | 47 | # GGUF value types per llama.cpp's gguf spec (stable v2+v3). |
| 42 | 48 | _TYPE_UINT8: Final[int] = 0 |
| 43 | 49 | _TYPE_INT8: Final[int] = 1 |
@@ -198,6 +204,8 @@ def _read_u64(f: Any) -> int:
| 198 | 204 | |
| 199 | 205 | def _read_string(f: Any) -> str: |
| 200 | 206 | length = _read_u64(f) |
| 207 | + if length > _MAX_STRING_BYTES: | |
| 208 | + raise struct.error(f"GGUF string length {length} exceeds bound {_MAX_STRING_BYTES}") | |
| 201 | 209 | raw = f.read(length) |
| 202 | 210 | if len(raw) != length: |
| 203 | 211 | raise struct.error("short read in string") |
@@ -211,6 +219,8 @@ def _skip_value(f: Any, value_type: int) -> None:
| 211 | 219 | return |
| 212 | 220 | if value_type == _TYPE_STRING: |
| 213 | 221 | length = _read_u64(f) |
| 222 | + if length > _MAX_STRING_BYTES: | |
| 223 | + raise struct.error(f"GGUF string length {length} exceeds bound {_MAX_STRING_BYTES}") | |
| 214 | 224 | f.seek(length, 1) |
| 215 | 225 | return |
| 216 | 226 | if value_type == _TYPE_ARRAY: |
@@ -222,6 +232,10 @@ def _skip_value(f: Any, value_type: int) -> None:
| 222 | 232 | if elem_type == _TYPE_STRING: |
| 223 | 233 | for _ in range(count): |
| 224 | 234 | length = _read_u64(f) |
| 235 | + if length > _MAX_STRING_BYTES: | |
| 236 | + raise struct.error( | |
| 237 | + f"GGUF string length {length} exceeds bound {_MAX_STRING_BYTES}" | |
| 238 | + ) | |
| 225 | 239 | f.seek(length, 1) |
| 226 | 240 | return |
| 227 | 241 | # Nested arrays aren't used by llama.cpp's vocab metadata; treat |
tests/unit/export/test_tokenizer_sync.py (modified)
@@ -167,6 +167,44 @@ class TestReadVocabSize:
| 167 | 167 | with pytest.raises(PreflightError, match="cannot parse GGUF"): |
| 168 | 168 | read_gguf_vocab_size(path) |
| 169 | 169 | |
| 170 | + def test_crafted_string_length_rejected(self, tmp_path: Path) -> None: | |
| 171 | + """A GGUF claiming a multi-GB string key must be rejected, not `f.read(huge)`.""" | |
| 172 | + import struct as _s | |
| 173 | + | |
| 174 | + header = bytearray() | |
| 175 | + header.extend(b"GGUF") | |
| 176 | + header.extend(_s.pack("<I", 3)) | |
| 177 | + header.extend(_s.pack("<Q", 0)) | |
| 178 | + header.extend(_s.pack("<Q", 1)) # kv_count=1 | |
| 179 | + # Key length claims 1 TiB. File only has a few bytes after. | |
| 180 | + header.extend(_s.pack("<Q", 2**40)) | |
| 181 | + header.extend(b"a") # body truncated — the bound check should fire first | |
| 182 | + | |
| 183 | + path = tmp_path / "oom_bomb.gguf" | |
| 184 | + path.write_bytes(bytes(header)) | |
| 185 | + with pytest.raises(PreflightError, match="cannot parse GGUF"): | |
| 186 | + read_gguf_vocab_size(path) | |
| 187 | + | |
| 188 | + def test_crafted_skip_string_length_rejected(self, tmp_path: Path) -> None: | |
| 189 | + """A GGUF whose skipped string KV claims huge length must be rejected.""" | |
| 190 | + import struct as _s | |
| 191 | + | |
| 192 | + body = bytearray() | |
| 193 | + _write_string(body, "to_skip") | |
| 194 | + body.extend(_s.pack("<I", _TYPE_STRING)) | |
| 195 | + body.extend(_s.pack("<Q", 2**40)) # bogus length on the value | |
| 196 | + _write_kv_string_array(body, "tokenizer.ggml.tokens", ["a"]) | |
| 197 | + | |
| 198 | + header = bytearray() | |
| 199 | + header.extend(b"GGUF") | |
| 200 | + header.extend(_s.pack("<I", 3)) | |
| 201 | + header.extend(_s.pack("<Q", 0)) | |
| 202 | + header.extend(_s.pack("<Q", 2)) # kv_count=2 | |
| 203 | + path = tmp_path / "skip_bomb.gguf" | |
| 204 | + path.write_bytes(bytes(header) + bytes(body)) | |
| 205 | + with pytest.raises(PreflightError, match="cannot parse GGUF"): | |
| 206 | + read_gguf_vocab_size(path) | |
| 207 | + | |
| 170 | 208 | def test_nested_array_raises(self, tmp_path: Path) -> None: |
| 171 | 209 | """Array-of-array is not supported by llama.cpp's vocab metadata.""" |
| 172 | 210 | import struct as _s |