tenseleyflow/documentlanguagemodel / e7925ff

Browse files

fix(export): bound GGUF metadata string length to defend against OOM bombs (audit-04 F2)

Authored by espadonne
SHA
e7925ff1a528df22553f905702f7ae21a4aea18f
Parents
0e0a262
Tree
9940eea

2 changed files

Status | File | + | -
M src/dlm/export/tokenizer_sync.py 14 0
M tests/unit/export/test_tokenizer_sync.py 38 0
src/dlm/export/tokenizer_sync.py — modified
@@ -38,6 +38,12 @@ if TYPE_CHECKING:
3838
 
3939
 _GGUF_MAGIC: Final[bytes] = b"GGUF"
4040
 
41
+# Upper bound on a GGUF metadata string — chosen wildly larger than any
42
+# credible real value (tokens are ≤ a few hundred bytes; chat templates
43
+# run tens of KB at most) but small enough to reject a crafted GGUF that
44
+# claims a multi-GB string and drives `f.read(length)` into OOM.
45
+_MAX_STRING_BYTES: Final[int] = 16 * 1024 * 1024
46
+
4147
 # GGUF value types per llama.cpp's gguf spec (stable v2+v3).
4248
 _TYPE_UINT8: Final[int] = 0
4349
 _TYPE_INT8: Final[int] = 1
@@ -198,6 +204,8 @@ def _read_u64(f: Any) -> int:
198204
 
199205
 def _read_string(f: Any) -> str:
200206
     length = _read_u64(f)
207
+    if length > _MAX_STRING_BYTES:
208
+        raise struct.error(f"GGUF string length {length} exceeds bound {_MAX_STRING_BYTES}")
201209
     raw = f.read(length)
202210
     if len(raw) != length:
203211
         raise struct.error("short read in string")
@@ -211,6 +219,8 @@ def _skip_value(f: Any, value_type: int) -> None:
211219
         return
212220
     if value_type == _TYPE_STRING:
213221
         length = _read_u64(f)
222
+        if length > _MAX_STRING_BYTES:
223
+            raise struct.error(f"GGUF string length {length} exceeds bound {_MAX_STRING_BYTES}")
214224
         f.seek(length, 1)
215225
         return
216226
     if value_type == _TYPE_ARRAY:
@@ -222,6 +232,10 @@ def _skip_value(f: Any, value_type: int) -> None:
222232
         if elem_type == _TYPE_STRING:
223233
             for _ in range(count):
224234
                 length = _read_u64(f)
235
+                if length > _MAX_STRING_BYTES:
236
+                    raise struct.error(
237
+                        f"GGUF string length {length} exceeds bound {_MAX_STRING_BYTES}"
238
+                    )
225239
                 f.seek(length, 1)
226240
             return
227241
         # Nested arrays aren't used by llama.cpp's vocab metadata; treat
tests/unit/export/test_tokenizer_sync.py — modified
@@ -167,6 +167,44 @@ class TestReadVocabSize:
167167
         with pytest.raises(PreflightError, match="cannot parse GGUF"):
168168
             read_gguf_vocab_size(path)
169169
 
170
+    def test_crafted_string_length_rejected(self, tmp_path: Path) -> None:
171
+        """A GGUF claiming a multi-GB string key must be rejected, not `f.read(huge)`."""
172
+        import struct as _s
173
+
174
+        header = bytearray()
175
+        header.extend(b"GGUF")
176
+        header.extend(_s.pack("<I", 3))
177
+        header.extend(_s.pack("<Q", 0))
178
+        header.extend(_s.pack("<Q", 1))  # kv_count=1
179
+        # Key length claims 1 TiB. File only has a few bytes after.
180
+        header.extend(_s.pack("<Q", 2**40))
181
+        header.extend(b"a")  # body truncated — the bound check should fire first
182
+
183
+        path = tmp_path / "oom_bomb.gguf"
184
+        path.write_bytes(bytes(header))
185
+        with pytest.raises(PreflightError, match="cannot parse GGUF"):
186
+            read_gguf_vocab_size(path)
187
+
188
+    def test_crafted_skip_string_length_rejected(self, tmp_path: Path) -> None:
189
+        """A GGUF whose skipped string KV claims huge length must be rejected."""
190
+        import struct as _s
191
+
192
+        body = bytearray()
193
+        _write_string(body, "to_skip")
194
+        body.extend(_s.pack("<I", _TYPE_STRING))
195
+        body.extend(_s.pack("<Q", 2**40))  # bogus length on the value
196
+        _write_kv_string_array(body, "tokenizer.ggml.tokens", ["a"])
197
+
198
+        header = bytearray()
199
+        header.extend(b"GGUF")
200
+        header.extend(_s.pack("<I", 3))
201
+        header.extend(_s.pack("<Q", 0))
202
+        header.extend(_s.pack("<Q", 2))  # kv_count=2
203
+        path = tmp_path / "skip_bomb.gguf"
204
+        path.write_bytes(bytes(header) + bytes(body))
205
+        with pytest.raises(PreflightError, match="cannot parse GGUF"):
206
+            read_gguf_vocab_size(path)
207
+
170208
     def test_nested_array_raises(self, tmp_path: Path) -> None:
171209
         """Array-of-array is not supported by llama.cpp's vocab metadata."""
172210
         import struct as _s