fix(export): bound GGUF metadata string length to defend against OOM bombs (audit-04 F2)
- SHA: e7925ff1a528df22553f905702f7ae21a4aea18f
- Parents: 0e0a262
- Tree: 9940eea

| Status | File | + | - |
|---|---|---|---|
| M | src/dlm/export/tokenizer_sync.py | 14 | 0 |
| M | tests/unit/export/test_tokenizer_sync.py | 38 | 0 |
src/dlm/export/tokenizer_sync.py (modified)
@@ -38,6 +38,12 @@ if TYPE_CHECKING:
| 38 | 38 | |
| 39 | 39 | _GGUF_MAGIC: Final[bytes] = b"GGUF" |
| 40 | 40 | |
| 41 | +# Upper bound on a GGUF metadata string — chosen wildly larger than any | |
| 42 | +# credible real value (tokens are ≤ a few hundred bytes; chat templates | |
| 43 | +# run tens of KB at most) but small enough to reject a crafted GGUF that | |
| 44 | +# claims a multi-GB string and drives `f.read(length)` into OOM. | |
| 45 | +_MAX_STRING_BYTES: Final[int] = 16 * 1024 * 1024 | |
| 46 | + | |
| 41 | 47 | # GGUF value types per llama.cpp's gguf spec (stable v2+v3). |
| 42 | 48 | _TYPE_UINT8: Final[int] = 0 |
| 43 | 49 | _TYPE_INT8: Final[int] = 1 |
@@ -198,6 +204,8 @@ def _read_u64(f: Any) -> int:
| 198 | 204 | |
| 199 | 205 | def _read_string(f: Any) -> str: |
| 200 | 206 | length = _read_u64(f) |
| 207 | + if length > _MAX_STRING_BYTES: | |
| 208 | + raise struct.error(f"GGUF string length {length} exceeds bound {_MAX_STRING_BYTES}") | |
| 201 | 209 | raw = f.read(length) |
| 202 | 210 | if len(raw) != length: |
| 203 | 211 | raise struct.error("short read in string") |
@@ -211,6 +219,8 @@ def _skip_value(f: Any, value_type: int) -> None:
| 211 | 219 | return |
| 212 | 220 | if value_type == _TYPE_STRING: |
| 213 | 221 | length = _read_u64(f) |
| 222 | + if length > _MAX_STRING_BYTES: | |
| 223 | + raise struct.error(f"GGUF string length {length} exceeds bound {_MAX_STRING_BYTES}") | |
| 214 | 224 | f.seek(length, 1) |
| 215 | 225 | return |
| 216 | 226 | if value_type == _TYPE_ARRAY: |
@@ -222,6 +232,10 @@ def _skip_value(f: Any, value_type: int) -> None:
| 222 | 232 | if elem_type == _TYPE_STRING: |
| 223 | 233 | for _ in range(count): |
| 224 | 234 | length = _read_u64(f) |
| 235 | + if length > _MAX_STRING_BYTES: | |
| 236 | + raise struct.error( | |
| 237 | + f"GGUF string length {length} exceeds bound {_MAX_STRING_BYTES}" | |
| 238 | + ) | |
| 225 | 239 | f.seek(length, 1) |
| 226 | 240 | return |
| 227 | 241 | # Nested arrays aren't used by llama.cpp's vocab metadata; treat |
tests/unit/export/test_tokenizer_sync.py (modified)
@@ -167,6 +167,44 @@ class TestReadVocabSize:
| 167 | 167 | with pytest.raises(PreflightError, match="cannot parse GGUF"): |
| 168 | 168 | read_gguf_vocab_size(path) |
| 169 | 169 | |
| 170 | + def test_crafted_string_length_rejected(self, tmp_path: Path) -> None: | |
| 171 | + """A GGUF claiming a multi-GB string key must be rejected, not `f.read(huge)`.""" | |
| 172 | + import struct as _s | |
| 173 | + | |
| 174 | + header = bytearray() | |
| 175 | + header.extend(b"GGUF") | |
| 176 | + header.extend(_s.pack("<I", 3)) | |
| 177 | + header.extend(_s.pack("<Q", 0)) | |
| 178 | + header.extend(_s.pack("<Q", 1)) # kv_count=1 | |
| 179 | + # Key length claims 1 TiB. File only has a few bytes after. | |
| 180 | + header.extend(_s.pack("<Q", 2**40)) | |
| 181 | + header.extend(b"a") # body truncated — the bound check should fire first | |
| 182 | + | |
| 183 | + path = tmp_path / "oom_bomb.gguf" | |
| 184 | + path.write_bytes(bytes(header)) | |
| 185 | + with pytest.raises(PreflightError, match="cannot parse GGUF"): | |
| 186 | + read_gguf_vocab_size(path) | |
| 187 | + | |
| 188 | + def test_crafted_skip_string_length_rejected(self, tmp_path: Path) -> None: | |
| 189 | + """A GGUF whose skipped string KV claims huge length must be rejected.""" | |
| 190 | + import struct as _s | |
| 191 | + | |
| 192 | + body = bytearray() | |
| 193 | + _write_string(body, "to_skip") | |
| 194 | + body.extend(_s.pack("<I", _TYPE_STRING)) | |
| 195 | + body.extend(_s.pack("<Q", 2**40)) # bogus length on the value | |
| 196 | + _write_kv_string_array(body, "tokenizer.ggml.tokens", ["a"]) | |
| 197 | + | |
| 198 | + header = bytearray() | |
| 199 | + header.extend(b"GGUF") | |
| 200 | + header.extend(_s.pack("<I", 3)) | |
| 201 | + header.extend(_s.pack("<Q", 0)) | |
| 202 | + header.extend(_s.pack("<Q", 2)) # kv_count=2 | |
| 203 | + path = tmp_path / "skip_bomb.gguf" | |
| 204 | + path.write_bytes(bytes(header) + bytes(body)) | |
| 205 | + with pytest.raises(PreflightError, match="cannot parse GGUF"): | |
| 206 | + read_gguf_vocab_size(path) | |
| 207 | + | |
| 170 | 208 | def test_nested_array_raises(self, tmp_path: Path) -> None: |
| 171 | 209 | """Array-of-array is not supported by llama.cpp's vocab metadata.""" |
| 172 | 210 | import struct as _s |