Python · 3811 bytes Raw Blame History
1 """Shared GGUF byte-level primitives.
2
3 Both `tokenizer_sync` (metadata reader) and `gguf_tensors` (tensor-index
4 reader) need the same scalar readers + type constants. Kept in a
5 private module so only one copy exists; the public modules re-export
6 what their callers need.
7
8 Bounds guard: `_read_string` refuses lengths above
9 `_MAX_STRING_BYTES`. Every caller that reads a length-prefixed array
10 should also bound the length against a sensible ceiling; we don't
11 enforce that here because the ceiling depends on caller context
12 (string vs tensor-name vs token-set) — each caller picks its own.
13 """
14
15 from __future__ import annotations
16
17 import struct
18 from typing import Any, Final
19
# GGUF magic byte sequence.
_GGUF_MAGIC: Final[bytes] = b"GGUF"

# Ceiling for a single GGUF metadata string (16 MiB). Keys, tokens and
# templates are far smaller; without the cap, a crafted length field
# could push `f.read(length)` into an out-of-memory condition.
_MAX_STRING_BYTES: Final[int] = 16 << 20

# GGUF metadata value type codes (stable across v2 and v3).
_TYPE_UINT8: Final[int] = 0
_TYPE_INT8: Final[int] = 1
_TYPE_UINT16: Final[int] = 2
_TYPE_INT16: Final[int] = 3
_TYPE_UINT32: Final[int] = 4
_TYPE_INT32: Final[int] = 5
_TYPE_FLOAT32: Final[int] = 6
_TYPE_BOOL: Final[int] = 7
_TYPE_STRING: Final[int] = 8
_TYPE_ARRAY: Final[int] = 9
_TYPE_UINT64: Final[int] = 10
_TYPE_INT64: Final[int] = 11
_TYPE_FLOAT64: Final[int] = 12

# Payload size in bytes of every fixed-width scalar type. STRING and
# ARRAY are deliberately absent: their size is carried in the payload.
# Each width is taken from the matching `struct` code; the "<" prefix
# selects standard sizes, so the values do not depend on the platform.
_FIXED_WIDTH: Final[dict[int, int]] = {
    _TYPE_UINT8: struct.calcsize("<B"),
    _TYPE_INT8: struct.calcsize("<b"),
    _TYPE_UINT16: struct.calcsize("<H"),
    _TYPE_INT16: struct.calcsize("<h"),
    _TYPE_UINT32: struct.calcsize("<I"),
    _TYPE_INT32: struct.calcsize("<i"),
    _TYPE_FLOAT32: struct.calcsize("<f"),
    _TYPE_BOOL: struct.calcsize("<?"),
    _TYPE_UINT64: struct.calcsize("<Q"),
    _TYPE_INT64: struct.calcsize("<q"),
    _TYPE_FLOAT64: struct.calcsize("<d"),
}
55
56
def _read_u32(f: Any) -> int:
    """Read one little-endian unsigned 32-bit integer from `f`.

    `f` must provide `read(n)`. Raises `struct.error` when `read` does
    not hand back exactly 4 bytes.
    """
    chunk = f.read(4)
    if len(chunk) == 4:
        (number,) = struct.unpack("<I", chunk)
        return number
    raise struct.error("short read")
63
64
def _read_u64(f: Any) -> int:
    """Read one little-endian unsigned 64-bit integer from `f`.

    `f` must provide `read(n)`. Raises `struct.error` when `read` does
    not hand back exactly 8 bytes.
    """
    chunk = f.read(8)
    if len(chunk) == 8:
        (number,) = struct.unpack("<Q", chunk)
        return number
    raise struct.error("short read")
71
72
def _read_string(f: Any) -> str:
    """Read one length-prefixed GGUF string from `f` and decode it.

    The layout is a u64 byte length followed by that many bytes. The
    payload is decoded as UTF-8 with `errors="replace"`, so invalid
    sequences become U+FFFD instead of raising.

    Raises `struct.error` when the declared length is above
    `_MAX_STRING_BYTES` (checked before any payload is read) or when
    the stream ends before the payload is complete.
    """
    length = _read_u64(f)
    if length > _MAX_STRING_BYTES:
        raise struct.error(f"GGUF string length {length} exceeds bound {_MAX_STRING_BYTES}")
    payload = f.read(length)
    if len(payload) == length:
        return payload.decode("utf-8", errors="replace")
    raise struct.error("short read in string")
82
83
# Largest relative offset `_skip_value` will pass to `seek` for an array
# payload: the signed 64-bit maximum. A crafted element count can push
# `width * count` far beyond this, where `seek` would typically fail
# with OverflowError instead of the `struct.error` used for every other
# malformed-input case in this module.
_MAX_SKIP_BYTES: Final[int] = (1 << 63) - 1


def _skip_string(f: Any) -> None:
    """Skip one length-prefixed string, enforcing `_MAX_STRING_BYTES`."""
    length = _read_u64(f)
    if length > _MAX_STRING_BYTES:
        raise struct.error(f"GGUF string length {length} exceeds bound {_MAX_STRING_BYTES}")
    f.seek(length, 1)


def _skip_value(f: Any, value_type: int) -> None:
    """Skip a metadata value of any type without parsing the payload.

    Used by header walkers that locate one key by name and skip
    everything else. String lengths and fixed-width array payload
    sizes are bounded on the way, so a crafted file can't drive us
    into an absurd `seek`.

    Args:
        f: Binary stream positioned at the start of the value payload;
            must provide `read(n)` and `seek(offset, whence)`.
        value_type: GGUF metadata type code of the value to skip.

    Raises:
        struct.error: On a short read, a string longer than
            `_MAX_STRING_BYTES`, an array payload larger than
            `_MAX_SKIP_BYTES` (or one `seek` rejects as overflowing),
            a nested/unknown array element type, or an unknown
            `value_type`.
    """
    width = _FIXED_WIDTH.get(value_type)
    if width is not None:
        f.seek(width, 1)
        return
    if value_type == _TYPE_STRING:
        _skip_string(f)
        return
    if value_type != _TYPE_ARRAY:
        raise struct.error(f"unknown GGUF value_type {value_type}")

    # Array layout: u32 element type, u64 element count, then the elements.
    elem_type = _read_u32(f)
    count = _read_u64(f)
    elem_width = _FIXED_WIDTH.get(elem_type)
    if elem_width is not None:
        # `count` is an unchecked u64 taken straight from the file, so
        # validate the product before handing it to `seek`.
        nbytes = elem_width * count
        if nbytes > _MAX_SKIP_BYTES:
            raise struct.error(
                f"GGUF array payload of {nbytes} bytes exceeds bound {_MAX_SKIP_BYTES}"
            )
        try:
            f.seek(nbytes, 1)
        except OverflowError as exc:
            # The offset fits on its own, but the resulting position does not.
            raise struct.error(
                f"GGUF array payload of {nbytes} bytes cannot be skipped"
            ) from exc
        return
    if elem_type == _TYPE_STRING:
        # Every element carries its own length prefix, so each must be
        # visited; a bogus `count` ends in a short read at end of file.
        for _ in range(count):
            _skip_string(f)
        return
    # Nested arrays aren't used by llama.cpp's vocab metadata; treat
    # as unsupported.
    raise struct.error(f"nested/unknown array elem_type {elem_type}")
118 raise struct.error(f"unknown GGUF value_type {value_type}")