#!/usr/bin/env bash
# Bump the vendored llama.cpp submodule, build its tools, and refresh
# the pre-tokenizer hash table.
#
# Usage:
#   scripts/bump-llama-cpp.sh bump <tag>
#       Check out <tag> in the submodule, re-extract hashes, write VERSION,
#       stage changes.
#   scripts/bump-llama-cpp.sh build
#       Build `llama-quantize` and `llama-imatrix` via cmake. Idempotent.
#   scripts/bump-llama-cpp.sh build --portable
#       Build portable CPU binaries (`GGML_NATIVE=OFF`) suitable for CI
#       caches or redistribution across heterogeneous hosts.
#   scripts/bump-llama-cpp.sh build --with-server
#       Also build `llama-server` for Sprint 41's local HTTP target.
#   scripts/bump-llama-cpp.sh build --portable --with-server
#       Portable build plus `llama-server`.
#   scripts/bump-llama-cpp.sh refresh-labels
#       Regenerate vendor/llama_cpp_pretokenizer_hashes.json from the
#       current submodule contents. Does not touch the submodule itself.
#   scripts/bump-llama-cpp.sh probe-vl-arch
#       Re-run the VL arch probe (Sprint 35.4) and cache the verdicts
#       under vendor/llama_cpp_vl_arch_support.json. Fast path for the
#       runtime probe; if the cache is missing or stale, the runtime
#       falls back to a live scan.
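#
# Example session (the tag is illustrative; substitute the llama.cpp
# release you are actually bumping to):
#   scripts/bump-llama-cpp.sh bump b4567
#   scripts/bump-llama-cpp.sh build --portable --with-server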

set -euo pipefail

REPO_ROOT="$(git rev-parse --show-toplevel)"
VENDOR_DIR="$REPO_ROOT/vendor/llama.cpp"
HASHES_PATH="$REPO_ROOT/vendor/llama_cpp_pretokenizer_hashes.json"
VL_ARCH_PATH="$REPO_ROOT/vendor/llama_cpp_vl_arch_support.json"
VERSION_PATH="$VENDOR_DIR/VERSION"

cmd="${1:-}"

refresh_labels() {
    echo "--> re-extracting pre-tokenizer hash labels to $HASHES_PATH"
    uv run python - <<'PY'
import json
import re
import sys
from pathlib import Path

repo_root = Path.cwd()
converter = repo_root / "vendor" / "llama.cpp" / "convert_hf_to_gguf.py"
hashes_path = repo_root / "vendor" / "llama_cpp_pretokenizer_hashes.json"

if not converter.is_file():
    print(f"ERROR: {converter} not found", file=sys.stderr)
    sys.exit(1)

source = converter.read_text(encoding="utf-8", errors="replace")
pattern = re.compile(r"""\bres\s*=\s*["']([^"']+)["']""")
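# The regex above captures `res = "<label>"` assignments from
# convert_hf_to_gguf.py's pre-tokenizer detection (e.g. `res = "llama-bpe"`);
# the exact label set depends on the vendored tag.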
labels = sorted(set(pattern.findall(source)))
if not labels:
    print("ERROR: no pre-tokenizer labels found in convert_hf_to_gguf.py",
          file=sys.stderr)
    sys.exit(1)

hashes_path.write_text(json.dumps(labels, indent=2) + "\n", encoding="utf-8")
print(f"wrote {len(labels)} labels to {hashes_path}")
PY
}

probe_vl_arch() {
    echo "--> probing VL arch support in vendored llama.cpp"
    uv run python - <<'PY'
import json
import sys
from pathlib import Path

# Import dlm's probe directly — this script runs from the repo root
# with dlm importable via `uv run`. Failing imports abort with a
# readable error; no silent half-cache files.
sys.path.insert(0, str(Path.cwd() / "src"))
from dlm.base_models import BASE_MODELS
from dlm.export.arch_probe import SupportLevel, clear_cache, probe_gguf_arch

# Fresh probe — the cache may carry a stale verdict from an earlier
# run in the same process; clear before enumerating.
clear_cache()

out_path = Path.cwd() / "vendor" / "llama_cpp_vl_arch_support.json"
entries: dict[str, dict[str, str | None]] = {}
for key, spec in BASE_MODELS.items():
    if spec.modality != "vision-language":
        continue
    result = probe_gguf_arch(spec.architecture)
    entries[key] = {
        "architecture": spec.architecture,
        "support": result.support.value,
        "llama_cpp_tag": result.llama_cpp_tag,
        "reason": result.reason,
    }

payload = {
    "schema": 1,
    "bases": entries,
}
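# Resulting JSON shape (keys mirror the dicts above; the values shown here
# are illustrative, not real verdicts):
#   {"schema": 1,
#    "bases": {"some-vl-base": {"architecture": "qwen2vl",
#                               "support": "supported",
#                               "llama_cpp_tag": "b4567",
#                               "reason": null}}}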
out_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")

# Print a summary table for operators.
print(f"wrote {len(entries)} VL verdicts to {out_path}")
for key, entry in sorted(entries.items()):
    print(f"  {entry['support']:<12} {key} ({entry['architecture']})")
PY
}

do_bump() {
    local tag="${1:-}"
    if [ -z "$tag" ]; then
        echo "usage: scripts/bump-llama-cpp.sh bump <tag>" >&2
        exit 2
    fi
    if [ -n "$(git status --porcelain)" ]; then
        echo "error: working tree must be clean before a submodule bump" >&2
        exit 1
    fi
    if [ ! -d "$VENDOR_DIR" ]; then
        echo "error: $VENDOR_DIR missing — initialize the submodule first:" >&2
        echo "  git submodule add https://github.com/ggerganov/llama.cpp vendor/llama.cpp" >&2
        exit 1
    fi

    echo "--> fetching tags in $VENDOR_DIR"
    git -C "$VENDOR_DIR" fetch --tags origin
    echo "--> checking out $tag"
    git -C "$VENDOR_DIR" checkout "tags/$tag"

    echo "--> writing $VERSION_PATH"
    echo "$tag" > "$VERSION_PATH"

    refresh_labels
    probe_vl_arch

    echo "--> staging changes"
    git -C "$REPO_ROOT" add \
        vendor/llama.cpp \
        vendor/llama_cpp_pretokenizer_hashes.json \
        vendor/llama_cpp_vl_arch_support.json

    cat <<EOF
Done. Review the staged diff and commit with:
  git commit -m "chore: bump llama.cpp to $tag + refresh pre-tokenizer hashes"

Then build the binaries:
  scripts/bump-llama-cpp.sh build

And re-run the registry probe suite:
  uv run python scripts/refresh-registry.py
EOF
}

do_build() {
    local with_server=0
    local portable=0
    while [ "$#" -gt 0 ]; do
        case "$1" in
            --with-server)
                with_server=1
                ;;
            --portable)
                portable=1
                ;;
            *)
                echo "usage: scripts/bump-llama-cpp.sh build [--portable] [--with-server]" >&2
                exit 2
                ;;
        esac
        shift
    done
    if [ ! -d "$VENDOR_DIR" ]; then
        echo "error: $VENDOR_DIR missing — run 'bump <tag>' first" >&2
        exit 1
    fi
    echo "--> configuring llama.cpp via cmake"
    local cmake_args=(
        -S "$VENDOR_DIR"
        -B "$VENDOR_DIR/build"
        -DCMAKE_BUILD_TYPE=Release
    )
    if [ "$portable" -eq 1 ]; then
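        # GGML_NATIVE controls whether ggml targets the build host's own CPU
        # (roughly -march=native); forcing it OFF keeps the instruction-set
        # baseline generic so the resulting binaries run on other machines.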
        echo "--> portable build: forcing GGML_NATIVE=OFF for cross-runner compatibility"
        cmake_args+=(-DGGML_NATIVE=OFF)
    fi
    cmake "${cmake_args[@]}"
    # `llama-quantize` does the actual per-tensor quantization; `llama-imatrix`
    # produces the importance-matrix file we feed to quantize for k-quant
    # calibration (Sprint 11.6). Both are required for the full export
    # pipeline; building them separately means a missing target fails the
    # build loudly rather than silently shipping a half-built toolchain.
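    # Illustrative downstream invocation (paths and quant type are examples,
    # not values the export pipeline hard-codes):
    #   build/bin/llama-imatrix -m model-f16.gguf -f calib.txt -o imatrix.dat
    #   build/bin/llama-quantize --imatrix imatrix.dat model-f16.gguf model-q4_k_m.gguf Q4_K_M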
    local targets=(llama-quantize llama-imatrix)
    if [ "$with_server" -eq 1 ]; then
        targets+=(llama-server)
    fi
    for target in "${targets[@]}"; do
        echo "--> building $target"
        cmake --build "$VENDOR_DIR/build" --target "$target" --config Release
        if [ ! -f "$VENDOR_DIR/build/bin/$target" ]; then
            echo "error: build finished but $target not found under build/bin" >&2
            exit 1
        fi
        echo "OK: $VENDOR_DIR/build/bin/$target"
    done
}

case "$cmd" in
    bump)
        do_bump "${2:-}"
        ;;
    build)
        # Forward every flag after the subcommand so `build`, `build --portable`,
        # and `build --portable --with-server` all parse correctly.
        do_build "${@:2}"
        ;;
    refresh-labels)
        refresh_labels
        ;;
    probe-vl-arch)
        probe_vl_arch
        ;;
    "")
        echo "usage: scripts/bump-llama-cpp.sh <bump|build|refresh-labels|probe-vl-arch> [args]" >&2
        exit 2
        ;;
    *)
        echo "unknown command: $cmd" >&2
        echo "usage: scripts/bump-llama-cpp.sh <bump|build|refresh-labels|probe-vl-arch> [args]" >&2
        exit 2
        ;;
esac