#!/usr/bin/env bash
# Bump the vendored llama.cpp submodule, build its tools, and refresh
# the pre-tokenizer hash table.
#
# Usage:
#   scripts/bump-llama-cpp.sh bump <tag>
#     Check out <tag> in the submodule, re-extract hashes, write VERSION,
#     and stage the changes.
#   scripts/bump-llama-cpp.sh build
#     Build `llama-quantize` (+ siblings) via cmake. Idempotent.
#   scripts/bump-llama-cpp.sh build --portable
#     Build portable CPU binaries (`GGML_NATIVE=OFF`) suitable for CI
#     caches or redistribution across heterogeneous hosts.
#   scripts/bump-llama-cpp.sh build --with-server
#     Also build `llama-server` for Sprint 41's local HTTP target.
#   scripts/bump-llama-cpp.sh build --portable --with-server
#     Portable build plus `llama-server`.
#   scripts/bump-llama-cpp.sh refresh-labels
#     Regenerate vendor/llama_cpp_pretokenizer_hashes.json from the
#     current submodule contents. Does not touch the submodule itself.
#   scripts/bump-llama-cpp.sh probe-vl-arch
#     Re-run the VL arch probe (Sprint 35.4) and cache the verdicts
#     under vendor/llama_cpp_vl_arch_support.json. Fast path for the
#     runtime probe; if the cache is missing or stale, the runtime
#     falls back to a live scan.
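#
# Typical flow after an upstream release (the tag below is only an example):
#   scripts/bump-llama-cpp.sh bump b4600
#   scripts/bump-llama-cpp.sh build --portable --with-server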

set -euo pipefail

REPO_ROOT="$(git rev-parse --show-toplevel)"
# The embedded Python blocks resolve paths via Path.cwd(), so run from the
# repo root no matter where the script was invoked.
cd "$REPO_ROOT"
VENDOR_DIR="$REPO_ROOT/vendor/llama.cpp"
HASHES_PATH="$REPO_ROOT/vendor/llama_cpp_pretokenizer_hashes.json"
VL_ARCH_PATH="$REPO_ROOT/vendor/llama_cpp_vl_arch_support.json"
VERSION_PATH="$VENDOR_DIR/VERSION"

cmd="${1:-}"

refresh_labels() {
  echo "--> re-extracting pre-tokenizer hash labels to $HASHES_PATH"
  uv run python - <<'PY'
import json
import re
import sys
from pathlib import Path

repo_root = Path.cwd()
converter = repo_root / "vendor" / "llama.cpp" / "convert_hf_to_gguf.py"
hashes_path = repo_root / "vendor" / "llama_cpp_pretokenizer_hashes.json"

if not converter.is_file():
    print(f"ERROR: {converter} not found", file=sys.stderr)
    sys.exit(1)

source = converter.read_text(encoding="utf-8", errors="replace")
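# convert_hf_to_gguf.py tags each known pre-tokenizer with a line of the
# form `res = "<label>"`; harvest every such assignment.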
pattern = re.compile(r"""\bres\s*=\s*["']([^"']+)["']""")
labels = sorted(set(pattern.findall(source)))
if not labels:
    print("ERROR: no pre-tokenizer labels found in convert_hf_to_gguf.py",
          file=sys.stderr)
    sys.exit(1)
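
# The resulting file is a flat, sorted JSON array of label strings.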
hashes_path.write_text(json.dumps(labels, indent=2) + "\n", encoding="utf-8")
print(f"wrote {len(labels)} labels to {hashes_path}")
PY
}

probe_vl_arch() {
  echo "--> probing VL arch support in vendored llama.cpp"
  uv run python - <<'PY'
import json
import sys
from pathlib import Path

# Import dlm's probe directly: the wrapper cd'd to the repo root, and dlm is
# importable via `uv run`. Failing imports abort with a readable error; no
# silent half-cache files.
sys.path.insert(0, str(Path.cwd() / "src"))
from dlm.base_models import BASE_MODELS
from dlm.export.arch_probe import clear_cache, probe_gguf_arch

# Fresh probe: the cache may carry a stale verdict from an earlier
# run in the same process; clear before enumerating.
clear_cache()

out_path = Path.cwd() / "vendor" / "llama_cpp_vl_arch_support.json"
entries: dict[str, dict[str, str | None]] = {}
for key, spec in BASE_MODELS.items():
    if spec.modality != "vision-language":
        continue
    result = probe_gguf_arch(spec.architecture)
    entries[key] = {
        "architecture": spec.architecture,
        "support": result.support.value,
        "llama_cpp_tag": result.llama_cpp_tag,
        "reason": result.reason,
    }

payload = {
    "schema": 1,
    "bases": entries,
}
out_path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
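# Cache file shape (schema 1):
#   {"schema": 1,
#    "bases": {"<base-key>": {"architecture": ..., "support": ...,
#                             "llama_cpp_tag": ..., "reason": ...}}}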

# Print a summary table for operators.
print(f"wrote {len(entries)} VL verdicts to {out_path}")
for key, entry in sorted(entries.items()):
    print(f"  {entry['support']:<12} {key} ({entry['architecture']})")
PY
}

do_bump() {
  local tag="${1:-}"
  if [ -z "$tag" ]; then
    echo "usage: scripts/bump-llama-cpp.sh bump <tag>" >&2
    exit 2
  fi
  if [ -n "$(git status --porcelain)" ]; then
    echo "error: working tree must be clean before a submodule bump" >&2
    exit 1
  fi
  if [ ! -d "$VENDOR_DIR" ]; then
    echo "error: $VENDOR_DIR missing; initialize the submodule first:" >&2
    echo "  git submodule add https://github.com/ggerganov/llama.cpp vendor/llama.cpp" >&2
    exit 1
  fi

  echo "--> fetching tags in $VENDOR_DIR"
  git -C "$VENDOR_DIR" fetch --tags origin
  echo "--> checking out $tag"
  git -C "$VENDOR_DIR" checkout "tags/$tag"

  echo "--> writing $VERSION_PATH"
  echo "$tag" > "$VERSION_PATH"

  refresh_labels
  probe_vl_arch

  echo "--> staging changes"
  git -C "$REPO_ROOT" add \
    vendor/llama.cpp \
    vendor/llama_cpp_pretokenizer_hashes.json \
    vendor/llama_cpp_vl_arch_support.json

  cat <<EOF
Done. Review the staged diff and commit with:
  git commit -m "chore: bump llama.cpp to $tag + refresh pre-tokenizer hashes"

Then build the binaries:
  scripts/bump-llama-cpp.sh build

And re-run the registry probe suite:
  uv run python scripts/refresh-registry.py
EOF
}

do_build() {
  local with_server=0
  local portable=0
  while [ "$#" -gt 0 ]; do
    case "$1" in
      --with-server)
        with_server=1
        ;;
      --portable)
        portable=1
        ;;
      *)
        echo "usage: scripts/bump-llama-cpp.sh build [--portable] [--with-server]" >&2
        exit 2
        ;;
    esac
    shift
  done
  if [ ! -d "$VENDOR_DIR" ]; then
    echo "error: $VENDOR_DIR missing; run 'bump <tag>' first" >&2
    exit 1
  fi
  echo "--> configuring llama.cpp via cmake"
  local cmake_args=(
    -S "$VENDOR_DIR"
    -B "$VENDOR_DIR/build"
    -DCMAKE_BUILD_TYPE=Release
  )
  if [ "$portable" -eq 1 ]; then
    echo "--> portable build: forcing GGML_NATIVE=OFF for cross-runner compatibility"
    cmake_args+=(-DGGML_NATIVE=OFF)
  fi
  cmake "${cmake_args[@]}"
  # `llama-quantize` does the actual per-tensor quantization; `llama-imatrix`
  # produces the importance-matrix file we feed to quantize for k-quant
  # calibration (Sprint 11.6). Both are required for the full export
  # pipeline; building them separately means a missing target fails the
  # build loudly rather than silently shipping a half-built toolchain.
  local targets=(llama-quantize llama-imatrix)
  if [ "$with_server" -eq 1 ]; then
    targets+=(llama-server)
  fi
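  # `--config Release` is a no-op for single-config generators (Makefiles,
  # Ninja) but keeps multi-config generators honest.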
  for target in "${targets[@]}"; do
    echo "--> building $target"
    cmake --build "$VENDOR_DIR/build" --target "$target" --config Release
    if [ ! -f "$VENDOR_DIR/build/bin/$target" ]; then
      echo "error: build finished but $target not found under build/bin" >&2
      exit 1
    fi
    echo "OK: $VENDOR_DIR/build/bin/$target"
  done
}

case "$cmd" in
  bump)
    do_bump "${2:-}"
    ;;
  build)
    # Forward every remaining argument so `--portable --with-server` works;
    # "${@:2}" expands to nothing when no flags were given.
    do_build "${@:2}"
    ;;
  refresh-labels)
    refresh_labels
    ;;
  probe-vl-arch)
    probe_vl_arch
    ;;
  "")
    echo "usage: scripts/bump-llama-cpp.sh <bump|build|refresh-labels|probe-vl-arch> [args]" >&2
    exit 2
    ;;
  *)
    echo "unknown command: $cmd" >&2
    echo "usage: scripts/bump-llama-cpp.sh <bump|build|refresh-labels|probe-vl-arch> [args]" >&2
    exit 2
    ;;
esac