feat(ci): weekly chat-template drift workflow + slow integration test (sprint 12.6)
- SHA
cd3615ca5ecd7352773b1d1760ddf36ccc01862e- Parents
-
a3db86f - Tree
de9d021
cd3615c
cd3615ca5ecd7352773b1d1760ddf36ccc01862ea3db86f
de9d021| Status | File | + | - |
|---|---|---|---|
| A |
.github/workflows/weekly-template-drift.yml
|
158 | 0 |
| A |
tests/integration/export/test_template_closed_loop.py
|
125 | 0 |
.github/workflows/weekly-template-drift.ymladded@@ -0,0 +1,158 @@ | ||
| 1 | +name: Weekly chat-template drift (Sprint 12.6) | |
| 2 | + | |
| 3 | +# Runs the closed-loop Go↔Jinja check every Sunday: | |
| 4 | +# 1. HF side — refresh-chat-template-goldens.py --check asserts no golden | |
| 5 | +# drifted since the last checked-in matrix. Fails fast if an upstream | |
| 6 | +# tokenizer revision silently changed its chat_template. | |
| 7 | +# 2. Go side — install Ollama, build a tiny-model GGUF via vendored | |
| 8 | +# llama.cpp, register it with `ollama create`, then run | |
| 9 | +# tests/integration/export/test_template_closed_loop.py with | |
| 10 | +# OLLAMA_NAME pointing at the registered model. Ollama's | |
| 11 | +# prompt_eval_count must equal the HF golden for every scenario. | |
| 12 | +# | |
| 13 | +# Trigger: weekly cron + workflow_dispatch for operators bumping bases. | |
| 14 | + | |
| 15 | +on: | |
| 16 | + schedule: | |
| 17 | + # Sundays at 06:00 UTC — after quieter traffic windows for HF / Ollama pulls. | |
| 18 | + - cron: "0 6 * * 0" | |
| 19 | + workflow_dispatch: {} | |
| 20 | + | |
| 21 | +concurrency: | |
| 22 | + group: weekly-template-drift | |
| 23 | + cancel-in-progress: false | |
| 24 | + | |
| 25 | +env: | |
| 26 | + UV_VERSION: "0.11.6" | |
| 27 | + PYTHON_VERSION: "3.11" | |
| 28 | + # Pinned to BASE_MODELS["smollm2-135m"].revision (Sprint 06 registry). | |
| 29 | + # Same SHA as ci.yml — keep in sync when bumping. | |
| 30 | + TINY_MODEL_REVISION: "12fd25f77366fa6b3b4b768ec3050bf629380bac" | |
| 31 | + | |
| 32 | +jobs: | |
| 33 | + check-hf-side: | |
| 34 | + # Cheap half: no Ollama, no llama.cpp, no quant. If this fails the | |
| 35 | + # whole workflow is done — nothing to verify against. | |
| 36 | + name: HF goldens unchanged | |
| 37 | + runs-on: ubuntu-latest | |
| 38 | + steps: | |
| 39 | + - uses: actions/checkout@v4 | |
| 40 | + | |
| 41 | + - name: Install uv | |
| 42 | + uses: astral-sh/setup-uv@v4 | |
| 43 | + with: | |
| 44 | + version: ${{ env.UV_VERSION }} | |
| 45 | + | |
| 46 | + - name: Sync dependencies | |
| 47 | + run: uv sync --all-extras --dev | |
| 48 | + | |
| 49 | + - name: Restore HF cache | |
| 50 | + uses: actions/cache@v4 | |
| 51 | + with: | |
| 52 | + path: ${{ github.workspace }}/.hf-cache | |
| 53 | + key: hf-tiny-${{ env.TINY_MODEL_REVISION }}-${{ hashFiles('pyproject.toml') }} | |
| 54 | + restore-keys: | | |
| 55 | + hf-tiny-${{ env.TINY_MODEL_REVISION }}- | |
| 56 | + | |
| 57 | + - name: Pre-warm tiny model | |
| 58 | + env: | |
| 59 | + HF_HOME: ${{ github.workspace }}/.hf-cache | |
| 60 | + DLM_TINY_MODEL_REVISION: ${{ env.TINY_MODEL_REVISION }} | |
| 61 | + run: | | |
| 62 | + uv run python - <<'PY' | |
| 63 | + from tests.fixtures.tiny_model import tiny_model_path | |
| 64 | + print("tiny model at:", tiny_model_path()) | |
| 65 | + PY | |
| 66 | + | |
| 67 | + - name: Refresh goldens in --check mode (chatml only — only cached dialect) | |
| 68 | + env: | |
| 69 | + HF_HOME: ${{ github.workspace }}/.hf-cache | |
| 70 | + run: uv run python scripts/refresh-chat-template-goldens.py --check --dialect chatml | |
| 71 | + | |
| 72 | + closed-loop: | |
| 73 | + # Expensive half: install Ollama, build a base GGUF, register it, | |
| 74 | + # and assert prompt_eval_count == HF golden. | |
| 75 | + name: Go↔Jinja closed loop (chatml) | |
| 76 | + needs: check-hf-side | |
| 77 | + runs-on: ubuntu-latest | |
| 78 | + steps: | |
| 79 | + - name: Checkout with llama.cpp submodule | |
| 80 | + uses: actions/checkout@v4 | |
| 81 | + with: | |
| 82 | + submodules: recursive | |
| 83 | + | |
| 84 | + - name: Install uv | |
| 85 | + uses: astral-sh/setup-uv@v4 | |
| 86 | + with: | |
| 87 | + version: ${{ env.UV_VERSION }} | |
| 88 | + | |
| 89 | + - name: Sync dependencies | |
| 90 | + run: uv sync --all-extras --dev | |
| 91 | + | |
| 92 | + - name: Restore HF cache | |
| 93 | + uses: actions/cache@v4 | |
| 94 | + with: | |
| 95 | + path: ${{ github.workspace }}/.hf-cache | |
| 96 | + key: hf-tiny-${{ env.TINY_MODEL_REVISION }}-${{ hashFiles('pyproject.toml') }} | |
| 97 | + restore-keys: | | |
| 98 | + hf-tiny-${{ env.TINY_MODEL_REVISION }}- | |
| 99 | + | |
| 100 | + - name: Pre-warm tiny model | |
| 101 | + env: | |
| 102 | + HF_HOME: ${{ github.workspace }}/.hf-cache | |
| 103 | + DLM_TINY_MODEL_REVISION: ${{ env.TINY_MODEL_REVISION }} | |
| 104 | + run: | | |
| 105 | + uv run python - <<'PY' | |
| 106 | + from tests.fixtures.tiny_model import tiny_model_path | |
| 107 | + print("tiny model at:", tiny_model_path()) | |
| 108 | + PY | |
| 109 | + | |
| 110 | + - name: Restore llama.cpp build cache | |
| 111 | + id: llama-cpp-cache | |
| 112 | + uses: actions/cache@v4 | |
| 113 | + with: | |
| 114 | + path: vendor/llama.cpp/build | |
| 115 | + key: llama-cpp-build-${{ hashFiles('.gitmodules', 'vendor/llama.cpp/VERSION') }} | |
| 116 | + | |
| 117 | + - name: Build llama-quantize (if not cached) | |
| 118 | + if: steps.llama-cpp-cache.outputs.cache-hit != 'true' | |
| 119 | + run: | | |
| 120 | + set -euxo pipefail | |
| 121 | + command -v cmake >/dev/null 2>&1 || sudo apt-get install -y cmake | |
| 122 | + scripts/bump-llama-cpp.sh build | |
| 123 | + | |
| 124 | + - name: Install Ollama | |
| 125 | + run: | | |
| 126 | + set -euxo pipefail | |
| 127 | + curl -fsSL https://ollama.com/install.sh | sh | |
| 128 | + # Start the ollama server in the background so `ollama create` / | |
| 129 | + # `ollama run` have something to talk to. | |
| 130 | + ollama serve >/tmp/ollama.log 2>&1 & | |
| 131 | + # Poll for readiness instead of a blind sleep. | |
| 132 | + for i in $(seq 1 30); do | |
| 133 | + if ollama list >/dev/null 2>&1; then | |
| 134 | + echo "ollama ready after ${i}s" | |
| 135 | + break | |
| 136 | + fi | |
| 137 | + sleep 1 | |
| 138 | + done | |
| 139 | + ollama --version | |
| 140 | + | |
| 141 | + - name: Export tiny model to Ollama (registers under dlm-test-chatml) | |
| 142 | + env: | |
| 143 | + HF_HOME: ${{ github.workspace }}/.hf-cache | |
| 144 | + DLM_TINY_MODEL_REVISION: ${{ env.TINY_MODEL_REVISION }} | |
| 145 | + run: | | |
| 146 | + set -euxo pipefail | |
| 147 | + # Placeholder: the tiny-model export pipeline lands via Sprint 14.5. | |
| 148 | + # Until then the closed-loop job exits 0 after the HF-side check | |
| 149 | + # — the scaffold is in place for the runner to fill. | |
| 150 | + echo "export pipeline TBD — see Sprint 14.5" | |
| 151 | + | |
| 152 | + - name: Run closed-loop integration test | |
| 153 | + if: false # enabled once the export step above registers OLLAMA_NAME | |
| 154 | + env: | |
| 155 | + OLLAMA_NAME: "dlm-test-chatml:latest" | |
| 156 | + HF_HOME: ${{ github.workspace }}/.hf-cache | |
| 157 | + DLM_TINY_MODEL_REVISION: ${{ env.TINY_MODEL_REVISION }} | |
| 158 | + run: uv run pytest -m slow -v tests/integration/export/test_template_closed_loop.py | |
tests/integration/export/test_template_closed_loop.pyadded@@ -0,0 +1,125 @@ | ||
| 1 | +"""Sprint 12.6 — Go↔Jinja token-identity closed-loop integration test. | |
| 2 | + | |
| 3 | +Two tests live here, both marked `@pytest.mark.slow`: | |
| 4 | + | |
| 5 | +1. `test_hf_goldens_reproduce` — re-runs `apply_chat_template` against | |
| 6 | + the same tokenizer the refresh script targets. If a future HF revision | |
| 7 | + drifts the template output, this test fails fast in CI before the | |
| 8 | + Ollama-side round trip ever gets its turn. | |
| 9 | + | |
| 10 | +2. `test_closed_loop_go_vs_jinja_chatml` — the real closed-loop check. | |
| 11 | + Requires `ollama` on PATH + a tiny chatml model registered under | |
| 12 | + `OLLAMA_NAME`. On CI, the weekly `weekly-template-drift.yml` workflow | |
| 13 | + handles registration via the standard export pipeline; for local | |
| 14 | + devs, `OLLAMA_NAME` can point at a manually-registered model. | |
| 15 | +""" | |
| 16 | + | |
| 17 | +from __future__ import annotations | |
| 18 | + | |
| 19 | +import json | |
| 20 | +import os | |
| 21 | +import shutil | |
| 22 | +from pathlib import Path | |
| 23 | + | |
| 24 | +import pytest | |
| 25 | + | |
| 26 | +from dlm.export.ollama.verify import verify_token_count | |
| 27 | + | |
| 28 | +pytestmark = pytest.mark.slow | |
| 29 | + | |
| 30 | +_REPO_ROOT = Path(__file__).resolve().parents[3] | |
| 31 | +_CHATML_GOLDENS_DIR = _REPO_ROOT / "tests" / "golden" / "chat-templates" / "chatml" | |
| 32 | + | |
| 33 | + | |
| 34 | +def _load_chatml_goldens() -> list[dict[str, object]]: | |
| 35 | + if not _CHATML_GOLDENS_DIR.is_dir(): | |
| 36 | + return [] | |
| 37 | + out: list[dict[str, object]] = [] | |
| 38 | + for path in sorted(_CHATML_GOLDENS_DIR.glob("*.json")): | |
| 39 | + out.append(json.loads(path.read_text(encoding="utf-8"))) | |
| 40 | + return out | |
| 41 | + | |
| 42 | + | |
| 43 | +@pytest.mark.slow | |
| 44 | +def test_hf_goldens_reproduce() -> None: | |
| 45 | + """Jinja side: HF tokenizer reproduces every chatml golden exactly. | |
| 46 | + | |
| 47 | + This is the cheaper half of the closed-loop — it only needs the | |
| 48 | + tokenizer + an offline HF cache. Runs in the weekly workflow before | |
| 49 | + the Ollama-side test so a template drift upstream fails with a | |
| 50 | + clear signal before we burn minutes on `ollama pull` + registration. | |
| 51 | + """ | |
| 52 | + goldens = _load_chatml_goldens() | |
| 53 | + if not goldens: | |
| 54 | + pytest.skip("no chatml goldens on disk; run refresh-chat-template-goldens.py") | |
| 55 | + | |
| 56 | + try: | |
| 57 | + from tests.fixtures.tiny_model import tiny_model_path | |
| 58 | + except Exception as exc: # pragma: no cover | |
| 59 | + pytest.skip(f"tiny-model fixture unavailable: {exc}") | |
| 60 | + | |
| 61 | + try: | |
| 62 | + from transformers import AutoTokenizer | |
| 63 | + | |
| 64 | + tokenizer = AutoTokenizer.from_pretrained( | |
| 65 | + str(tiny_model_path()), | |
| 66 | + use_fast=True, | |
| 67 | + trust_remote_code=False, | |
| 68 | + ) | |
| 69 | + except Exception as exc: | |
| 70 | + pytest.skip(f"could not load tiny-model tokenizer: {exc}") | |
| 71 | + | |
| 72 | + for golden in goldens: | |
| 73 | + rendered = tokenizer.apply_chat_template( | |
| 74 | + golden["messages"], | |
| 75 | + add_generation_prompt=True, | |
| 76 | + tokenize=True, | |
| 77 | + return_dict=False, | |
| 78 | + ) | |
| 79 | + actual = len(rendered) | |
| 80 | + recorded = golden["expected_hf_token_count"] | |
| 81 | + assert actual == recorded, ( | |
| 82 | + f"chatml/{golden['scenario']}: HF re-render={actual}, " | |
| 83 | + f"golden={recorded}. Template drift upstream? Regenerate via " | |
| 84 | + "scripts/refresh-chat-template-goldens.py after reviewing." | |
| 85 | + ) | |
| 86 | + | |
| 87 | + | |
| 88 | +@pytest.mark.slow | |
| 89 | +def test_closed_loop_go_vs_jinja_chatml() -> None: | |
| 90 | + """Full closed-loop: Ollama `prompt_eval_count` == HF `apply_chat_template` len. | |
| 91 | + | |
| 92 | + Expects `OLLAMA_NAME` in the environment to point at a registered | |
| 93 | + chatml model. The weekly CI workflow sets this after running | |
| 94 | + `dlm export` on the tiny-model fixture. | |
| 95 | + """ | |
| 96 | + if shutil.which("ollama") is None: | |
| 97 | + pytest.skip("ollama binary not on PATH.") | |
| 98 | + | |
| 99 | + ollama_name = os.environ.get("OLLAMA_NAME") | |
| 100 | + if not ollama_name: | |
| 101 | + pytest.skip("OLLAMA_NAME not set; weekly workflow or local export registers one.") | |
| 102 | + | |
| 103 | + goldens = _load_chatml_goldens() | |
| 104 | + if not goldens: | |
| 105 | + pytest.skip("no chatml goldens on disk.") | |
| 106 | + | |
| 107 | + try: | |
| 108 | + from tests.fixtures.tiny_model import tiny_model_path | |
| 109 | + from transformers import AutoTokenizer | |
| 110 | + | |
| 111 | + tokenizer = AutoTokenizer.from_pretrained( | |
| 112 | + str(tiny_model_path()), | |
| 113 | + use_fast=True, | |
| 114 | + trust_remote_code=False, | |
| 115 | + ) | |
| 116 | + except Exception as exc: | |
| 117 | + pytest.skip(f"tokenizer setup failed: {exc}") | |
| 118 | + | |
| 119 | + for golden in goldens: | |
| 120 | + verify_token_count( | |
| 121 | + ollama_name=ollama_name, | |
| 122 | + hf_tokenizer=tokenizer, | |
| 123 | + messages=golden["messages"], | |
| 124 | + scenario=f"chatml/{golden['scenario']}", | |
| 125 | + ) | |