`cd3615c`

feat(ci): weekly chat-template drift workflow + slow integration test (sprint 12.6)

Authored by

espadonne 3 weeks ago

SHA: cd3615ca5ecd7352773b1d1760ddf36ccc01862e
Parents: a3db86f
Tree: de9d021

2 changed files

Status	File	+	-
A	`.github/workflows/weekly-template-drift.yml`	158	0
A	`tests/integration/export/test_template_closed_loop.py`	125	0

.github/workflows/weekly-template-drift.ymladded

 +name: Weekly chat-template drift (Sprint 12.6)
++
 +# Runs the closed-loop Go↔Jinja check every Sunday:
 +#   1. HF side — refresh-chat-template-goldens.py --check asserts no golden
 +#      drifted since the last checked-in matrix. Fails fast if an upstream
 +#      tokenizer revision silently changed its chat_template.
 +#   2. Go side — install Ollama, build a tiny-model GGUF via vendored
 +#      llama.cpp, register it with `ollama create`, then run
 +#      tests/integration/export/test_template_closed_loop.py with
 +#      OLLAMA_NAME pointing at the registered model. Ollama's
 +#      prompt_eval_count must equal the HF golden for every scenario.
 +#
 +# Trigger: weekly cron + workflow_dispatch for operators bumping bases.
++
 +on:
 +  schedule:
 +    # Sundays at 06:00 UTC — after quieter traffic windows for HF / Ollama pulls.
 +    - cron: "0 6 * * 0"
 +  workflow_dispatch: {}
++
 +concurrency:
 +  group: weekly-template-drift
 +  cancel-in-progress: false
++
 +env:
 +  UV_VERSION: "0.11.6"
 +  PYTHON_VERSION: "3.11"
 +  # Pinned to BASE_MODELS["smollm2-135m"].revision (Sprint 06 registry).
 +  # Same SHA as ci.yml — keep in sync when bumping.
 +  TINY_MODEL_REVISION: "12fd25f77366fa6b3b4b768ec3050bf629380bac"
++
 +jobs:
 +  check-hf-side:
 +    # Cheap half: no Ollama, no llama.cpp, no quant. If this fails the
 +    # whole workflow is done — nothing to verify against.
 +    name: HF goldens unchanged
 +    runs-on: ubuntu-latest
 +    steps:
 +      - uses: actions/checkout@v4
++
 +      - name: Install uv
 +        uses: astral-sh/setup-uv@v4
 +        with:
 +          version: ${{ env.UV_VERSION }}
++
 +      - name: Sync dependencies
 +        run: uv sync --all-extras --dev
++
 +      - name: Restore HF cache
 +        uses: actions/cache@v4
 +        with:
 +          path: ${{ github.workspace }}/.hf-cache
 +          key: hf-tiny-${{ env.TINY_MODEL_REVISION }}-${{ hashFiles('pyproject.toml') }}
 +          restore-keys: |
 +            hf-tiny-${{ env.TINY_MODEL_REVISION }}-
++
 +      - name: Pre-warm tiny model
 +        env:
 +          HF_HOME: ${{ github.workspace }}/.hf-cache
 +          DLM_TINY_MODEL_REVISION: ${{ env.TINY_MODEL_REVISION }}
 +        run: |
 +          uv run python - <<'PY'
 +          from tests.fixtures.tiny_model import tiny_model_path
 +          print("tiny model at:", tiny_model_path())
 +          PY
++
 +      - name: Refresh goldens in --check mode (chatml only — only cached dialect)
 +        env:
 +          HF_HOME: ${{ github.workspace }}/.hf-cache
 +        run: uv run python scripts/refresh-chat-template-goldens.py --check --dialect chatml
++
 +  closed-loop:
 +    # Expensive half: install Ollama, build a base GGUF, register it,
 +    # and assert prompt_eval_count == HF golden.
 +    name: Go↔Jinja closed loop (chatml)
 +    needs: check-hf-side
 +    runs-on: ubuntu-latest
 +    steps:
 +      - name: Checkout with llama.cpp submodule
 +        uses: actions/checkout@v4
 +        with:
 +          submodules: recursive
++
 +      - name: Install uv
 +        uses: astral-sh/setup-uv@v4
 +        with:
 +          version: ${{ env.UV_VERSION }}
++
 +      - name: Sync dependencies
 +        run: uv sync --all-extras --dev
++
 +      - name: Restore HF cache
 +        uses: actions/cache@v4
 +        with:
 +          path: ${{ github.workspace }}/.hf-cache
 +          key: hf-tiny-${{ env.TINY_MODEL_REVISION }}-${{ hashFiles('pyproject.toml') }}
 +          restore-keys: |
 +            hf-tiny-${{ env.TINY_MODEL_REVISION }}-
++
 +      - name: Pre-warm tiny model
 +        env:
 +          HF_HOME: ${{ github.workspace }}/.hf-cache
 +          DLM_TINY_MODEL_REVISION: ${{ env.TINY_MODEL_REVISION }}
 +        run: |
 +          uv run python - <<'PY'
 +          from tests.fixtures.tiny_model import tiny_model_path
 +          print("tiny model at:", tiny_model_path())
 +          PY
++
 +      - name: Restore llama.cpp build cache
 +        id: llama-cpp-cache
 +        uses: actions/cache@v4
 +        with:
 +          path: vendor/llama.cpp/build
 +          key: llama-cpp-build-${{ hashFiles('.gitmodules', 'vendor/llama.cpp/VERSION') }}
++
 +      - name: Build llama-quantize (if not cached)
 +        if: steps.llama-cpp-cache.outputs.cache-hit != 'true'
 +        run: |
 +          set -euxo pipefail
 +          command -v cmake >/dev/null 2>&1 || sudo apt-get install -y cmake
 +          scripts/bump-llama-cpp.sh build
++
 +      - name: Install Ollama
 +        run: |
 +          set -euxo pipefail
 +          curl -fsSL https://ollama.com/install.sh | sh
 +          # Start the ollama server in the background so `ollama create` /
 +          # `ollama run` have something to talk to.
 +          ollama serve >/tmp/ollama.log 2>&1 &
 +          # Poll for readiness instead of a blind sleep.
 +          for i in $(seq 1 30); do
 +            if ollama list >/dev/null 2>&1; then
 +              echo "ollama ready after ${i}s"
 +              break
 +            fi
 +            sleep 1
 +          done
 +          ollama --version
++
 +      - name: Export tiny model to Ollama (registers under dlm-test-chatml)
 +        env:
 +          HF_HOME: ${{ github.workspace }}/.hf-cache
 +          DLM_TINY_MODEL_REVISION: ${{ env.TINY_MODEL_REVISION }}
 +        run: |
 +          set -euxo pipefail
 +          # Placeholder: the tiny-model export pipeline lands via Sprint 14.5.
 +          # Until then the closed-loop job exits 0 after the HF-side check
 +          # — the scaffold is in place for the runner to fill.
 +          echo "export pipeline TBD — see Sprint 14.5"
++
 +      - name: Run closed-loop integration test
 +        if: false  # enabled once the export step above registers OLLAMA_NAME
 +        env:
 +          OLLAMA_NAME: "dlm-test-chatml:latest"
 +          HF_HOME: ${{ github.workspace }}/.hf-cache
 +          DLM_TINY_MODEL_REVISION: ${{ env.TINY_MODEL_REVISION }}
 +        run: uv run pytest -m slow -v tests/integration/export/test_template_closed_loop.py

tests/integration/export/test_template_closed_loop.pyadded

 +"""Sprint 12.6 — Go↔Jinja token-identity closed-loop integration test.
++
 +Two tests live here, both marked `@pytest.mark.slow`:
++
 +1. `test_hf_goldens_reproduce` — re-runs `apply_chat_template` against
 +   the same tokenizer the refresh script targets. If a future HF revision
 +   drifts the template output, this test fails fast in CI before the
 +   Ollama-side round trip ever gets its turn.
++
 +2. `test_closed_loop_go_vs_jinja_chatml` — the real closed-loop check.
 +   Requires `ollama` on PATH + a tiny chatml model registered under
 +   `OLLAMA_NAME`. On CI, the weekly `weekly-template-drift.yml` workflow
 +   handles registration via the standard export pipeline; for local
 +   devs, `OLLAMA_NAME` can point at a manually-registered model.
 +"""
++
 +from __future__ import annotations
++
 +import json
 +import os
 +import shutil
 +from pathlib import Path
++
 +import pytest
++
 +from dlm.export.ollama.verify import verify_token_count
++
 +pytestmark = pytest.mark.slow
++
 +_REPO_ROOT = Path(__file__).resolve().parents[3]
 +_CHATML_GOLDENS_DIR = _REPO_ROOT / "tests" / "golden" / "chat-templates" / "chatml"
++
++
 +def _load_chatml_goldens() -> list[dict[str, object]]:
 +    if not _CHATML_GOLDENS_DIR.is_dir():
 +        return []
 +    out: list[dict[str, object]] = []
 +    for path in sorted(_CHATML_GOLDENS_DIR.glob("*.json")):
 +        out.append(json.loads(path.read_text(encoding="utf-8")))
 +    return out
++
++
 +@pytest.mark.slow
 +def test_hf_goldens_reproduce() -> None:
 +    """Jinja side: HF tokenizer reproduces every chatml golden exactly.
++
 +    This is the cheaper half of the closed-loop — it only needs the
 +    tokenizer + an offline HF cache. Runs in the weekly workflow before
 +    the Ollama-side test so a template drift upstream fails with a
 +    clear signal before we burn minutes on `ollama pull` + registration.
 +    """
 +    goldens = _load_chatml_goldens()
 +    if not goldens:
 +        pytest.skip("no chatml goldens on disk; run refresh-chat-template-goldens.py")
++
 +    try:
 +        from tests.fixtures.tiny_model import tiny_model_path
 +    except Exception as exc:  # pragma: no cover
 +        pytest.skip(f"tiny-model fixture unavailable: {exc}")
++
 +    try:
 +        from transformers import AutoTokenizer
++
 +        tokenizer = AutoTokenizer.from_pretrained(
 +            str(tiny_model_path()),
 +            use_fast=True,
 +            trust_remote_code=False,
 +        )
 +    except Exception as exc:
 +        pytest.skip(f"could not load tiny-model tokenizer: {exc}")
++
 +    for golden in goldens:
 +        rendered = tokenizer.apply_chat_template(
 +            golden["messages"],
 +            add_generation_prompt=True,
 +            tokenize=True,
 +            return_dict=False,
 +        )
 +        actual = len(rendered)
 +        recorded = golden["expected_hf_token_count"]
 +        assert actual == recorded, (
 +            f"chatml/{golden['scenario']}: HF re-render={actual}, "
 +            f"golden={recorded}. Template drift upstream? Regenerate via "
 +            "scripts/refresh-chat-template-goldens.py after reviewing."
 +        )
++
++
 +@pytest.mark.slow
 +def test_closed_loop_go_vs_jinja_chatml() -> None:
 +    """Full closed-loop: Ollama `prompt_eval_count` == HF `apply_chat_template` len.
++
 +    Expects `OLLAMA_NAME` in the environment to point at a registered
 +    chatml model. The weekly CI workflow sets this after running
 +    `dlm export` on the tiny-model fixture.
 +    """
 +    if shutil.which("ollama") is None:
 +        pytest.skip("ollama binary not on PATH.")
++
 +    ollama_name = os.environ.get("OLLAMA_NAME")
 +    if not ollama_name:
 +        pytest.skip("OLLAMA_NAME not set; weekly workflow or local export registers one.")
++
 +    goldens = _load_chatml_goldens()
 +    if not goldens:
 +        pytest.skip("no chatml goldens on disk.")
++
 +    try:
 +        from tests.fixtures.tiny_model import tiny_model_path
 +        from transformers import AutoTokenizer
++
 +        tokenizer = AutoTokenizer.from_pretrained(
 +            str(tiny_model_path()),
 +            use_fast=True,
 +            trust_remote_code=False,
 +        )
 +    except Exception as exc:
 +        pytest.skip(f"tokenizer setup failed: {exc}")
++
 +    for golden in goldens:
 +        verify_token_count(
 +            ollama_name=ollama_name,
 +            hf_tokenizer=tokenizer,
 +            messages=golden["messages"],
 +            scenario=f"chatml/{golden['scenario']}",
 +        )