tenseleyflow/documentlanguagemodel / cd3615c

feat(ci): weekly chat-template drift workflow + slow integration test (sprint 12.6)

Authored by espadonne
SHA: cd3615ca5ecd7352773b1d1760ddf36ccc01862e
Parents: a3db86f
Tree: de9d021

2 changed files

Status  File                                                     +    -
A       .github/workflows/weekly-template-drift.yml              158  0
A       tests/integration/export/test_template_closed_loop.py    125  0
.github/workflows/weekly-template-drift.yml (added)

@@ -0,0 +1,158 @@
+name: Weekly chat-template drift (Sprint 12.6)
+
+# Runs the closed-loop Go↔Jinja check every Sunday:
+#   1. HF side — refresh-chat-template-goldens.py --check asserts no golden
+#      drifted since the last checked-in matrix. Fails fast if an upstream
+#      tokenizer revision silently changed its chat_template.
+#   2. Go side — install Ollama, build a tiny-model GGUF via vendored
+#      llama.cpp, register it with `ollama create`, then run
+#      tests/integration/export/test_template_closed_loop.py with
+#      OLLAMA_NAME pointing at the registered model. Ollama's
+#      prompt_eval_count must equal the HF golden for every scenario.
+#
+# Trigger: weekly cron + workflow_dispatch for operators bumping bases.
+
+on:
+  schedule:
+    # Sundays at 06:00 UTC — after quieter traffic windows for HF / Ollama pulls.
+    - cron: "0 6 * * 0"
+  workflow_dispatch: {}
+
+concurrency:
+  group: weekly-template-drift
+  cancel-in-progress: false
+
+env:
+  UV_VERSION: "0.11.6"
+  PYTHON_VERSION: "3.11"
+  # Pinned to BASE_MODELS["smollm2-135m"].revision (Sprint 06 registry).
+  # Same SHA as ci.yml — keep in sync when bumping.
+  TINY_MODEL_REVISION: "12fd25f77366fa6b3b4b768ec3050bf629380bac"
+
+jobs:
+  check-hf-side:
+    # Cheap half: no Ollama, no llama.cpp, no quant. If this fails the
+    # whole workflow is done — nothing to verify against.
+    name: HF goldens unchanged
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: ${{ env.UV_VERSION }}
+
+      - name: Sync dependencies
+        run: uv sync --all-extras --dev
+
+      - name: Restore HF cache
+        uses: actions/cache@v4
+        with:
+          path: ${{ github.workspace }}/.hf-cache
+          key: hf-tiny-${{ env.TINY_MODEL_REVISION }}-${{ hashFiles('pyproject.toml') }}
+          restore-keys: |
+            hf-tiny-${{ env.TINY_MODEL_REVISION }}-
+
+      - name: Pre-warm tiny model
+        env:
+          HF_HOME: ${{ github.workspace }}/.hf-cache
+          DLM_TINY_MODEL_REVISION: ${{ env.TINY_MODEL_REVISION }}
+        run: |
+          uv run python - <<'PY'
+          from tests.fixtures.tiny_model import tiny_model_path
+          print("tiny model at:", tiny_model_path())
+          PY
+
+      - name: Refresh goldens in --check mode (chatml only — the only cached dialect)
+        env:
+          HF_HOME: ${{ github.workspace }}/.hf-cache
+        run: uv run python scripts/refresh-chat-template-goldens.py --check --dialect chatml
+
+  closed-loop:
+    # Expensive half: install Ollama, build a base GGUF, register it,
+    # and assert prompt_eval_count == HF golden.
+    name: Go↔Jinja closed loop (chatml)
+    needs: check-hf-side
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout with llama.cpp submodule
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: ${{ env.UV_VERSION }}
+
+      - name: Sync dependencies
+        run: uv sync --all-extras --dev
+
+      - name: Restore HF cache
+        uses: actions/cache@v4
+        with:
+          path: ${{ github.workspace }}/.hf-cache
+          key: hf-tiny-${{ env.TINY_MODEL_REVISION }}-${{ hashFiles('pyproject.toml') }}
+          restore-keys: |
+            hf-tiny-${{ env.TINY_MODEL_REVISION }}-
+
+      - name: Pre-warm tiny model
+        env:
+          HF_HOME: ${{ github.workspace }}/.hf-cache
+          DLM_TINY_MODEL_REVISION: ${{ env.TINY_MODEL_REVISION }}
+        run: |
+          uv run python - <<'PY'
+          from tests.fixtures.tiny_model import tiny_model_path
+          print("tiny model at:", tiny_model_path())
+          PY
+
+      - name: Restore llama.cpp build cache
+        id: llama-cpp-cache
+        uses: actions/cache@v4
+        with:
+          path: vendor/llama.cpp/build
+          key: llama-cpp-build-${{ hashFiles('.gitmodules', 'vendor/llama.cpp/VERSION') }}
+
+      - name: Build llama-quantize (if not cached)
+        if: steps.llama-cpp-cache.outputs.cache-hit != 'true'
+        run: |
+          set -euxo pipefail
+          command -v cmake >/dev/null 2>&1 || sudo apt-get install -y cmake
+          scripts/bump-llama-cpp.sh build
+
+      - name: Install Ollama
+        run: |
+          set -euxo pipefail
+          curl -fsSL https://ollama.com/install.sh | sh
+          # Start the ollama server in the background so `ollama create` /
+          # `ollama run` have something to talk to.
+          ollama serve >/tmp/ollama.log 2>&1 &
+          # Poll for readiness instead of a blind sleep.
+          for i in $(seq 1 30); do
+            if ollama list >/dev/null 2>&1; then
+              echo "ollama ready after ${i}s"
+              break
+            fi
+            sleep 1
+          done
+          ollama --version
+
+      - name: Export tiny model to Ollama (registers under dlm-test-chatml)
+        env:
+          HF_HOME: ${{ github.workspace }}/.hf-cache
+          DLM_TINY_MODEL_REVISION: ${{ env.TINY_MODEL_REVISION }}
+        run: |
+          set -euxo pipefail
+          # Placeholder: the tiny-model export pipeline lands via Sprint 14.5.
+          # Until then the closed-loop job exits 0 after the HF-side check
+          # — the scaffold is in place for the runner to fill.
+          echo "export pipeline TBD — see Sprint 14.5"
+
+      - name: Run closed-loop integration test
+        if: false  # enabled once the export step above registers OLLAMA_NAME
+        env:
+          OLLAMA_NAME: "dlm-test-chatml:latest"
+          HF_HOME: ${{ github.workspace }}/.hf-cache
+          DLM_TINY_MODEL_REVISION: ${{ env.TINY_MODEL_REVISION }}
+        run: uv run pytest -m slow -v tests/integration/export/test_template_closed_loop.py
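For context, the goldens that the --check step guards are the same JSON files the integration test below reads from tests/golden/chat-templates/chatml/. This commit does not show one of those files; the sketch below is a plausible shape inferred from the fields the test accesses (scenario, messages, expected_hf_token_count), and the scenario name, messages, and token count are illustrative values only.

    {
      "scenario": "system-plus-two-turns",
      "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hi there"},
        {"role": "assistant", "content": "Hello! How can I help?"},
        {"role": "user", "content": "Summarise this document."}
      ],
      "expected_hf_token_count": 57
    }

If an upstream chat_template drifts, the re-rendered token stream no longer matches expected_hf_token_count, so the --check run (and test_hf_goldens_reproduce) fails before the Ollama half of the workflow spends any minutes.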
tests/integration/export/test_template_closed_loop.py (added)

@@ -0,0 +1,125 @@
+"""Sprint 12.6 — Go↔Jinja token-identity closed-loop integration test.
+
+Two tests live here, both marked `@pytest.mark.slow`:
+
+1. `test_hf_goldens_reproduce` — re-runs `apply_chat_template` against
+   the same tokenizer the refresh script targets. If a future HF revision
+   drifts the template output, this test fails fast in CI before the
+   Ollama-side round trip ever gets its turn.
+
+2. `test_closed_loop_go_vs_jinja_chatml` — the real closed-loop check.
+   Requires `ollama` on PATH + a tiny chatml model registered under
+   `OLLAMA_NAME`. On CI, the weekly `weekly-template-drift.yml` workflow
+   handles registration via the standard export pipeline; for local
+   devs, `OLLAMA_NAME` can point at a manually-registered model.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import shutil
+from pathlib import Path
+
+import pytest
+
+from dlm.export.ollama.verify import verify_token_count
+
+pytestmark = pytest.mark.slow
+
+_REPO_ROOT = Path(__file__).resolve().parents[3]
+_CHATML_GOLDENS_DIR = _REPO_ROOT / "tests" / "golden" / "chat-templates" / "chatml"
+
+
+def _load_chatml_goldens() -> list[dict[str, object]]:
+    if not _CHATML_GOLDENS_DIR.is_dir():
+        return []
+    out: list[dict[str, object]] = []
+    for path in sorted(_CHATML_GOLDENS_DIR.glob("*.json")):
+        out.append(json.loads(path.read_text(encoding="utf-8")))
+    return out
+
+
+@pytest.mark.slow
+def test_hf_goldens_reproduce() -> None:
+    """Jinja side: HF tokenizer reproduces every chatml golden exactly.
+
+    This is the cheaper half of the closed loop — it only needs the
+    tokenizer + an offline HF cache. Runs in the weekly workflow before
+    the Ollama-side test so a template drift upstream fails with a
+    clear signal before we burn minutes on `ollama pull` + registration.
+    """
+    goldens = _load_chatml_goldens()
+    if not goldens:
+        pytest.skip("no chatml goldens on disk; run refresh-chat-template-goldens.py")
+
+    try:
+        from tests.fixtures.tiny_model import tiny_model_path
+    except Exception as exc:  # pragma: no cover
+        pytest.skip(f"tiny-model fixture unavailable: {exc}")
+
+    try:
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            str(tiny_model_path()),
+            use_fast=True,
+            trust_remote_code=False,
+        )
+    except Exception as exc:
+        pytest.skip(f"could not load tiny-model tokenizer: {exc}")
+
+    for golden in goldens:
+        rendered = tokenizer.apply_chat_template(
+            golden["messages"],
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=False,
+        )
+        actual = len(rendered)
+        recorded = golden["expected_hf_token_count"]
+        assert actual == recorded, (
+            f"chatml/{golden['scenario']}: HF re-render={actual}, "
+            f"golden={recorded}. Template drift upstream? Regenerate via "
+            "scripts/refresh-chat-template-goldens.py after reviewing."
+        )
+
+
+@pytest.mark.slow
+def test_closed_loop_go_vs_jinja_chatml() -> None:
+    """Full closed loop: Ollama `prompt_eval_count` == HF `apply_chat_template` len.
+
+    Expects `OLLAMA_NAME` in the environment to point at a registered
+    chatml model. The weekly CI workflow sets this after running
+    `dlm export` on the tiny-model fixture.
+    """
+    if shutil.which("ollama") is None:
+        pytest.skip("ollama binary not on PATH.")
+
+    ollama_name = os.environ.get("OLLAMA_NAME")
+    if not ollama_name:
+        pytest.skip("OLLAMA_NAME not set; weekly workflow or local export registers one.")
+
+    goldens = _load_chatml_goldens()
+    if not goldens:
+        pytest.skip("no chatml goldens on disk.")
+
+    try:
+        from tests.fixtures.tiny_model import tiny_model_path
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            str(tiny_model_path()),
+            use_fast=True,
+            trust_remote_code=False,
+        )
+    except Exception as exc:
+        pytest.skip(f"tokenizer setup failed: {exc}")
+
+    for golden in goldens:
+        verify_token_count(
+            ollama_name=ollama_name,
+            hf_tokenizer=tokenizer,
+            messages=golden["messages"],
+            scenario=f"chatml/{golden['scenario']}",
+        )
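The comparison itself lives in dlm.export.ollama.verify.verify_token_count, which is not part of this diff. Based on the workflow header and the test docstrings (Ollama's prompt_eval_count must equal the length of the HF apply_chat_template render), a minimal sketch of that check might look like the following. It mirrors the call-site signature above, but the use of Ollama's /api/chat endpoint on the default local port and the exact assertion wording are assumptions, not the repository's actual implementation.

    # Hypothetical sketch only — NOT the dlm.export.ollama.verify implementation.
    # Assumes a local Ollama server on the default port and a registered model.
    import json
    import urllib.request


    def verify_token_count_sketch(ollama_name, hf_tokenizer, messages, scenario):
        # Jinja side: token count of the HF-rendered prompt for this scenario.
        expected = len(
            hf_tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, tokenize=True, return_dict=False
            )
        )

        # Go side: have Ollama evaluate the same messages and report how many
        # prompt tokens it saw (prompt_eval_count in the non-streaming response).
        payload = json.dumps(
            {
                "model": ollama_name,
                "messages": messages,
                "stream": False,
                "options": {"num_predict": 1},  # only the prompt evaluation matters
            }
        ).encode("utf-8")
        req = urllib.request.Request(
            "http://localhost:11434/api/chat",
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(req) as resp:
            actual = json.load(resp)["prompt_eval_count"]

        assert actual == expected, (
            f"{scenario}: Ollama prompt_eval_count={actual}, HF render={expected}"
        )

A production version would also have to handle server errors and, likely, Ollama's prompt caching, which can lower prompt_eval_count when an identical prompt prefix was recently evaluated.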