tenseleyflow/documentlanguagemodel / a3db86f

Browse files

feat(export): chat-template golden matrix + refresh script (sprint 12.6)

Authored by espadonne
SHA
a3db86fa0fc4a56ffbb1bdf541ad1680bb26fb82
Parents
8ab516d
Tree
1eb06e7

9 changed files

StatusFile+-
A scripts/refresh-chat-template-goldens.py 218 0
A tests/golden/chat-templates/chatml/empty-user.json 17 0
A tests/golden/chat-templates/chatml/multi-system.json 21 0
A tests/golden/chat-templates/chatml/multi-turn.json 25 0
A tests/golden/chat-templates/chatml/single-turn.json 17 0
A tests/golden/chat-templates/chatml/system-only.json 17 0
A tests/golden/chat-templates/chatml/tool-call.json 21 0
A tests/golden/chat-templates/dialect-specs.json 11 0
A tests/golden/chat-templates/scenarios.json 57 0
scripts/refresh-chat-template-goldens.pyadded
@@ -0,0 +1,218 @@
1
+#!/usr/bin/env python3
2
+"""Regenerate Sprint 12.6's per-dialect chat-template token-count goldens.
3
+
4
+For every registered Go-template dialect that has a representative
5
+base spec in the registry, this script:
6
+
7
+1. Loads the HF tokenizer for the representative base.
8
+2. Walks the shared scenario matrix (`tests/golden/chat-templates/
9
+   scenarios.json`).
10
+3. For each scenario, renders via `apply_chat_template(...,
11
+   add_generation_prompt=True, tokenize=True)` and records the
12
+   token count.
13
+4. Writes `tests/golden/chat-templates/<dialect>/<scenario>.json`.
14
+
15
+The emitted token counts are stable across runs against the same
16
+pinned base revision + transformers version — they power the
17
+Sprint 12.6 closed-loop check (Ollama Go template's
18
+`prompt_eval_count` must equal these HF counts).
19
+
20
+Usage:
21
+    uv run python scripts/refresh-chat-template-goldens.py
22
+    uv run python scripts/refresh-chat-template-goldens.py --check
23
+    uv run python scripts/refresh-chat-template-goldens.py --dialect chatml
24
+
25
+Requires a hot HF cache for each dialect's representative base —
26
+`--dialect NAME` lets you refresh one at a time if others aren't cached.
27
+
28
+`--check` exits 0 when every existing golden matches the freshly-
29
+computed value; exits 1 + prints a diff otherwise. Used by the
30
+weekly drift workflow and by operators validating a base bump.
31
+"""
32
+
33
+from __future__ import annotations
34
+
35
+import argparse
36
+import json
37
+import sys
38
+from datetime import UTC, datetime
39
+from pathlib import Path
40
+from typing import Any
41
+
42
# Layout anchors: this script lives in scripts/, so the repo root is one
# level above this file's directory.
_REPO_ROOT = Path(__file__).resolve().parents[1]
# All chat-template goldens live under tests/golden/chat-templates/.
_GOLDENS_ROOT = _REPO_ROOT / "tests" / "golden" / "chat-templates"
# Shared scenario matrix that every dialect is rendered against.
_SCENARIOS_PATH = _GOLDENS_ROOT / "scenarios.json"
# Maps each dialect to its representative registry key (or null to skip).
_DIALECT_SPECS_PATH = _GOLDENS_ROOT / "dialect-specs.json"
46
+
47
+
48
def _load_scenarios() -> list[dict[str, Any]]:
    """Read and return the shared scenario list from ``scenarios.json``."""
    payload: dict[str, Any] = json.loads(_SCENARIOS_PATH.read_text(encoding="utf-8"))
    result: list[dict[str, Any]] = payload["scenarios"]
    return result
52
+
53
+
54
def _load_dialect_specs() -> dict[str, str | None]:
    """Return dialect -> representative registry key, dropping ``_``-prefixed metadata keys."""
    raw = json.loads(_DIALECT_SPECS_PATH.read_text(encoding="utf-8"))
    return {name: key for name, key in raw.items() if not name.startswith("_")}
57
+
58
+
59
def _golden_path(dialect: str, scenario_name: str) -> Path:
    """Location of the golden file for one (dialect, scenario) pair."""
    return _GOLDENS_ROOT.joinpath(dialect, f"{scenario_name}.json")
61
+
62
+
63
+def _compute_token_count(tokenizer: Any, messages: list[dict[str, str]]) -> int:
64
+    # `return_dict=False` makes HF return a plain `list[int]`; without it
65
+    # newer tokenizers hand back a `BatchEncoding` whose `len(...)` is
66
+    # the number of keys (2), not the number of tokens.
67
+    rendered = tokenizer.apply_chat_template(
68
+        messages,
69
+        add_generation_prompt=True,
70
+        tokenize=True,
71
+        return_dict=False,
72
+    )
73
+    return len(rendered)
74
+
75
+
76
def _load_tokenizer(registry_key: str) -> Any:
    """Instantiate the pinned HF tokenizer for one registry base entry.

    Imports are deferred so that simply importing this module stays cheap
    and dependency-free.
    """
    from dlm.base_models import BASE_MODELS
    from transformers import AutoTokenizer

    base = BASE_MODELS[registry_key]
    # `use_fast=True` is the default but we spell it for clarity —
    # `apply_chat_template` behaves identically across fast/slow.
    return AutoTokenizer.from_pretrained(
        base.hf_id,
        revision=base.revision,
        use_fast=True,
        trust_remote_code=False,
    )
90
+
91
+
92
+def _write_golden(
93
+    path: Path,
94
+    *,
95
+    dialect: str,
96
+    scenario: dict[str, Any],
97
+    registry_key: str,
98
+    token_count: int,
99
+) -> None:
100
+    path.parent.mkdir(parents=True, exist_ok=True)
101
+    blob: dict[str, Any] = {
102
+        "dialect": dialect,
103
+        "scenario": scenario["name"],
104
+        "representative_base": registry_key,
105
+        "messages": scenario["messages"],
106
+        "expected_hf_token_count": token_count,
107
+        "regenerated_at": datetime.now(UTC).replace(tzinfo=None, microsecond=0).isoformat(),
108
+    }
109
+    path.write_text(json.dumps(blob, indent=2) + "\n", encoding="utf-8")
110
+
111
+
112
+def _read_recorded(path: Path) -> int | None:
113
+    if not path.is_file():
114
+        return None
115
+    try:
116
+        blob = json.loads(path.read_text(encoding="utf-8"))
117
+    except (OSError, json.JSONDecodeError):
118
+        return None
119
+    val = blob.get("expected_hf_token_count")
120
+    return val if isinstance(val, int) else None
121
+
122
+
123
def _refresh_dialect(
    dialect: str,
    registry_key: str | None,
    scenarios: list[dict[str, Any]],
    *,
    check: bool,
) -> tuple[int, int]:
    """Return `(written_or_matched, drifted)` counts for reporting."""
    if registry_key is None:
        print(f"[skip] {dialect}: no representative base in registry")
        return (0, 0)

    print(f"[load] {dialect}: using {registry_key}")
    tokenizer = _load_tokenizer(registry_key)

    ok_count = 0
    drift_count = 0
    for scenario in scenarios:
        name = scenario["name"]
        target = _golden_path(dialect, name)
        actual = _compute_token_count(tokenizer, scenario["messages"])
        recorded = _read_recorded(target)

        if not check:
            # Refresh mode: always rewrite; "=" marks an unchanged count.
            _write_golden(
                target,
                dialect=dialect,
                scenario=scenario,
                registry_key=registry_key,
                token_count=actual,
            )
            marker = "=" if recorded == actual else "+"
            print(f"  [{marker}] {name}: {actual} tokens")
            ok_count += 1
            continue

        # Check mode: compare only; never touch the files on disk.
        if recorded is None:
            print(f"  [MISS] {name}: no golden on disk")
            drift_count += 1
        elif recorded == actual:
            ok_count += 1
        else:
            print(
                f"  [DRIFT] {name}: "
                f"golden={recorded} actual={actual} delta={actual - recorded:+d}"
            )
            drift_count += 1

    return ok_count, drift_count
170
+
171
+
172
def main() -> int:
    """CLI entry point; returns the process exit status (0/1/2)."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--check",
        action="store_true",
        help="Exit non-zero on drift; don't write.",
    )
    parser.add_argument(
        "--dialect",
        help="Refresh only this dialect (default: all).",
    )
    args = parser.parse_args()

    scenarios = _load_scenarios()
    dialect_specs = _load_dialect_specs()
    if args.dialect is not None:
        # Narrow the map to the one requested dialect; EAFP on the lookup.
        try:
            dialect_specs = {args.dialect: dialect_specs[args.dialect]}
        except KeyError:
            print(
                f"error: unknown dialect {args.dialect!r}; known: {sorted(dialect_specs)}",
                file=sys.stderr,
            )
            return 2

    total_ok = total_drifted = 0
    for dialect, registry_key in dialect_specs.items():
        ok, drifted = _refresh_dialect(dialect, registry_key, scenarios, check=args.check)
        total_ok += ok
        total_drifted += drifted

    if not args.check:
        print(f"\nOK: {total_ok} golden(s) written.")
        return 0
    if total_drifted:
        print(
            f"\nFAIL: {total_drifted} golden(s) drifted. Run without "
            "`--check` to regenerate, then review the diff.",
            file=sys.stderr,
        )
        return 1
    print(f"\nOK: {total_ok} golden(s) match current tokenizers.")
    return 0
215
+
216
+
217
if __name__ == "__main__":
    # Equivalent to sys.exit(main()): propagate the return code as the exit status.
    raise SystemExit(main())
tests/golden/chat-templates/chatml/empty-user.jsonadded
@@ -0,0 +1,17 @@
1
+{
2
+  "dialect": "chatml",
3
+  "scenario": "empty-user",
4
+  "representative_base": "smollm2-135m",
5
+  "messages": [
6
+    {
7
+      "role": "system",
8
+      "content": "You handle empty user messages gracefully."
9
+    },
10
+    {
11
+      "role": "user",
12
+      "content": ""
13
+    }
14
+  ],
15
+  "expected_hf_token_count": 21,
16
+  "regenerated_at": "2026-04-19T21:34:04"
17
+}
tests/golden/chat-templates/chatml/multi-system.jsonadded
@@ -0,0 +1,21 @@
1
+{
2
+  "dialect": "chatml",
3
+  "scenario": "multi-system",
4
+  "representative_base": "smollm2-135m",
5
+  "messages": [
6
+    {
7
+      "role": "system",
8
+      "content": "You are concise."
9
+    },
10
+    {
11
+      "role": "system",
12
+      "content": "You always greet first."
13
+    },
14
+    {
15
+      "role": "user",
16
+      "content": "Hello!"
17
+    }
18
+  ],
19
+  "expected_hf_token_count": 30,
20
+  "regenerated_at": "2026-04-19T21:34:04"
21
+}
tests/golden/chat-templates/chatml/multi-turn.jsonadded
@@ -0,0 +1,25 @@
1
+{
2
+  "dialect": "chatml",
3
+  "scenario": "multi-turn",
4
+  "representative_base": "smollm2-135m",
5
+  "messages": [
6
+    {
7
+      "role": "system",
8
+      "content": "You are a helpful assistant."
9
+    },
10
+    {
11
+      "role": "user",
12
+      "content": "What is 2 + 2?"
13
+    },
14
+    {
15
+      "role": "assistant",
16
+      "content": "4"
17
+    },
18
+    {
19
+      "role": "user",
20
+      "content": "And 3 + 3?"
21
+    }
22
+  ],
23
+  "expected_hf_token_count": 47,
24
+  "regenerated_at": "2026-04-19T21:34:04"
25
+}
tests/golden/chat-templates/chatml/single-turn.jsonadded
@@ -0,0 +1,17 @@
1
+{
2
+  "dialect": "chatml",
3
+  "scenario": "single-turn",
4
+  "representative_base": "smollm2-135m",
5
+  "messages": [
6
+    {
7
+      "role": "system",
8
+      "content": "You are a helpful assistant."
9
+    },
10
+    {
11
+      "role": "user",
12
+      "content": "What is 2 + 2?"
13
+    }
14
+  ],
15
+  "expected_hf_token_count": 28,
16
+  "regenerated_at": "2026-04-19T21:34:04"
17
+}
tests/golden/chat-templates/chatml/system-only.jsonadded
@@ -0,0 +1,17 @@
1
+{
2
+  "dialect": "chatml",
3
+  "scenario": "system-only",
4
+  "representative_base": "smollm2-135m",
5
+  "messages": [
6
+    {
7
+      "role": "system",
8
+      "content": "You are a helpful assistant."
9
+    },
10
+    {
11
+      "role": "user",
12
+      "content": ""
13
+    }
14
+  ],
15
+  "expected_hf_token_count": 20,
16
+  "regenerated_at": "2026-04-19T21:34:04"
17
+}
tests/golden/chat-templates/chatml/tool-call.jsonadded
@@ -0,0 +1,21 @@
1
+{
2
+  "dialect": "chatml",
3
+  "scenario": "tool-call",
4
+  "representative_base": "smollm2-135m",
5
+  "messages": [
6
+    {
7
+      "role": "system",
8
+      "content": "You can call tools."
9
+    },
10
+    {
11
+      "role": "user",
12
+      "content": "Call the get_weather tool for London."
13
+    },
14
+    {
15
+      "role": "assistant",
16
+      "content": "I'll call get_weather(city='London')."
17
+    }
18
+  ],
19
+  "expected_hf_token_count": 45,
20
+  "regenerated_at": "2026-04-19T21:34:04"
21
+}
tests/golden/chat-templates/dialect-specs.jsonadded
@@ -0,0 +1,11 @@
1
+{
2
+  "_comment": [
3
+    "Maps each Go-template dialect to the representative registry key",
4
+    "whose HF tokenizer + chat template define the golden token counts.",
5
+    "Picks small bases so the refresh script can run on any dev machine."
6
+  ],
7
+  "chatml": "smollm2-135m",
8
+  "llama3": "llama-3.2-1b",
9
+  "phi3": "phi-3.5-mini",
10
+  "mistral": null
11
+}
tests/golden/chat-templates/scenarios.jsonadded
@@ -0,0 +1,57 @@
1
+{
2
+  "_comment": [
3
+    "Canonical message-set scenarios used by Sprint 12.6's Go↔Jinja",
4
+    "closed-loop check. Each registered dialect is expected to render",
5
+    "every scenario; the per-dialect golden under ./<dialect>/ pins",
6
+    "the HF token count against which Ollama's prompt_eval_count must",
7
+    "match at verification time."
8
+  ],
9
+  "scenarios": [
10
+    {
11
+      "name": "system-only",
12
+      "messages": [
13
+        {"role": "system", "content": "You are a helpful assistant."},
14
+        {"role": "user", "content": ""}
15
+      ]
16
+    },
17
+    {
18
+      "name": "single-turn",
19
+      "messages": [
20
+        {"role": "system", "content": "You are a helpful assistant."},
21
+        {"role": "user", "content": "What is 2 + 2?"}
22
+      ]
23
+    },
24
+    {
25
+      "name": "multi-turn",
26
+      "messages": [
27
+        {"role": "system", "content": "You are a helpful assistant."},
28
+        {"role": "user", "content": "What is 2 + 2?"},
29
+        {"role": "assistant", "content": "4"},
30
+        {"role": "user", "content": "And 3 + 3?"}
31
+      ]
32
+    },
33
+    {
34
+      "name": "tool-call",
35
+      "messages": [
36
+        {"role": "system", "content": "You can call tools."},
37
+        {"role": "user", "content": "Call the get_weather tool for London."},
38
+        {"role": "assistant", "content": "I'll call get_weather(city='London')."}
39
+      ]
40
+    },
41
+    {
42
+      "name": "multi-system",
43
+      "messages": [
44
+        {"role": "system", "content": "You are concise."},
45
+        {"role": "system", "content": "You always greet first."},
46
+        {"role": "user", "content": "Hello!"}
47
+      ]
48
+    },
49
+    {
50
+      "name": "empty-user",
51
+      "messages": [
52
+        {"role": "system", "content": "You handle empty user messages gracefully."},
53
+        {"role": "user", "content": ""}
54
+      ]
55
+    }
56
+  ]
57
+}