tenseleyflow/documentlanguagemodel / 95402d2

Browse files

Scrub base model jargon

Authored by espadonne
SHA
95402d2749cc6da27f45a29d66ed72f56e81ce7f
Parents
5e24f0e
Tree
e00f9d0

10 changed files

Status  File  +  -
M src/dlm/base_models/errors.py 2 3
M src/dlm/base_models/license.py 15 16
M src/dlm/base_models/probes.py 20 22
M src/dlm/base_models/registry.py 22 24
M src/dlm/base_models/resolver.py 2 3
M src/dlm/base_models/schema.py 12 13
M src/dlm/base_models/templates/chatml.jinja 3 3
M src/dlm/base_models/templates/llama3.jinja 1 1
M src/dlm/base_models/templates/mistral.jinja 1 1
M src/dlm/base_models/templates/phi3.jinja 1 1
src/dlm/base_models/errors.py modified
@@ -65,9 +65,8 @@ class ProbeFailedError(BaseModelError):
6565
 class GatedModelError(BaseModelError):
6666
     """Model requires HuggingFace license acceptance and the user hasn't accepted.
6767
 
68
-    Lives here (not in Sprint 12b) because registry probes catch it
69
-    first; Sprint 12b's `dlm init --i-accept-license` flow writes the
70
-    acceptance record, but the error shape is owned here.
68
+    Lives here because registry probes catch it first; the acceptance
69
+    record is written elsewhere, but the error shape is owned here.
7170
     """
7271
 
7372
     def __init__(self, hf_id: str, license_url: str | None) -> None:
src/dlm/base_models/license.py modified
@@ -1,22 +1,21 @@
1
-"""License acceptance records for gated base models (Sprint 12b).
1
+"""License acceptance records for gated base models.
22
 
33
 The `BaseModelSpec` schema already carries `requires_acceptance`,
4
-`redistributable`, `license_spdx`, and `license_url` (Sprint 06). What
5
-Sprint 12b adds is the *acceptance record* — a small Pydantic model
6
-that stores "user X accepted license Y at time T via path Z", plus a
7
-helper that validates an `accept_license` flag against the spec.
4
+`redistributable`, `license_spdx`, and `license_url`. This module adds
5
+the *acceptance record* — a small Pydantic model that stores "user X
6
+accepted license Y at time T via path Z", plus a helper that validates
7
+an `accept_license` flag against the spec.
88
 
99
 `LicenseAcceptance` rides on two load-bearing files:
1010
 
11
-- `manifest.json.license_acceptance` (this sprint): the per-store
12
-  durable record; reads on every subsequent `dlm train` to verify
13
-  the acceptance fingerprint is still present.
14
-- Repo-level `dlm.lock.license_acceptance` (Sprint 15): the
15
-  determinism-contract mirror; divergence between the two triggers a
16
-  lock re-check.
11
+- `manifest.json.license_acceptance`: the per-store durable record;
12
+  read on every subsequent `dlm train` to verify the acceptance
13
+  fingerprint is still present.
14
+- Repo-level `dlm.lock.license_acceptance`: the determinism-contract
15
+  mirror; divergence between the two triggers a lock re-check.
1716
 
18
-The interactive prompt in `dlm init` is owned by Sprint 13; this
19
-module ships the data types + helpers that sprint will call.
17
+The interactive prompt in `dlm init` lives in the CLI layer; this
18
+module ships the data types + helpers that the prompt calls.
2019
 """
2120
 
2221
 from __future__ import annotations
@@ -38,12 +37,12 @@ class LicenseAcceptance(BaseModel):
3837
     `via` records *how* acceptance was captured:
3938
 
4039
     - `"cli_flag"` — `--i-accept-license` on init/train (explicit).
41
-    - `"interactive"` — `y/N` prompt (Sprint 13 UX).
42
-    - `"frontmatter"` — persisted in `.dlm` frontmatter (Sprint 13).
40
+    - `"interactive"` — `y/N` prompt.
41
+    - `"frontmatter"` — persisted in `.dlm` frontmatter.
4342
 
4443
     The `license_url` is captured at acceptance time so a later
4544
     upstream URL change is auditable (the recorded URL stays the
46
-    user's contract; warn-not-fail on drift per Sprint 12b risks).
45
+    user's contract; drift is visible without rewriting history).
4746
     """
4847
 
4948
     model_config = ConfigDict(extra="forbid", frozen=True)
src/dlm/base_models/probes.py modified
@@ -11,17 +11,17 @@ Five probes:
1111
    `spec.architecture`. Catches model-surgery mismatches and wrong
1212
    revisions.
1313
 2. `probe_chat_template` — tokenizer has a non-empty `chat_template`
14
-   attribute. Essential for Sprint 12's Modelfile emission.
14
+   attribute. Essential for Modelfile emission.
1515
 3. `probe_gguf_arch_supported` — scans the vendored
1616
    `convert_hf_to_gguf.py` for a `@Model.register("<arch>")` matching
17
-   `spec.gguf_arch`. Sprint 11 owns the vendored submodule; until then
18
-   the probe skips with a clear message.
17
+   `spec.gguf_arch`. If the vendored submodule is absent, the probe
18
+   skips with a clear message.
1919
 4. `probe_pretokenizer_label` — reads `vendor/llama_cpp_pretokenizer_hashes.json`
2020
    (populated by `scripts/bump-llama-cpp.sh`) and checks the spec's
2121
    `tokenizer_pre` is a known **label**. Silent drift here causes
2222
    silent GGUF export failures per findings §9; the probe catches it
2323
    early. This is the offline fast-check.
24
-5. `probe_pretokenizer_hash` — real fingerprint check (audit-04 B8 /
24
+5. `probe_pretokenizer_hash` — real fingerprint check (see
2525
    CLAUDE.md pitfall #5). Tokenizes `_LLAMA_CPP_CHKTXT` and compares
2626
    the sha256 of the stringified token sequence against a vendored
2727
    per-label fingerprint table. Detects silent upstream tokenization
@@ -45,7 +45,7 @@ from dlm.base_models.schema import BaseModelSpec
4545
 
4646
 _LOG = logging.getLogger(__name__)
4747
 
48
-# Vendored artifact locations (Sprint 11 populates `vendor/llama.cpp`).
48
+# Vendored artifact locations.
4949
 _REPO_ROOT: Final[Path] = Path(__file__).resolve().parents[3]
5050
 VENDOR_LLAMA_CPP_DEFAULT: Final[Path] = _REPO_ROOT / "vendor" / "llama.cpp"
5151
 VENDOR_PRETOKENIZER_HASHES_DEFAULT: Final[Path] = (
@@ -60,7 +60,7 @@ VENDOR_PRETOKENIZER_FINGERPRINTS_DEFAULT: Final[Path] = (
6060
 # stringify the resulting token-id list, sha256 it — that digest is
6161
 # the fingerprint llama.cpp maps to one of its pre-tokenizer types.
6262
 # Keep verbatim; any edit here desynchronizes us from llama.cpp's
63
-# identification logic (audit-04 B8 + CLAUDE.md pitfall #5).
63
+# identification logic (see CLAUDE.md pitfall #5).
6464
 _LLAMA_CPP_CHKTXT: Final[str] = (
6565
     "\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n"
6666
     "🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ "
@@ -167,14 +167,14 @@ def probe_gguf_arch_supported(
167167
     """Scan vendored ``convert_hf_to_gguf.py`` for
168168
     ``@Model.register("<gguf_arch>")`` or ``@ModelBase.register(...)``.
169169
 
170
-    Until Sprint 11 lands the submodule, this probe skips.
170
+    If the vendored converter submodule is absent, this probe skips.
171171
     """
172172
     script = (vendor_path or VENDOR_LLAMA_CPP_DEFAULT) / "convert_hf_to_gguf.py"
173173
     if not script.exists():
174174
         return ProbeResult(
175175
             name="gguf_arch",
176176
             passed=True,
177
-            detail=f"skipped: {script} not present (Sprint 11 vendors llama.cpp)",
177
+            detail=f"skipped: {script} not present (vendor/llama.cpp missing)",
178178
             skipped=True,
179179
         )
180180
 
@@ -231,10 +231,9 @@ def probe_pretokenizer_label(
231231
     The vendored table is a JSON array of label strings that llama.cpp
232232
     recognizes in `get_vocab_base_pre()`. Missing table → skip.
233233
 
234
-    NOTE (audit-04 M7): this is a *label* probe, not a hash probe.
235
-    Sprint 11 will add real `probe_pretokenizer_hash` that canonically
236
-    digests `tokenizer.json` and compares against llama.cpp's fingerprint
237
-    table. For now we check coarse compatibility via the label.
234
+    NOTE: this is a *label* probe, not a hash probe.
235
+    `probe_pretokenizer_hash` is the canonical fingerprint check; this
236
+    probe only checks coarse compatibility via the label.
238237
     """
239238
     path = hashes_path or VENDOR_PRETOKENIZER_HASHES_DEFAULT
240239
     if not path.exists():
@@ -283,11 +282,11 @@ def probe_pretokenizer_hash(
283282
 ) -> ProbeResult:
284283
     """Compute the real llama.cpp pre-tokenizer fingerprint and compare.
285284
 
286
-    Audit-04 B8 / CLAUDE.md pitfall #5. The label probe (above) only
287
-    checks membership in a string table; llama.cpp itself identifies
288
-    the pre-tokenizer by sha256-hashing the token-id sequence produced
289
-    by tokenizing a stable test string (`_LLAMA_CPP_CHKTXT`). We do
290
-    the same here — if the upstream tokenizer changes behavior (new
285
+    See CLAUDE.md pitfall #5. The label probe (above) only checks
286
+    membership in a string table; llama.cpp itself identifies the
287
+    pre-tokenizer by sha256-hashing the token-id sequence produced by
288
+    tokenizing a stable test string (`_LLAMA_CPP_CHKTXT`). We do the
289
+    same here — if the upstream tokenizer changes behavior (new
291290
     revision, silently different merges), the fingerprint drifts and
292291
     this probe fails loudly *before* a broken GGUF reaches Ollama.
293292
 
@@ -560,7 +559,7 @@ def run_all(spec: BaseModelSpec, *, skip_export_probes: bool = False) -> ProbeRe
560559
     vendored llama.cpp can absorb (e.g. brand-new Qwen3 on a llama.cpp
561560
     pin from last month). They forfeit `dlm export` to Ollama until
562561
     the vendored copy catches up. VL bases auto-opt-out of export
563
-    probes — GGUF conversion for VL archs is tracked in Sprint 35.4.
562
+    probes because current GGUF export does not support them.
564563
     """
565564
     from dlm.modality import modality_for
566565
 
@@ -573,10 +572,9 @@ def run_all(spec: BaseModelSpec, *, skip_export_probes: bool = False) -> ProbeRe
573572
     else:
574573
         core = (*core, probe_chat_template(spec))
575574
 
576
-    # Media bases (VL + audio) bypass the llama.cpp-converter probes:
577
-    # converter support for VL archs is Sprint 35.4's scope, and audio
578
-    # archs are not on any llama.cpp roadmap yet. The export path
579
-    # refuses GGUF cleanly for both and emits an HF snapshot instead.
575
+    # Media bases (VL + audio) bypass the llama.cpp-converter probes.
576
+    # The export path refuses GGUF cleanly for both and emits an HF
577
+    # snapshot instead.
580578
     is_media = dispatch.requires_processor
581579
     if skip_export_probes or is_media:
582580
         return ProbeReport(hf_id=spec.hf_id, results=core)
src/dlm/base_models/registry.py modified
@@ -9,19 +9,19 @@ Notes on individual entries:
99
   with <100M MAU). We record it as `license_spdx="Other"` and surface
1010
   the URL via `license_url`; it remains `redistributable=True` because
1111
   the license permits bundling + redistribution with attribution.
12
-  **Caveat (audit-04 m11):** the boolean `redistributable` field does
13
-  not express the MAU threshold or attribution requirement. A
12
+  **Caveat:** the boolean `redistributable` field does not express the
13
+  MAU threshold or attribution requirement. A
1414
   `redistributable_conditions: str | None` field on `BaseModelSpec`
1515
   plus a pack-time attestation checkbox would encode this properly —
16
-  deferred to Sprint 12b's license-UX extension. Until then, users
17
-  at the scale threshold must consult the license text themselves.
16
+  deferred follow-up work. Until then, users at the scale threshold
17
+  must consult the license text themselves.
1818
 - Llama-3.2 models are gated on HuggingFace (`requires_acceptance=True`)
1919
   and their license does NOT permit bundling into a `.dlm.pack`
20
-  (`redistributable=False`) — enforced by Sprint 14's pack gate and
21
-  Sprint 28's share-protocol refusal.
20
+  (`redistributable=False`) — enforced by the pack gate and
21
+  share-protocol refusal.
2222
 - SmolLM2 and Phi-3.5-mini are permissive (Apache-2.0 / MIT).
2323
 - `size_gb_fp16` is approximate; the hardware doctor uses it to seed
24
-  VRAM estimates, which then get refined by sprint 09's runtime guard.
24
+  VRAM estimates, which then get refined by runtime checks.
2525
 """
2626
 
2727
 from __future__ import annotations
@@ -211,20 +211,19 @@ _ENTRIES: tuple[BaseModelSpec, ...] = (
211211
         context_length=131_072,
212212
         recommended_seq_len=2048,
213213
     ),
214
-    # --- Vision-language bases (Sprint 35 v1) -------------------------------
214
+    # --- Vision-language bases ----------------------------------------------
215215
     # PaliGemma-3B-mix-224 — Google's instruction-tuned VL base built on
216216
     # Gemma-2B + SigLIP-So400m. Gated under the Gemma license; cannot
217217
     # redistribute inside a `.dlm.pack` (same pattern as Llama-3.2).
218218
     # Training targets Gemma's transformer blocks; the vision tower is
219219
     # trained jointly when modules_to_save expands to ["embed_tokens",
220
-    # "lm_head"], but Sprint 35 v1 keeps modules_to_save empty so only
221
-    # the LLM-side LoRA adapters move — the vision tower is frozen.
220
+    # "lm_head"], but the current entry keeps modules_to_save empty so
221
+    # only the LLM-side LoRA adapters move — the vision tower is frozen.
222222
     #
223223
     # `gguf_arch` / `tokenizer_pre` are set to tags the current vendored
224224
     # llama.cpp doesn't recognize; the export probes surface
225
-    # UNSUPPORTED + refuse GGUF conversion until Sprint 35.4 lands the
226
-    # arch-support gate. HF-snapshot export (`dlm export --hf-snapshot`)
227
-    # still works.
225
+    # UNSUPPORTED + refuse GGUF conversion until GGUF support lands.
226
+    # HF-snapshot export (`dlm export --hf-snapshot`) still works.
228227
     BaseModelSpec(
229228
         key="paligemma-3b-mix-224",
230229
         hf_id="google/paligemma-3b-mix-224",
@@ -233,7 +232,7 @@ _ENTRIES: tuple[BaseModelSpec, ...] = (
233232
         # it as drift; a maintainer pastes in the observed SHA from
234233
         # the script's output. Offline probe tests skip cleanly
235234
         # until then (see tests/unit/base_models/test_vl_registry.py).
236
-        # Landed as part of Sprint 35 v1; to verify, run:
235
+        # To verify, run:
237236
         #     uv run python scripts/refresh-registry.py --check
238237
         revision="8d2f7bc9c15d71a00c14f9eb7e4c7b99c79e0a11",
239238
         architecture="PaliGemmaForConditionalGeneration",
@@ -258,11 +257,10 @@ _ENTRIES: tuple[BaseModelSpec, ...] = (
258257
         ),
259258
     ),
260259
     # Qwen2-VL-2B-Instruct — Alibaba's Apache-2.0 VL base with dynamic-
261
-    # resolution support in native HF. Sprint 35.3 pins a conservative
262
-    # fixed 672×672 preprocessing plan (implementation-note (a) in the
263
-    # sprint spec) to avoid growing the VlPreprocessorPlan abstraction
264
-    # for dynamic ranges in v1 — later sprints can extend the plan with
265
-    # {min_pixels, max_pixels} when a user reaches that limit.
260
+    # resolution support in native HF. The current entry pins a
261
+    # conservative fixed 672×672 preprocessing plan to avoid growing
262
+    # the VlPreprocessorPlan abstraction for dynamic ranges yet; a
263
+    # future extension can add {min_pixels, max_pixels} when needed.
266264
     #
267265
     # 672×672 with Qwen2-VL's 28-pixel patch-merger grid yields 24×24 =
268266
     # 576 vision tokens per image. `<|image_pad|>` is the runtime
@@ -339,7 +337,7 @@ _ENTRIES: tuple[BaseModelSpec, ...] = (
339337
             num_image_tokens=256,
340338
         ),
341339
     ),
342
-    # --- Audio-language bases (Sprint 35.2) ---------------------------------
340
+    # --- Audio-language bases -----------------------------------------------
343341
     # Qwen2-Audio-7B-Instruct — Alibaba's open audio-text model. Uses
344342
     # the Qwen2 LLM backbone + a dedicated audio encoder. Apache-2.0
345343
     # but the 7B checkpoint is gated on HF via license acceptance, so
@@ -348,10 +346,10 @@ _ENTRIES: tuple[BaseModelSpec, ...] = (
348346
     # Apache-2.0, but not-bundled-by-default because the pack size
349347
     # (~14 GB fp16) dominates the tarball.
350348
     #
351
-    # The 16 kHz pin + 30 s max-length match the training-time defaults
352
-    # documented in the Qwen2-Audio card. Resampling support lands as a
353
-    # 35.2 follow-up; v1 refuses mismatched sample rates with an
354
-    # actionable error at preprocess time.
349
+    # The 16 kHz pin + 30 s max-length match the training-time
350
+    # defaults documented in the Qwen2-Audio card. Resampling support
351
+    # lands as follow-up work; current releases refuse mismatched
352
+    # sample rates with an actionable error at preprocess time.
355353
     #
356354
     # Placeholder SHA flagged the same way as paligemma — the weekly
357355
     # `scripts/refresh-registry.py --check` run surfaces drift and a
src/dlm/base_models/resolver.py modified
@@ -10,9 +10,8 @@ Spec grammar:
1010
 
1111
 Gated models (`requires_acceptance=True`) raise `GatedModelError` unless
1212
 the caller has already accepted the license (signalled via
13
-`accept_license=True`). Sprint 12b ships the `dlm init --i-accept-license`
14
-flow that flips this on persistently; Sprint 06 tests pass
15
-`accept_license=True` directly to exercise the downstream path.
13
+`accept_license=True`). The CLI uses this to persist acceptance; tests
14
+pass `accept_license=True` directly to exercise the downstream path.
1615
 """
1716
 
1817
 from __future__ import annotations
src/dlm/base_models/schema.py modified
@@ -9,14 +9,13 @@ point:
99
   same spec pin at exactly the same weights.
1010
 - `target_modules`: per-architecture LoRA target list (see findings §8;
1111
   `"all-linear"` is avoided because it bloats small models).
12
-- `template`: the chat-template dialect used by Sprint 12's Go-template
12
+- `template`: the chat-template dialect used by the Go-template
1313
   registry for Modelfile generation.
1414
 - `gguf_arch` / `tokenizer_pre`: identifiers the llama.cpp converter
15
-  matches against; Sprint 11's export preflight uses them.
16
-- License / gating (audit-02 F04 + F21): separate fields for SPDX,
17
-  acceptance gating, and re-distribution — each consumed by a different
18
-  gate (Sprint 12b license UX; Sprint 14 pack `--include-base`;
19
-  Sprint 28 share-protocol push refusal).
15
+  matches against; export preflight uses them.
16
+- License / gating: separate fields for SPDX, acceptance gating, and
17
+  re-distribution — each consumed by a different policy gate (license
18
+  acceptance, pack `--include-base`, share-protocol refusal).
2019
 """
2120
 
2221
 from __future__ import annotations
@@ -38,10 +37,10 @@ class VlPreprocessorPlan(BaseModel):
3837
     preflight checks + cache keying.
3938
 
4039
     `target_size` is `(height, width)` in pixels. `resize_policy`
41
-    defaults to `"fixed"` because that's what Sprint 35 v1 ships —
42
-    Qwen2-VL's dynamic resolution lands in 35.3. `image_token` is the
43
-    textual placeholder inserted into prompts before the processor
44
-    expands it into `num_image_tokens` copies.
40
+    defaults to `"fixed"` because that's what the current launch
41
+    registry ships. `image_token` is the textual placeholder inserted
42
+    into prompts before the processor expands it into
43
+    `num_image_tokens` copies.
4544
     """
4645
 
4746
     model_config = ConfigDict(extra="forbid", frozen=True)
@@ -61,10 +60,10 @@ class VlPreprocessorPlan(BaseModel):
6160
 
6261
 
6362
 class AudioPreprocessorPlan(BaseModel):
64
-    """Per-base audio-preprocessing parameters (Sprint 35.2).
63
+    """Per-base audio-preprocessing parameters.
6564
 
6665
     Mirrors `VlPreprocessorPlan` — pinned at registry-build time so
67
-    the audio cache key stays stable. Sprint 35.2 v1 refuses audio at
66
+    the audio cache key stays stable. Current releases refuse audio at
6867
     non-target `sample_rate`; resampling lands as a follow-up.
6968
 
7069
     `sample_rate` is the model's training rate in Hz (Qwen2-Audio:
@@ -101,7 +100,7 @@ class BaseModelSpec(BaseModel):
101100
     gguf_arch: str = Field(..., min_length=1, description="Name llama.cpp's converter uses.")
102101
     tokenizer_pre: str = Field(..., min_length=1, description="Pre-tokenizer label.")
103102
 
104
-    # License + acceptance (audit-02 F04 / F21).
103
+    # License + acceptance.
105104
     license_spdx: str = Field(..., min_length=1)
106105
     license_url: str | None = None
107106
     requires_acceptance: bool = False
src/dlm/base_models/templates/chatml.jinja modified
@@ -1,8 +1,8 @@
11
 {#
22
 ChatML reference template — used by Qwen 2.5, SmolLM2, and compatible
3
-models. Source-of-truth for Sprint 12's Go `text/template` round-trip
4
-tests: rendering these messages through this Jinja and Sprint 12's Go
5
-template must produce token-identical sequences after tokenization.
3
+models. Source-of-truth for Go `text/template` round-trip tests:
4
+rendering these messages through this Jinja and the Go template must
5
+produce token-identical sequences after tokenization.
66
 
77
 Render `{"messages": [...]}` where each message has `role` in
88
 {"system", "user", "assistant"} and a `content` string. Call with
src/dlm/base_models/templates/llama3.jinja modified
@@ -1,7 +1,7 @@
11
 {#
22
 Llama 3 / 3.1 / 3.2 reference template. Uses header-id framing and the
33
 `<|eot_id|>` end-of-turn marker (distinct from the EOS). Source-of-truth
4
-for Sprint 12's Go template round-trip tests.
4
+for Go template round-trip tests.
55
 
66
 Required tokens in the tokenizer:
77
   <|begin_of_text|>  <|start_header_id|>  <|end_header_id|>  <|eot_id|>
src/dlm/base_models/templates/mistral.jinja modified
@@ -4,7 +4,7 @@ wraps the conversation in `<s>...</s>` BOS/EOS. System messages are
44
 prepended to the first user message by convention.
55
 
66
 Vendored for future Mistral-family entries in the registry; not used by
7
-the 10 launch bases but Sprint 12's Go registry mirrors this shape.
7
+the 10 launch bases but the Go registry mirrors this shape.
88
 #}
99
 {%- if messages[0]['role'] == 'system' -%}
1010
     {%- set system_prompt = messages[0]['content'] -%}
src/dlm/base_models/templates/phi3.jinja modified
@@ -1,7 +1,7 @@
11
 {#
22
 Phi-3 / Phi-3.5 reference template. Uses `<|role|>` opener and `<|end|>`
33
 closer; finishes with `<|endoftext|>` on assistant turns (handled by
4
-`add_generation_prompt=False`). Source-of-truth for Sprint 12 round-trip.
4
+`add_generation_prompt=False`). Source-of-truth for round-trip tests.
55
 
66
 Roles accepted: "system", "user", "assistant".
77
 #}