tenseleyflow/documentlanguagemodel / e405098

Browse files

Pin Llama 3.3 provenance checks

Authored by espadonne
SHA
e4050982729613f5b8def6a511f3c3a4767b8b95
Parents
29a900f
Tree
b9e55a5

10 changed files

Status | File | + | -
M docs/cookbook/choosing-a-base.md 1 1
M scripts/refresh-registry.py 10 72
M src/dlm/base_models/errors.py 2 2
M src/dlm/base_models/registry.py 19 10
A src/dlm/base_models/registry_refresh.py 119 0
M src/dlm/base_models/schema.py 22 0
M tests/unit/base_models/test_errors.py 1 0
M tests/unit/base_models/test_registry_2026.py 10 1
A tests/unit/base_models/test_registry_refresh.py 82 0
M tests/unit/base_models/test_schema.py 22 0
docs/cookbook/choosing-a-base.md (modified)
@@ -23,7 +23,7 @@ The fastest way to pick a DLM base is to decide three things first:
2323
 
2424
 ## Notes on the sharp edges
2525
 
26
-- `llama-3.3-8b-instruct` is still treated like the Llama family in DLM’s policy surface: acceptance required, not redistributable, and intended for users who already know they want the Llama line.
26
+- `llama-3.3-8b-instruct` is still treated like the Llama family in DLM’s policy surface: acceptance required, not redistributable, and intended for users who already know they want the Llama line. Today it resolves through a community HF mirror while DLM pins provenance against Meta’s official LlamaCon/newsroom announcement, because Meta has not published a first-party HF repo for this row.
2727
 - `internvl2-2b` and `internvl3-2b` are registry-visible planning targets, but the current generic VL runtime still refuses the InternVL family until DLM owns its custom processor/collator contract.
2828
 - `mistral-small-3.1-24b-instruct` is intentionally refused on MPS by default. It is a real shipped row, just not a casual laptop target.
2929
 
scripts/refresh-registry.pymodified
@@ -1,18 +1,19 @@
11
 #!/usr/bin/env python
2
-"""Re-resolve every curated base-model entry against HuggingFace.
2
+"""Re-resolve every curated base-model entry against its live sources.
33
 
44
 Two modes:
55
 
66
 - Default: print a human-readable diff for each entry whose pinned SHA
7
-  no longer matches HF's `main` (or whose license/gating changed).
7
+  no longer matches its live fetch source (or whose license/gating /
8
+  provenance changed).
89
   Exit 0.
910
 - `--check`: exit 1 if *any* entry has drifted. Used by the weekly
1011
   CI job to open an issue when maintainer action is needed.
1112
 
1213
 Does **not** write back to `registry.py` automatically — drifted SHAs
1314
 are a signal for a human to review the upstream change (new license
14
-terms, tokenizer surgery, etc.). The script prints the ready-to-paste
15
-field values so the manual update is trivial.
15
+terms, tokenizer surgery, provenance changes, etc.). The script prints
16
+the ready-to-paste field values so the manual update is trivial.
1617
 
1718
 Usage:
1819
     uv run python scripts/refresh-registry.py            # print diff
@@ -23,67 +24,9 @@ from __future__ import annotations
2324
 
2425
 import argparse
2526
 import sys
26
-from dataclasses import dataclass
2727
 
28
-from huggingface_hub import HfApi
29
-from huggingface_hub.errors import GatedRepoError, RepositoryNotFoundError
30
-
31
-from dlm.base_models import BASE_MODELS, BaseModelSpec
32
-
33
-
34
-@dataclass(frozen=True)
35
-class Drift:
36
-    """Structured diff between a local registry entry and HF's head."""
37
-
38
-    key: str
39
-    hf_id: str
40
-    fields: tuple[tuple[str, str, str], ...]  # (name, pinned, observed)
41
-
42
-    def render(self) -> str:
43
-        lines = [f"  {self.key} ({self.hf_id})"]
44
-        for name, pinned, observed in self.fields:
45
-            lines.append(f"    {name:<22} {pinned!r} → {observed!r}")
46
-        return "\n".join(lines)
47
-
48
-
49
-def _check_entry(api: HfApi, entry: BaseModelSpec) -> Drift | None:
50
-    try:
51
-        info = api.model_info(entry.hf_id)
52
-    except GatedRepoError:
53
-        # Gated models still expose public metadata via `model_info`;
54
-        # if we can't read them, that's a new gating event worth flagging.
55
-        return Drift(
56
-            key=entry.key,
57
-            hf_id=entry.hf_id,
58
-            fields=(("gating", "readable", "now fully gated"),),
59
-        )
60
-    except RepositoryNotFoundError:
61
-        return Drift(
62
-            key=entry.key,
63
-            hf_id=entry.hf_id,
64
-            fields=(("repository", "present", "missing (renamed or deleted)"),),
65
-        )
66
-
67
-    drifted: list[tuple[str, str, str]] = []
68
-
69
-    current_sha = info.sha
70
-    if current_sha and current_sha != entry.revision:
71
-        drifted.append(("revision", entry.revision, current_sha))
72
-
73
-    gated = getattr(info, "gated", False)
74
-    # HF reports `gated` as False / "auto" / "manual". Non-False values
75
-    # mean acceptance is required.
76
-    gated_observed = bool(gated and gated != "False")
77
-    if gated_observed != entry.requires_acceptance:
78
-        drifted.append(
79
-            (
80
-                "requires_acceptance",
81
-                str(entry.requires_acceptance),
82
-                str(gated_observed),
83
-            ),
84
-        )
85
-
86
-    return Drift(key=entry.key, hf_id=entry.hf_id, fields=tuple(drifted)) if drifted else None
28
+from dlm.base_models import BASE_MODELS
29
+from dlm.base_models.registry_refresh import check_registry
8730
 
8831
 
8932
 def main() -> int:
@@ -95,15 +38,10 @@ def main() -> int:
9538
     )
9639
     args = parser.parse_args()
9740
 
98
-    api = HfApi()
99
-    drifts: list[Drift] = []
100
-    for entry in BASE_MODELS.values():
101
-        drift = _check_entry(api, entry)
102
-        if drift is not None:
103
-            drifts.append(drift)
41
+    drifts = check_registry()
10442
 
10543
     if not drifts:
106
-        print(f"All {len(BASE_MODELS)} registry entries match HF.")
44
+        print(f"All {len(BASE_MODELS)} registry entries match their live sources.")
10745
         return 0
10846
 
10947
     print(f"{len(drifts)} of {len(BASE_MODELS)} entries have drifted:")
@@ -111,7 +49,7 @@ def main() -> int:
11149
         print(drift.render())
11250
     print()
11351
     print(
114
-        "Review each upstream change (commit log / license / gating) and "
52
+        "Review each upstream change (commit log / license / gating / provenance) and "
11553
         "update `src/dlm/base_models/registry.py` by hand."
11654
     )
11755
 
src/dlm/base_models/errors.py (modified)
@@ -63,7 +63,7 @@ class ProbeFailedError(BaseModelError):
6363
 
6464
 
6565
 class GatedModelError(BaseModelError):
66
-    """Model requires HuggingFace license acceptance and the user hasn't accepted.
66
+    """Model requires license acceptance and the user hasn't accepted.
6767
 
6868
     Lives here because registry probes catch it first; the acceptance
6969
     record is written elsewhere, but the error shape is owned here.
@@ -74,7 +74,7 @@ class GatedModelError(BaseModelError):
7474
         self.license_url = license_url
7575
         where = f" License: {license_url}" if license_url else ""
7676
         super().__init__(
77
-            f"{hf_id} is a gated HuggingFace model. Accept the license and "
77
+            f"{hf_id} requires license acceptance. Accept the license and "
7878
             f"pass --i-accept-license (or via `dlm init`).{where}"
7979
         )
8080
 
src/dlm/base_models/registry.py (modified)
@@ -15,11 +15,12 @@ Notes on individual entries:
1515
   plus a pack-time attestation checkbox would encode this properly —
1616
   deferred follow-up work. Until then, users at the scale threshold
1717
   must consult the license text themselves.
18
-- Llama-3.2 / 3.3 models are gated on HuggingFace
19
-  (`requires_acceptance=True`) and their license does NOT permit
20
-  bundling into a `.dlm.pack`
21
-  (`redistributable=False`) — enforced by the pack gate and
22
-  share-protocol refusal.
18
+- Llama-3.2 models are gated on HuggingFace. Llama-3.3 8B currently
19
+  needs a mirror-backed fetch path because Meta exposes it through the
20
+  Llama API but not a first-party HF repo. DLM still keeps the same
21
+  acceptance + non-redistribution policy surface for the whole Llama
22
+  family (`requires_acceptance=True`, `redistributable=False`) —
23
+  enforced by the pack gate and share-protocol refusal.
2324
 - SmolLM2 / SmolLM3 and Phi-3.5-mini are permissive (Apache-2.0 / MIT).
2425
 - `size_gb_fp16` is approximate; the hardware doctor uses it to seed
2526
   VRAM estimates, which then get refined by runtime checks.
@@ -218,11 +219,14 @@ _ENTRIES: tuple[BaseModelSpec, ...] = (
218219
     ),
219220
     BaseModelSpec(
220221
         key="llama-3.3-8b-instruct",
221
-        hf_id="meta-llama/Llama-3.3-8B-Instruct",
222
-        # Placeholder SHA: format-valid, not a real HF commit. The
223
-        # weekly `scripts/refresh-registry.py --check` run surfaces
224
-        # drift and prints the live value for manual review.
225
-        revision="4d5e6f7890abcdeffedcba0987654321abc2d3e4",
222
+        # Meta's first-party LlamaCon announcement explicitly says the
223
+        # Llama API can fine-tune "o novo modelo Llama 3.3 8B", but
224
+        # there is still no first-party HF repo. DLM therefore fetches
225
+        # weights from the community mirror below while
226
+        # refresh-registry separately probes Meta's newsroom article
227
+        # for provenance.
228
+        hf_id="allura-forge/Llama-3.3-8B-Instruct",
229
+        revision="df95224cf87c32d9f4958dd284a07ded620aa4fc",
226230
         architecture="LlamaForCausalLM",
227231
         params=8_000_000_000,
228232
         target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
@@ -237,6 +241,11 @@ _ENTRIES: tuple[BaseModelSpec, ...] = (
237241
         context_length=131_072,
238242
         context_length_effective=8_192,
239243
         recommended_seq_len=4096,
244
+        refresh_check_hf_gating=False,
245
+        provenance_url=(
246
+            "https://about.fb.com/br/news/2025/04/tudo-o-que-anunciamos-no-nosso-primeiro-llamacon/"
247
+        ),
248
+        provenance_match_text="novo modelo Llama 3.3 8B",
240249
     ),
241250
     BaseModelSpec(
242251
         key="smollm3-3b",
src/dlm/base_models/registry_refresh.py (added)
@@ -0,0 +1,119 @@
1
+"""Live drift checks for curated base-model registry entries."""
2
+
3
+from __future__ import annotations
4
+
5
+from collections.abc import Callable
6
+from dataclasses import dataclass
7
+from urllib.error import HTTPError, URLError
8
+from urllib.request import Request, urlopen
9
+
10
+from huggingface_hub import HfApi
11
+from huggingface_hub.errors import GatedRepoError, RepositoryNotFoundError
12
+
13
+from dlm.base_models import BASE_MODELS, BaseModelSpec
14
+
15
+_USER_AGENT = "DocumentLanguageModel/registry-refresh"
16
+FetchText = Callable[[str], str]
17
+
18
+
19
+@dataclass(frozen=True)
20
+class Drift:
21
+    """Structured diff between a local registry entry and its live sources."""
22
+
23
+    key: str
24
+    hf_id: str
25
+    fields: tuple[tuple[str, str, str], ...]
26
+
27
+    def render(self) -> str:
28
+        lines = [f"  {self.key} ({self.hf_id})"]
29
+        for name, pinned, observed in self.fields:
30
+            lines.append(f"    {name:<22} {pinned!r} → {observed!r}")
31
+        return "\n".join(lines)
32
+
33
+
34
+def fetch_text(url: str) -> str:
35
+    """Fetch `url` as text for provenance checks."""
36
+
37
+    req = Request(url, headers={"User-Agent": _USER_AGENT})
38
+    with urlopen(req, timeout=15) as resp:
39
+        body = bytes(resp.read())
40
+        charset = str(resp.headers.get_content_charset() or "utf-8")
41
+    return body.decode(charset, errors="replace")
42
+
43
+
44
+def check_entry(
45
+    api: HfApi,
46
+    entry: BaseModelSpec,
47
+    *,
48
+    fetch_url_text: FetchText = fetch_text,
49
+) -> Drift | None:
50
+    """Return a structured drift report for one curated entry, if any."""
51
+
52
+    try:
53
+        info = api.model_info(entry.hf_id)
54
+    except GatedRepoError:
55
+        return Drift(
56
+            key=entry.key,
57
+            hf_id=entry.hf_id,
58
+            fields=(("gating", "readable", "now fully gated"),),
59
+        )
60
+    except RepositoryNotFoundError:
61
+        return Drift(
62
+            key=entry.key,
63
+            hf_id=entry.hf_id,
64
+            fields=(("repository", "present", "missing (renamed or deleted)"),),
65
+        )
66
+
67
+    drifted: list[tuple[str, str, str]] = []
68
+
69
+    current_sha = info.sha
70
+    if current_sha and current_sha != entry.revision:
71
+        drifted.append(("revision", entry.revision, current_sha))
72
+
73
+    if entry.refresh_check_hf_gating:
74
+        gated = getattr(info, "gated", False)
75
+        gated_observed = bool(gated and gated != "False")
76
+        if gated_observed != entry.requires_acceptance:
77
+            drifted.append(
78
+                (
79
+                    "requires_acceptance",
80
+                    str(entry.requires_acceptance),
81
+                    str(gated_observed),
82
+                ),
83
+            )
84
+
85
+    if entry.provenance_url and entry.provenance_match_text:
86
+        expected = entry.provenance_match_text
87
+        try:
88
+            page = fetch_url_text(entry.provenance_url)
89
+        except (HTTPError, URLError, TimeoutError, ValueError) as exc:
90
+            drifted.append(
91
+                (
92
+                    "provenance_url",
93
+                    f"{entry.provenance_url} contains {expected!r}",
94
+                    f"unreachable ({type(exc).__name__})",
95
+                )
96
+            )
97
+        else:
98
+            if expected.casefold() not in page.casefold():
99
+                drifted.append(
100
+                    (
101
+                        "provenance_marker",
102
+                        expected,
103
+                        f"missing from {entry.provenance_url}",
104
+                    )
105
+                )
106
+
107
+    return Drift(key=entry.key, hf_id=entry.hf_id, fields=tuple(drifted)) if drifted else None
108
+
109
+
110
+def check_registry(*, fetch_url_text: FetchText = fetch_text) -> list[Drift]:
111
+    """Check every curated entry and return drift reports."""
112
+
113
+    api = HfApi()
114
+    drifts: list[Drift] = []
115
+    for entry in BASE_MODELS.values():
116
+        drift = check_entry(api, entry, fetch_url_text=fetch_url_text)
117
+        if drift is not None:
118
+            drifts.append(drift)
119
+    return drifts
src/dlm/base_models/schema.py (modified)
@@ -16,6 +16,10 @@ point:
1616
 - `reasoning_tuned` / `context_length_effective`: additive registry
1717
   hints for prompt defaults and realistic doctor estimates. The
1818
   effective length defaults to the nominal context window when unset.
19
+- `refresh_check_hf_gating` / `provenance_url` /
20
+  `provenance_match_text`: live-registry refresh hints for entries
21
+  whose fetch mirror and first-party provenance page are not the same
22
+  system.
1923
 - License / gating: separate fields for SPDX, acceptance gating, and
2024
   re-distribution — each consumed by a different policy gate (license
2125
   acceptance, pack `--include-base`, share-protocol refusal).
@@ -141,6 +145,9 @@ class BaseModelSpec(BaseModel):
141145
     context_length_effective: int | None = Field(None, gt=0)
142146
     recommended_seq_len: int = Field(..., gt=0)
143147
     reasoning_tuned: bool = False
148
+    refresh_check_hf_gating: bool = True
149
+    provenance_url: str | None = None
150
+    provenance_match_text: str | None = None
144151
 
145152
     # Modality + multi-modal preprocessing (schema v10 + v11, plus
146153
     # Sprint 40's additive `text-moe` discriminator).
@@ -202,6 +209,21 @@ class BaseModelSpec(BaseModel):
202209
             )
203210
         return self
204211
 
212
+    @model_validator(mode="after")
213
+    def _provenance_probe_is_complete(self) -> BaseModelSpec:
214
+        url_set = self.provenance_url is not None
215
+        text_set = self.provenance_match_text is not None
216
+        if url_set != text_set:
217
+            raise ValueError(
218
+                f"base {self.key!r}: provenance_url and provenance_match_text must be set together"
219
+            )
220
+        if not self.refresh_check_hf_gating and not url_set:
221
+            raise ValueError(
222
+                f"base {self.key!r}: refresh_check_hf_gating=False requires a "
223
+                "first-party provenance_url + provenance_match_text"
224
+            )
225
+        return self
226
+
205227
     @property
206228
     def suggested_prompt_temperature(self) -> float:
207229
         """Default sampling temperature for `dlm prompt`.
tests/unit/base_models/test_errors.py (modified)
@@ -54,6 +54,7 @@ class TestGatedModelError:
5454
         assert "meta-llama/Llama-3.2-1B-Instruct" in msg
5555
         assert "https://example.com/license" in msg
5656
         assert "--i-accept-license" in msg
57
+        assert "requires license acceptance" in msg
5758
 
5859
     def test_no_license_url_still_renders(self) -> None:
5960
         err = GatedModelError("org/gated", None)
tests/unit/base_models/test_registry_2026.py (modified)
@@ -62,8 +62,9 @@ class TestLlama33RegistryEntry:
6262
     def test_entry_present(self) -> None:
6363
         assert "llama-3.3-8b-instruct" in BASE_MODELS
6464
 
65
-    def test_follows_existing_llama_gating_pattern(self) -> None:
65
+    def test_keeps_existing_llama_policy_surface(self) -> None:
6666
         spec = BASE_MODELS["llama-3.3-8b-instruct"]
67
+        assert spec.hf_id == "allura-forge/Llama-3.3-8B-Instruct"
6768
         assert spec.architecture == "LlamaForCausalLM"
6869
         assert spec.template == "llama3"
6970
         assert spec.gguf_arch == "llama"
@@ -71,6 +72,14 @@ class TestLlama33RegistryEntry:
7172
         assert spec.redistributable is False
7273
         assert spec.license_spdx == "Other"
7374
 
75
+    def test_refreshes_against_mirror_plus_official_provenance_page(self) -> None:
76
+        spec = BASE_MODELS["llama-3.3-8b-instruct"]
77
+        assert spec.refresh_check_hf_gating is False
78
+        assert spec.provenance_url == (
79
+            "https://about.fb.com/br/news/2025/04/tudo-o-que-anunciamos-no-nosso-primeiro-llamacon/"
80
+        )
81
+        assert spec.provenance_match_text == "novo modelo Llama 3.3 8B"
82
+
7483
     def test_effective_context_hint_is_lower_than_nominal(self) -> None:
7584
         spec = BASE_MODELS["llama-3.3-8b-instruct"]
7685
         assert spec.context_length == 131_072
tests/unit/base_models/test_registry_refresh.py (added)
@@ -0,0 +1,82 @@
1
+"""Live-drift helper coverage for registry refresh."""
2
+
3
+from __future__ import annotations
4
+
5
+from types import SimpleNamespace
6
+
7
+from dlm.base_models.registry_refresh import Drift, check_entry
8
+from dlm.base_models.schema import BaseModelSpec
9
+
10
+
11
+def _spec(**overrides: object) -> BaseModelSpec:
12
+    defaults: dict[str, object] = {
13
+        "key": "demo-1b",
14
+        "hf_id": "org/demo-1b",
15
+        "revision": "0123456789abcdef0123456789abcdef01234567",
16
+        "architecture": "DemoForCausalLM",
17
+        "params": 1_000_000_000,
18
+        "target_modules": ["q_proj", "v_proj"],
19
+        "template": "chatml",
20
+        "gguf_arch": "demo",
21
+        "tokenizer_pre": "demo",
22
+        "license_spdx": "Apache-2.0",
23
+        "redistributable": True,
24
+        "size_gb_fp16": 2.0,
25
+        "context_length": 4096,
26
+        "recommended_seq_len": 2048,
27
+    }
28
+    defaults.update(overrides)
29
+    return BaseModelSpec.model_validate(defaults)
30
+
31
+
32
+class _Api:
33
+    def __init__(self, *, sha: str, gated: object = False) -> None:
34
+        self._info = SimpleNamespace(sha=sha, gated=gated)
35
+
36
+    def model_info(self, _hf_id: str) -> SimpleNamespace:
37
+        return self._info
38
+
39
+
40
+class TestCheckEntry:
41
+    def test_no_drift_when_revision_and_gating_match(self) -> None:
42
+        spec = _spec()
43
+        drift = check_entry(_Api(sha=spec.revision), spec)
44
+        assert drift is None
45
+
46
+    def test_revision_drift_is_reported(self) -> None:
47
+        spec = _spec()
48
+        drift = check_entry(_Api(sha="a" * 40), spec)
49
+        assert isinstance(drift, Drift)
50
+        assert ("revision", spec.revision, "a" * 40) in drift.fields
51
+
52
+    def test_gating_drift_is_skipped_when_entry_opts_out(self) -> None:
53
+        spec = _spec(
54
+            requires_acceptance=True,
55
+            refresh_check_hf_gating=False,
56
+            provenance_url="https://example.com/provenance",
57
+            provenance_match_text="official marker",
58
+        )
59
+        drift = check_entry(
60
+            _Api(sha=spec.revision, gated=False),
61
+            spec,
62
+            fetch_url_text=lambda _url: "official marker",
63
+        )
64
+        assert drift is None
65
+
66
+    def test_provenance_marker_missing_is_reported(self) -> None:
67
+        spec = _spec(
68
+            refresh_check_hf_gating=False,
69
+            provenance_url="https://example.com/provenance",
70
+            provenance_match_text="official marker",
71
+        )
72
+        drift = check_entry(
73
+            _Api(sha=spec.revision),
74
+            spec,
75
+            fetch_url_text=lambda _url: "different text",
76
+        )
77
+        assert isinstance(drift, Drift)
78
+        assert (
79
+            "provenance_marker",
80
+            "official marker",
81
+            "missing from https://example.com/provenance",
82
+        ) in drift.fields
tests/unit/base_models/test_schema.py (modified)
@@ -167,6 +167,28 @@ class TestSprint40Substrate:
167167
         spec = _minimal(context_length=8192, context_length_effective=4096)
168168
         assert spec.effective_context_length == 4096
169169
 
170
+    def test_refresh_hf_gating_check_defaults_true(self) -> None:
171
+        assert _minimal().refresh_check_hf_gating is True
172
+
173
+    def test_provenance_probe_requires_url_and_marker_together(self) -> None:
174
+        with pytest.raises(ValidationError, match="must be set together"):
175
+            _minimal(provenance_url="https://example.com")
176
+        with pytest.raises(ValidationError, match="must be set together"):
177
+            _minimal(provenance_match_text="marker")
178
+
179
+    def test_disabling_hf_gating_check_requires_provenance_probe(self) -> None:
180
+        with pytest.raises(ValidationError, match="requires a first-party provenance_url"):
181
+            _minimal(refresh_check_hf_gating=False)
182
+
183
+    def test_disabling_hf_gating_check_with_provenance_is_valid(self) -> None:
184
+        spec = _minimal(
185
+            refresh_check_hf_gating=False,
186
+            provenance_url="https://example.com/provenance",
187
+            provenance_match_text="official marker",
188
+        )
189
+        assert spec.refresh_check_hf_gating is False
190
+        assert spec.provenance_match_text == "official marker"
191
+
170192
 
171193
 class TestImmutability:
172194
     def test_spec_is_frozen(self) -> None: