tenseleyflow/documentlanguagemodel / e405098

Browse files

Pin Llama 3.3 provenance checks

Authored by espadonne
SHA
e4050982729613f5b8def6a511f3c3a4767b8b95
Parents
29a900f
Tree
b9e55a5

10 changed files

Status | File | + | -
M docs/cookbook/choosing-a-base.md 1 1
M scripts/refresh-registry.py 10 72
M src/dlm/base_models/errors.py 2 2
M src/dlm/base_models/registry.py 19 10
A src/dlm/base_models/registry_refresh.py 119 0
M src/dlm/base_models/schema.py 22 0
M tests/unit/base_models/test_errors.py 1 0
M tests/unit/base_models/test_registry_2026.py 10 1
A tests/unit/base_models/test_registry_refresh.py 82 0
M tests/unit/base_models/test_schema.py 22 0
docs/cookbook/choosing-a-base.md (modified)
@@ -23,7 +23,7 @@ The fastest way to pick a DLM base is to decide three things first:
2323
 
2424
 ## Notes on the sharp edges
2525
 
26
-- `llama-3.3-8b-instruct` is still treated like the Llama family in DLM’s policy surface: acceptance required, not redistributable, and intended for users who already know they want the Llama line.
26
+- `llama-3.3-8b-instruct` is still treated like the Llama family in DLM’s policy surface: acceptance required, not redistributable, and intended for users who already know they want the Llama line. Today it resolves through a community HF mirror while DLM pins provenance against Meta’s official LlamaCon/newsroom announcement, because Meta has not published a first-party HF repo for this row.
2727
 - `internvl2-2b` and `internvl3-2b` are registry-visible planning targets, but the current generic VL runtime still refuses the InternVL family until DLM owns its custom processor/collator contract.
2828
 - `mistral-small-3.1-24b-instruct` is intentionally refused on MPS by default. It is a real shipped row, just not a casual laptop target.
2929
 
scripts/refresh-registry.pymodified
@@ -1,18 +1,19 @@
11
 #!/usr/bin/env python
2
-"""Re-resolve every curated base-model entry against HuggingFace.
2
+"""Re-resolve every curated base-model entry against its live sources.
33
 
44
 Two modes:
55
 
66
 - Default: print a human-readable diff for each entry whose pinned SHA
7
-  no longer matches HF's `main` (or whose license/gating changed).
7
+  no longer matches its live fetch source (or whose license/gating /
8
+  provenance changed).
89
   Exit 0.
910
 - `--check`: exit 1 if *any* entry has drifted. Used by the weekly
1011
   CI job to open an issue when maintainer action is needed.
1112
 
1213
 Does **not** write back to `registry.py` automatically — drifted SHAs
1314
 are a signal for a human to review the upstream change (new license
14
-terms, tokenizer surgery, etc.). The script prints the ready-to-paste
15
-field values so the manual update is trivial.
15
+terms, tokenizer surgery, provenance changes, etc.). The script prints
16
+the ready-to-paste field values so the manual update is trivial.
1617
 
1718
 Usage:
1819
     uv run python scripts/refresh-registry.py            # print diff
@@ -23,67 +24,9 @@ from __future__ import annotations
2324
 
2425
 import argparse
2526
 import sys
26
-from dataclasses import dataclass
2727
 
28
-from huggingface_hub import HfApi
29
-from huggingface_hub.errors import GatedRepoError, RepositoryNotFoundError
30
-
31
-from dlm.base_models import BASE_MODELS, BaseModelSpec
32
-
33
-
34
-@dataclass(frozen=True)
35
-class Drift:
36
-    """Structured diff between a local registry entry and HF's head."""
37
-
38
-    key: str
39
-    hf_id: str
40
-    fields: tuple[tuple[str, str, str], ...]  # (name, pinned, observed)
41
-
42
-    def render(self) -> str:
43
-        lines = [f"  {self.key} ({self.hf_id})"]
44
-        for name, pinned, observed in self.fields:
45
-            lines.append(f"    {name:<22} {pinned!r} → {observed!r}")
46
-        return "\n".join(lines)
47
-
48
-
49
-def _check_entry(api: HfApi, entry: BaseModelSpec) -> Drift | None:
50
-    try:
51
-        info = api.model_info(entry.hf_id)
52
-    except GatedRepoError:
53
-        # Gated models still expose public metadata via `model_info`;
54
-        # if we can't read them, that's a new gating event worth flagging.
55
-        return Drift(
56
-            key=entry.key,
57
-            hf_id=entry.hf_id,
58
-            fields=(("gating", "readable", "now fully gated"),),
59
-        )
60
-    except RepositoryNotFoundError:
61
-        return Drift(
62
-            key=entry.key,
63
-            hf_id=entry.hf_id,
64
-            fields=(("repository", "present", "missing (renamed or deleted)"),),
65
-        )
66
-
67
-    drifted: list[tuple[str, str, str]] = []
68
-
69
-    current_sha = info.sha
70
-    if current_sha and current_sha != entry.revision:
71
-        drifted.append(("revision", entry.revision, current_sha))
72
-
73
-    gated = getattr(info, "gated", False)
74
-    # HF reports `gated` as False / "auto" / "manual". Non-False values
75
-    # mean acceptance is required.
76
-    gated_observed = bool(gated and gated != "False")
77
-    if gated_observed != entry.requires_acceptance:
78
-        drifted.append(
79
-            (
80
-                "requires_acceptance",
81
-                str(entry.requires_acceptance),
82
-                str(gated_observed),
83
-            ),
84
-        )
85
-
86
-    return Drift(key=entry.key, hf_id=entry.hf_id, fields=tuple(drifted)) if drifted else None
28
+from dlm.base_models import BASE_MODELS
29
+from dlm.base_models.registry_refresh import check_registry
8730
 
8831
 
8932
 def main() -> int:
@@ -95,15 +38,10 @@ def main() -> int:
9538
     )
9639
     args = parser.parse_args()
9740
 
98
-    api = HfApi()
99
-    drifts: list[Drift] = []
100
-    for entry in BASE_MODELS.values():
101
-        drift = _check_entry(api, entry)
102
-        if drift is not None:
103
-            drifts.append(drift)
41
+    drifts = check_registry()
10442
 
10543
     if not drifts:
106
-        print(f"All {len(BASE_MODELS)} registry entries match HF.")
44
+        print(f"All {len(BASE_MODELS)} registry entries match their live sources.")
10745
         return 0
10846
 
10947
     print(f"{len(drifts)} of {len(BASE_MODELS)} entries have drifted:")
@@ -111,7 +49,7 @@ def main() -> int:
11149
         print(drift.render())
11250
     print()
11351
     print(
114
-        "Review each upstream change (commit log / license / gating) and "
52
+        "Review each upstream change (commit log / license / gating / provenance) and "
11553
         "update `src/dlm/base_models/registry.py` by hand."
11654
     )
11755
 
src/dlm/base_models/errors.py (modified)
@@ -63,7 +63,7 @@ class ProbeFailedError(BaseModelError):
6363
 
6464
 
6565
 class GatedModelError(BaseModelError):
66
-    """Model requires HuggingFace license acceptance and the user hasn't accepted.
66
+    """Model requires license acceptance and the user hasn't accepted.
6767
 
6868
     Lives here because registry probes catch it first; the acceptance
6969
     record is written elsewhere, but the error shape is owned here.
@@ -74,7 +74,7 @@ class GatedModelError(BaseModelError):
7474
         self.license_url = license_url
7575
         where = f" License: {license_url}" if license_url else ""
7676
         super().__init__(
77
-            f"{hf_id} is a gated HuggingFace model. Accept the license and "
77
+            f"{hf_id} requires license acceptance. Accept the license and "
7878
             f"pass --i-accept-license (or via `dlm init`).{where}"
7979
         )
8080
 
src/dlm/base_models/registry.py (modified)
@@ -15,11 +15,12 @@ Notes on individual entries:
1515
   plus a pack-time attestation checkbox would encode this properly —
1616
   deferred follow-up work. Until then, users at the scale threshold
1717
   must consult the license text themselves.
18
-- Llama-3.2 / 3.3 models are gated on HuggingFace
19
-  (`requires_acceptance=True`) and their license does NOT permit
20
-  bundling into a `.dlm.pack`
21
-  (`redistributable=False`) — enforced by the pack gate and
22
-  share-protocol refusal.
18
+- Llama-3.2 models are gated on HuggingFace. Llama-3.3 8B currently
19
+  needs a mirror-backed fetch path because Meta exposes it through the
20
+  Llama API but not a first-party HF repo. DLM still keeps the same
21
+  acceptance + non-redistribution policy surface for the whole Llama
22
+  family (`requires_acceptance=True`, `redistributable=False`) —
23
+  enforced by the pack gate and share-protocol refusal.
2324
 - SmolLM2 / SmolLM3 and Phi-3.5-mini are permissive (Apache-2.0 / MIT).
2425
 - `size_gb_fp16` is approximate; the hardware doctor uses it to seed
2526
   VRAM estimates, which then get refined by runtime checks.
@@ -218,11 +219,14 @@ _ENTRIES: tuple[BaseModelSpec, ...] = (
218219
     ),
219220
     BaseModelSpec(
220221
         key="llama-3.3-8b-instruct",
221
-        hf_id="meta-llama/Llama-3.3-8B-Instruct",
222
-        # Placeholder SHA: format-valid, not a real HF commit. The
223
-        # weekly `scripts/refresh-registry.py --check` run surfaces
224
-        # drift and prints the live value for manual review.
225
-        revision="4d5e6f7890abcdeffedcba0987654321abc2d3e4",
222
+        # Meta's first-party LlamaCon announcement explicitly says the
223
+        # Llama API can fine-tune "o novo modelo Llama 3.3 8B", but
224
+        # there is still no first-party HF repo. DLM therefore fetches
225
+        # weights from the community mirror below while
226
+        # refresh-registry separately probes Meta's newsroom article
227
+        # for provenance.
228
+        hf_id="allura-forge/Llama-3.3-8B-Instruct",
229
+        revision="df95224cf87c32d9f4958dd284a07ded620aa4fc",
226230
         architecture="LlamaForCausalLM",
227231
         params=8_000_000_000,
228232
         target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
@@ -237,6 +241,11 @@ _ENTRIES: tuple[BaseModelSpec, ...] = (
237241
         context_length=131_072,
238242
         context_length_effective=8_192,
239243
         recommended_seq_len=4096,
244
+        refresh_check_hf_gating=False,
245
+        provenance_url=(
246
+            "https://about.fb.com/br/news/2025/04/tudo-o-que-anunciamos-no-nosso-primeiro-llamacon/"
247
+        ),
248
+        provenance_match_text="novo modelo Llama 3.3 8B",
240249
     ),
241250
     BaseModelSpec(
242251
         key="smollm3-3b",
src/dlm/base_models/registry_refresh.py (added)
@@ -0,0 +1,119 @@
1
+"""Live drift checks for curated base-model registry entries."""
2
+
3
+from __future__ import annotations
4
+
5
+from collections.abc import Callable
6
+from dataclasses import dataclass
7
+from urllib.error import HTTPError, URLError
8
+from urllib.request import Request, urlopen
9
+
10
+from huggingface_hub import HfApi
11
+from huggingface_hub.errors import GatedRepoError, RepositoryNotFoundError
12
+
13
+from dlm.base_models import BASE_MODELS, BaseModelSpec
14
+
15
+_USER_AGENT = "DocumentLanguageModel/registry-refresh"
16
+FetchText = Callable[[str], str]
17
+
18
+
19
+@dataclass(frozen=True)
20
+class Drift:
21
+    """Structured diff between a local registry entry and its live sources."""
22
+
23
+    key: str
24
+    hf_id: str
25
+    fields: tuple[tuple[str, str, str], ...]
26
+
27
+    def render(self) -> str:
28
+        lines = [f"  {self.key} ({self.hf_id})"]
29
+        for name, pinned, observed in self.fields:
30
+            lines.append(f"    {name:<22} {pinned!r} → {observed!r}")
31
+        return "\n".join(lines)
32
+
33
+
34
+def fetch_text(url: str) -> str:
35
+    """Fetch `url` as text for provenance checks."""
36
+
37
+    req = Request(url, headers={"User-Agent": _USER_AGENT})
38
+    with urlopen(req, timeout=15) as resp:
39
+        body = bytes(resp.read())
40
+        charset = str(resp.headers.get_content_charset() or "utf-8")
41
+    return body.decode(charset, errors="replace")
42
+
43
+
44
+def check_entry(
45
+    api: HfApi,
46
+    entry: BaseModelSpec,
47
+    *,
48
+    fetch_url_text: FetchText = fetch_text,
49
+) -> Drift | None:
50
+    """Return a structured drift report for one curated entry, if any."""
51
+
52
+    try:
53
+        info = api.model_info(entry.hf_id)
54
+    except GatedRepoError:
55
+        return Drift(
56
+            key=entry.key,
57
+            hf_id=entry.hf_id,
58
+            fields=(("gating", "readable", "now fully gated"),),
59
+        )
60
+    except RepositoryNotFoundError:
61
+        return Drift(
62
+            key=entry.key,
63
+            hf_id=entry.hf_id,
64
+            fields=(("repository", "present", "missing (renamed or deleted)"),),
65
+        )
66
+
67
+    drifted: list[tuple[str, str, str]] = []
68
+
69
+    current_sha = info.sha
70
+    if current_sha and current_sha != entry.revision:
71
+        drifted.append(("revision", entry.revision, current_sha))
72
+
73
+    if entry.refresh_check_hf_gating:
74
+        gated = getattr(info, "gated", False)
75
+        gated_observed = bool(gated and gated != "False")
76
+        if gated_observed != entry.requires_acceptance:
77
+            drifted.append(
78
+                (
79
+                    "requires_acceptance",
80
+                    str(entry.requires_acceptance),
81
+                    str(gated_observed),
82
+                ),
83
+            )
84
+
85
+    if entry.provenance_url and entry.provenance_match_text:
86
+        expected = entry.provenance_match_text
87
+        try:
88
+            page = fetch_url_text(entry.provenance_url)
89
+        except (HTTPError, URLError, TimeoutError, ValueError) as exc:
90
+            drifted.append(
91
+                (
92
+                    "provenance_url",
93
+                    f"{entry.provenance_url} contains {expected!r}",
94
+                    f"unreachable ({type(exc).__name__})",
95
+                )
96
+            )
97
+        else:
98
+            if expected.casefold() not in page.casefold():
99
+                drifted.append(
100
+                    (
101
+                        "provenance_marker",
102
+                        expected,
103
+                        f"missing from {entry.provenance_url}",
104
+                    )
105
+                )
106
+
107
+    return Drift(key=entry.key, hf_id=entry.hf_id, fields=tuple(drifted)) if drifted else None
108
+
109
+
110
+def check_registry(*, fetch_url_text: FetchText = fetch_text) -> list[Drift]:
111
+    """Check every curated entry and return drift reports."""
112
+
113
+    api = HfApi()
114
+    drifts: list[Drift] = []
115
+    for entry in BASE_MODELS.values():
116
+        drift = check_entry(api, entry, fetch_url_text=fetch_url_text)
117
+        if drift is not None:
118
+            drifts.append(drift)
119
+    return drifts
src/dlm/base_models/schema.py (modified)
@@ -16,6 +16,10 @@ point:
1616
 - `reasoning_tuned` / `context_length_effective`: additive registry
1717
   hints for prompt defaults and realistic doctor estimates. The
1818
   effective length defaults to the nominal context window when unset.
19
+- `refresh_check_hf_gating` / `provenance_url` /
20
+  `provenance_match_text`: live-registry refresh hints for entries
21
+  whose fetch mirror and first-party provenance page are not the same
22
+  system.
1923
 - License / gating: separate fields for SPDX, acceptance gating, and
2024
   re-distribution — each consumed by a different policy gate (license
2125
   acceptance, pack `--include-base`, share-protocol refusal).
@@ -141,6 +145,9 @@ class BaseModelSpec(BaseModel):
141145
     context_length_effective: int | None = Field(None, gt=0)
142146
     recommended_seq_len: int = Field(..., gt=0)
143147
     reasoning_tuned: bool = False
148
+    refresh_check_hf_gating: bool = True
149
+    provenance_url: str | None = None
150
+    provenance_match_text: str | None = None
144151
 
145152
     # Modality + multi-modal preprocessing (schema v10 + v11, plus
146153
     # Sprint 40's additive `text-moe` discriminator).
@@ -202,6 +209,21 @@ class BaseModelSpec(BaseModel):
202209
             )
203210
         return self
204211
 
212
+    @model_validator(mode="after")
213
+    def _provenance_probe_is_complete(self) -> BaseModelSpec:
214
+        url_set = self.provenance_url is not None
215
+        text_set = self.provenance_match_text is not None
216
+        if url_set != text_set:
217
+            raise ValueError(
218
+                f"base {self.key!r}: provenance_url and provenance_match_text must be set together"
219
+            )
220
+        if not self.refresh_check_hf_gating and not url_set:
221
+            raise ValueError(
222
+                f"base {self.key!r}: refresh_check_hf_gating=False requires a "
223
+                "first-party provenance_url + provenance_match_text"
224
+            )
225
+        return self
226
+
205227
     @property
206228
     def suggested_prompt_temperature(self) -> float:
207229
         """Default sampling temperature for `dlm prompt`.
tests/unit/base_models/test_errors.py (modified)
@@ -54,6 +54,7 @@ class TestGatedModelError:
5454
         assert "meta-llama/Llama-3.2-1B-Instruct" in msg
5555
         assert "https://example.com/license" in msg
5656
         assert "--i-accept-license" in msg
57
+        assert "requires license acceptance" in msg
5758
 
5859
     def test_no_license_url_still_renders(self) -> None:
5960
         err = GatedModelError("org/gated", None)
tests/unit/base_models/test_registry_2026.py (modified)
@@ -62,8 +62,9 @@ class TestLlama33RegistryEntry:
6262
     def test_entry_present(self) -> None:
6363
         assert "llama-3.3-8b-instruct" in BASE_MODELS
6464
 
65
-    def test_follows_existing_llama_gating_pattern(self) -> None:
65
+    def test_keeps_existing_llama_policy_surface(self) -> None:
6666
         spec = BASE_MODELS["llama-3.3-8b-instruct"]
67
+        assert spec.hf_id == "allura-forge/Llama-3.3-8B-Instruct"
6768
         assert spec.architecture == "LlamaForCausalLM"
6869
         assert spec.template == "llama3"
6970
         assert spec.gguf_arch == "llama"
@@ -71,6 +72,14 @@ class TestLlama33RegistryEntry:
7172
         assert spec.redistributable is False
7273
         assert spec.license_spdx == "Other"
7374
 
75
+    def test_refreshes_against_mirror_plus_official_provenance_page(self) -> None:
76
+        spec = BASE_MODELS["llama-3.3-8b-instruct"]
77
+        assert spec.refresh_check_hf_gating is False
78
+        assert spec.provenance_url == (
79
+            "https://about.fb.com/br/news/2025/04/tudo-o-que-anunciamos-no-nosso-primeiro-llamacon/"
80
+        )
81
+        assert spec.provenance_match_text == "novo modelo Llama 3.3 8B"
82
+
7483
     def test_effective_context_hint_is_lower_than_nominal(self) -> None:
7584
         spec = BASE_MODELS["llama-3.3-8b-instruct"]
7685
         assert spec.context_length == 131_072
tests/unit/base_models/test_registry_refresh.py (added)
@@ -0,0 +1,82 @@
1
+"""Live-drift helper coverage for registry refresh."""
2
+
3
+from __future__ import annotations
4
+
5
+from types import SimpleNamespace
6
+
7
+from dlm.base_models.registry_refresh import Drift, check_entry
8
+from dlm.base_models.schema import BaseModelSpec
9
+
10
+
11
+def _spec(**overrides: object) -> BaseModelSpec:
12
+    defaults: dict[str, object] = {
13
+        "key": "demo-1b",
14
+        "hf_id": "org/demo-1b",
15
+        "revision": "0123456789abcdef0123456789abcdef01234567",
16
+        "architecture": "DemoForCausalLM",
17
+        "params": 1_000_000_000,
18
+        "target_modules": ["q_proj", "v_proj"],
19
+        "template": "chatml",
20
+        "gguf_arch": "demo",
21
+        "tokenizer_pre": "demo",
22
+        "license_spdx": "Apache-2.0",
23
+        "redistributable": True,
24
+        "size_gb_fp16": 2.0,
25
+        "context_length": 4096,
26
+        "recommended_seq_len": 2048,
27
+    }
28
+    defaults.update(overrides)
29
+    return BaseModelSpec.model_validate(defaults)
30
+
31
+
32
+class _Api:
33
+    def __init__(self, *, sha: str, gated: object = False) -> None:
34
+        self._info = SimpleNamespace(sha=sha, gated=gated)
35
+
36
+    def model_info(self, _hf_id: str) -> SimpleNamespace:
37
+        return self._info
38
+
39
+
40
+class TestCheckEntry:
41
+    def test_no_drift_when_revision_and_gating_match(self) -> None:
42
+        spec = _spec()
43
+        drift = check_entry(_Api(sha=spec.revision), spec)
44
+        assert drift is None
45
+
46
+    def test_revision_drift_is_reported(self) -> None:
47
+        spec = _spec()
48
+        drift = check_entry(_Api(sha="a" * 40), spec)
49
+        assert isinstance(drift, Drift)
50
+        assert ("revision", spec.revision, "a" * 40) in drift.fields
51
+
52
+    def test_gating_drift_is_skipped_when_entry_opts_out(self) -> None:
53
+        spec = _spec(
54
+            requires_acceptance=True,
55
+            refresh_check_hf_gating=False,
56
+            provenance_url="https://example.com/provenance",
57
+            provenance_match_text="official marker",
58
+        )
59
+        drift = check_entry(
60
+            _Api(sha=spec.revision, gated=False),
61
+            spec,
62
+            fetch_url_text=lambda _url: "official marker",
63
+        )
64
+        assert drift is None
65
+
66
+    def test_provenance_marker_missing_is_reported(self) -> None:
67
+        spec = _spec(
68
+            refresh_check_hf_gating=False,
69
+            provenance_url="https://example.com/provenance",
70
+            provenance_match_text="official marker",
71
+        )
72
+        drift = check_entry(
73
+            _Api(sha=spec.revision),
74
+            spec,
75
+            fetch_url_text=lambda _url: "different text",
76
+        )
77
+        assert isinstance(drift, Drift)
78
+        assert (
79
+            "provenance_marker",
80
+            "official marker",
81
+            "missing from https://example.com/provenance",
82
+        ) in drift.fields
tests/unit/base_models/test_schema.py (modified)
@@ -167,6 +167,28 @@ class TestSprint40Substrate:
167167
         spec = _minimal(context_length=8192, context_length_effective=4096)
168168
         assert spec.effective_context_length == 4096
169169
 
170
+    def test_refresh_hf_gating_check_defaults_true(self) -> None:
171
+        assert _minimal().refresh_check_hf_gating is True
172
+
173
+    def test_provenance_probe_requires_url_and_marker_together(self) -> None:
174
+        with pytest.raises(ValidationError, match="must be set together"):
175
+            _minimal(provenance_url="https://example.com")
176
+        with pytest.raises(ValidationError, match="must be set together"):
177
+            _minimal(provenance_match_text="marker")
178
+
179
+    def test_disabling_hf_gating_check_requires_provenance_probe(self) -> None:
180
+        with pytest.raises(ValidationError, match="requires a first-party provenance_url"):
181
+            _minimal(refresh_check_hf_gating=False)
182
+
183
+    def test_disabling_hf_gating_check_with_provenance_is_valid(self) -> None:
184
+        spec = _minimal(
185
+            refresh_check_hf_gating=False,
186
+            provenance_url="https://example.com/provenance",
187
+            provenance_match_text="official marker",
188
+        )
189
+        assert spec.refresh_check_hf_gating is False
190
+        assert spec.provenance_match_text == "official marker"
191
+
170192
 
171193
 class TestImmutability:
172194
     def test_spec_is_frozen(self) -> None: