tenseleyflow/documentlanguagemodel / add9a24

Browse files

Block sprint jargon leaks

Authored by espadonne
SHA
add9a2416eea1fd02b927c78c3ccb234fd34b3e7
Parents
d8be404
Tree
5214717

10 changed files

Status | File | + | -
M scripts/pregate.sh 43 0
M src/dlm/export/preflight.py 2 2
M src/dlm/export/weighted_merge.py 1 1
M src/dlm/hardware/refusals.py 1 1
M src/dlm/store/errors.py 1 1
M src/dlm/templates/fetcher.py 2 2
M tests/unit/export/test_preflight.py 2 1
M tests/unit/hardware/test_refusals.py 2 1
M tests/unit/store/test_manifest.py 2 0
M tests/unit/templates/test_cli.py 2 0
scripts/pregate.sh (modified)
@@ -72,6 +72,49 @@ if [[ -n "$scatter" ]]; then
7272
     exit 1
7373
 fi
7474
 
75
+echo "==> new sprint jargon in src/dlm"
76
+# Sprint 39 M4: planning terms like `Sprint 23` or `audit-08` should
77
+# not leak into newly added product/runtime strings under src/dlm.
78
+# Compare the current tree against the upstream merge-base when one
79
+# exists, so committed fixes in the working tree override older
80
+# branch-local additions that have not been pushed yet.
81
+collect_src_dlm_diff() {
82
+    local upstream
83
+    upstream=$(git rev-parse --abbrev-ref --symbolic-full-name '@{upstream}' 2>/dev/null || true)
84
+    if [[ -n "$upstream" ]]; then
85
+        local merge_base
86
+        merge_base=$(git merge-base "$upstream" HEAD 2>/dev/null || true)
87
+        if [[ -n "$merge_base" ]]; then
88
+            git diff --unified=0 --no-color "$merge_base" -- 'src/dlm/**' 2>/dev/null || true
89
+            return
90
+        fi
91
+    fi
92
+
93
+    git diff --unified=0 --no-color HEAD -- 'src/dlm/**' 2>/dev/null || true
94
+}
95
+
96
+jargon_hits=$(
97
+    collect_src_dlm_diff | awk '
98
+        /^diff --git / {
99
+            file = $4
100
+            sub("^b/", "", file)
101
+            next
102
+        }
103
+        /^\+\+\+ b\// {
104
+            file = substr($0, 7)
105
+            next
106
+        }
107
+        /^\+[^+]/ && ($0 ~ /Sprint [0-9]+/ || $0 ~ /audit-[0-9]+/) {
108
+            print file ":" substr($0, 2)
109
+        }
110
+    ' | sort -u
111
+)
112
+if [[ -n "$jargon_hits" ]]; then
113
+    echo "$jargon_hits"
114
+    echo "  new Sprint/audit jargon leaked into src/dlm/ — translate it into product or operator language."
115
+    exit 1
116
+fi
117
+
75118
 echo "==> stale dlm_version pin"
76119
 # Any test that hard-pins a frontmatter version exact-match should use
77120
 # >= so schema bumps don't retroactively break the test. Exact pins are
src/dlm/export/preflight.py (modified)
@@ -86,8 +86,8 @@ def check_tokenizer_vocab(adapter_dir: Path) -> int:
8686
             probe="tokenizer_vocab",
8787
             detail=(
8888
                 f"adapter dir {adapter_dir} is missing tokenizer_config.json. "
89
-                "Sprint 07 bringup writes this at training end; a checkpoint "
90
-                "predating Sprint 07 can't be exported — re-train."
89
+                "This checkpoint predates tokenizer metadata capture, so "
90
+                "export cannot verify vocab size safely — re-train."
9191
             ),
9292
         )
9393
     try:
src/dlm/export/weighted_merge.py (modified)
@@ -300,7 +300,7 @@ def build_and_stage( # pragma: no cover - heavy path
300300
     merge_dir = store.cache_dir_for("_export_merged_" + "_".join(e.name for e in entries))
301301
     # Copy tokenizer + training_run.json from a source adapter so the
302302
     # downstream preflight (tokenizer_vocab) + shared precision-safety
303
-    # gate both work on the composite (audit-07 B2).
303
+    # gate both work on the composite artifact.
304304
     first_source = resolve_first_source_path(store, entries)
305305
     return save_merged_to_tmp(
306306
         merged,
src/dlm/hardware/refusals.py (modified)
@@ -130,7 +130,7 @@ def check_multi_gpu_refusals(caps: Capabilities, world_size: int) -> None:
130130
         )
131131
     if caps.backend == Backend.ROCM:
132132
         raise ResolutionError(
133
-            "Multi-GPU training on ROCm is out of scope for Sprint 23; "
133
+            "Multi-GPU training on ROCm is not supported yet; "
134134
             "train single-GPU on ROCm or use a CUDA host for multi-GPU runs.",
135135
         )
136136
     # CUDA path — heterogeneous detection is the caller's responsibility
src/dlm/store/errors.py (modified)
@@ -36,7 +36,7 @@ class ManifestVersionError(ManifestCorruptError):
3636
         self.expected_version = expected_version
3737
         reason = (
3838
             f"schema_version {found_version} requires migration to {expected_version} "
39
-            "(Sprint 12b owns the framework)"
39
+            "before this store can be used"
4040
         )
4141
         super().__init__(path, reason)
4242
 
src/dlm/templates/fetcher.py (modified)
@@ -58,6 +58,6 @@ def fetch_all(
5858
     this always raises `RemoteFetchUnavailable`.
5959
     """
6060
     raise RemoteFetchUnavailable(
61
-        "remote template gallery fetch is not wired yet — upstream repo "
62
-        "and signing key are pending (Sprint 27 deferred polish).",
61
+        "remote template gallery fetch is not available yet — no pinned "
62
+        "upstream gallery or signing key has been configured.",
6363
     )
tests/unit/export/test_preflight.py (modified)
@@ -76,8 +76,9 @@ class TestTokenizerVocab:
7676
         assert check_tokenizer_vocab(tmp_path) == 5000
7777
 
7878
     def test_missing_tokenizer_config_raises(self, tmp_path: Path) -> None:
79
-        with pytest.raises(PreflightError, match="tokenizer_config.json"):
79
+        with pytest.raises(PreflightError, match="tokenizer metadata capture") as exc_info:
8080
             check_tokenizer_vocab(tmp_path)
81
+        assert "Sprint" not in str(exc_info.value)
8182
 
8283
     def test_malformed_config_raises(self, tmp_path: Path) -> None:
8384
         (tmp_path / "tokenizer_config.json").write_text("not json {{{")
tests/unit/hardware/test_refusals.py (modified)
@@ -129,8 +129,9 @@ class TestMultiGpuRefusals:
129129
     ) -> None:
130130
         with ctx():
131131
             caps = probe()
132
-        with pytest.raises(ResolutionError, match=expected):
132
+        with pytest.raises(ResolutionError, match=expected) as exc_info:
133133
             check_multi_gpu_refusals(caps, world_size=2)
134
+        assert "Sprint" not in str(exc_info.value)
134135
 
135136
     def test_homogeneous_cuda_accepts_unknown_or_matching_sms(self) -> None:
136137
         assert_homogeneous_cuda([(8, 0), (8, 0), None])
tests/unit/store/test_manifest.py (modified)
@@ -195,6 +195,8 @@ class TestCorruptHandling:
195195
             load_manifest(path)
196196
         assert exc_info.value.found_version == 999
197197
         assert exc_info.value.expected_version == CURRENT_MANIFEST_SCHEMA_VERSION
198
+        assert "requires migration to" in str(exc_info.value)
199
+        assert "Sprint" not in str(exc_info.value)
198200
         # Still catchable as the parent class:
199201
         assert isinstance(exc_info.value, ManifestCorruptError)
200202
 
tests/unit/templates/test_cli.py (modified)
@@ -47,6 +47,8 @@ def test_templates_list_refresh_falls_back_to_bundled() -> None:
4747
     # Refresh is currently a no-op that warns + falls back; the bundled
4848
     # list still renders and the exit code is 0.
4949
     assert result.exit_code == 0, result.output
50
+    assert "not available yet" in result.output
51
+    assert "Sprint" not in result.output
5052
     assert "coding-tutor" in result.output
5153
 
5254