tenseleyflow/documentlanguagemodel / add9a24

Browse files

Block sprint jargon leaks

Authored by espadonne
SHA
add9a2416eea1fd02b927c78c3ccb234fd34b3e7
Parents
d8be404
Tree
5214717

10 changed files

Status | File | + | -
M scripts/pregate.sh 43 0
M src/dlm/export/preflight.py 2 2
M src/dlm/export/weighted_merge.py 1 1
M src/dlm/hardware/refusals.py 1 1
M src/dlm/store/errors.py 1 1
M src/dlm/templates/fetcher.py 2 2
M tests/unit/export/test_preflight.py 2 1
M tests/unit/hardware/test_refusals.py 2 1
M tests/unit/store/test_manifest.py 2 0
M tests/unit/templates/test_cli.py 2 0
scripts/pregate.sh (modified)
@@ -72,6 +72,49 @@ if [[ -n "$scatter" ]]; then
7272
     exit 1
7373
 fi
7474
 
75
+echo "==> new sprint jargon in src/dlm"
76
+# Sprint 39 M4: planning terms like `Sprint 23` or `audit-08` should
77
+# not leak into newly added product/runtime strings under src/dlm.
78
+# Compare the current tree against the upstream merge-base when one
79
+# exists, so committed fixes in the working tree override older
80
+# branch-local additions that have not been pushed yet.
81
+collect_src_dlm_diff() {
82
+    local upstream
83
+    upstream=$(git rev-parse --abbrev-ref --symbolic-full-name '@{upstream}' 2>/dev/null || true)
84
+    if [[ -n "$upstream" ]]; then
85
+        local merge_base
86
+        merge_base=$(git merge-base "$upstream" HEAD 2>/dev/null || true)
87
+        if [[ -n "$merge_base" ]]; then
88
+            git diff --unified=0 --no-color "$merge_base" -- 'src/dlm/**' 2>/dev/null || true
89
+            return
90
+        fi
91
+    fi
92
+
93
+    git diff --unified=0 --no-color HEAD -- 'src/dlm/**' 2>/dev/null || true
94
+}
95
+
96
+jargon_hits=$(
97
+    collect_src_dlm_diff | awk '
98
+        /^diff --git / {
99
+            file = $4
100
+            sub("^b/", "", file)
101
+            next
102
+        }
103
+        /^\+\+\+ b\// {
104
+            file = substr($0, 7)
105
+            next
106
+        }
107
+        /^\+[^+]/ && ($0 ~ /Sprint [0-9]+/ || $0 ~ /audit-[0-9]+/) {
108
+            print file ":" substr($0, 2)
109
+        }
110
+    ' | sort -u
111
+)
112
+if [[ -n "$jargon_hits" ]]; then
113
+    echo "$jargon_hits"
114
+    echo "  new Sprint/audit jargon leaked into src/dlm/ — translate it into product or operator language."
115
+    exit 1
116
+fi
117
+
75118
 echo "==> stale dlm_version pin"
76119
 # Any test that hard-pins a frontmatter version exact-match should use
77120
 # >= so schema bumps don't retroactively break the test. Exact pins are
src/dlm/export/preflight.py (modified)
@@ -86,8 +86,8 @@ def check_tokenizer_vocab(adapter_dir: Path) -> int:
8686
             probe="tokenizer_vocab",
8787
             detail=(
8888
                 f"adapter dir {adapter_dir} is missing tokenizer_config.json. "
89
-                "Sprint 07 bringup writes this at training end; a checkpoint "
90
-                "predating Sprint 07 can't be exported — re-train."
89
+                "This checkpoint predates tokenizer metadata capture, so "
90
+                "export cannot verify vocab size safely — re-train."
9191
             ),
9292
         )
9393
     try:
src/dlm/export/weighted_merge.py (modified)
@@ -300,7 +300,7 @@ def build_and_stage( # pragma: no cover - heavy path
300300
     merge_dir = store.cache_dir_for("_export_merged_" + "_".join(e.name for e in entries))
301301
     # Copy tokenizer + training_run.json from a source adapter so the
302302
     # downstream preflight (tokenizer_vocab) + shared precision-safety
303
-    # gate both work on the composite (audit-07 B2).
303
+    # gate both work on the composite artifact.
304304
     first_source = resolve_first_source_path(store, entries)
305305
     return save_merged_to_tmp(
306306
         merged,
src/dlm/hardware/refusals.py (modified)
@@ -130,7 +130,7 @@ def check_multi_gpu_refusals(caps: Capabilities, world_size: int) -> None:
130130
         )
131131
     if caps.backend == Backend.ROCM:
132132
         raise ResolutionError(
133
-            "Multi-GPU training on ROCm is out of scope for Sprint 23; "
133
+            "Multi-GPU training on ROCm is not supported yet; "
134134
             "train single-GPU on ROCm or use a CUDA host for multi-GPU runs.",
135135
         )
136136
     # CUDA path — heterogeneous detection is the caller's responsibility
src/dlm/store/errors.py (modified)
@@ -36,7 +36,7 @@ class ManifestVersionError(ManifestCorruptError):
3636
         self.expected_version = expected_version
3737
         reason = (
3838
             f"schema_version {found_version} requires migration to {expected_version} "
39
-            "(Sprint 12b owns the framework)"
39
+            "before this store can be used"
4040
         )
4141
         super().__init__(path, reason)
4242
 
src/dlm/templates/fetcher.py (modified)
@@ -58,6 +58,6 @@ def fetch_all(
5858
     this always raises `RemoteFetchUnavailable`.
5959
     """
6060
     raise RemoteFetchUnavailable(
61
-        "remote template gallery fetch is not wired yet — upstream repo "
62
-        "and signing key are pending (Sprint 27 deferred polish).",
61
+        "remote template gallery fetch is not available yet — no pinned "
62
+        "upstream gallery or signing key has been configured.",
6363
     )
tests/unit/export/test_preflight.py (modified)
@@ -76,8 +76,9 @@ class TestTokenizerVocab:
7676
         assert check_tokenizer_vocab(tmp_path) == 5000
7777
 
7878
     def test_missing_tokenizer_config_raises(self, tmp_path: Path) -> None:
79
-        with pytest.raises(PreflightError, match="tokenizer_config.json"):
79
+        with pytest.raises(PreflightError, match="tokenizer metadata capture") as exc_info:
8080
             check_tokenizer_vocab(tmp_path)
81
+        assert "Sprint" not in str(exc_info.value)
8182
 
8283
     def test_malformed_config_raises(self, tmp_path: Path) -> None:
8384
         (tmp_path / "tokenizer_config.json").write_text("not json {{{")
tests/unit/hardware/test_refusals.py (modified)
@@ -129,8 +129,9 @@ class TestMultiGpuRefusals:
129129
     ) -> None:
130130
         with ctx():
131131
             caps = probe()
132
-        with pytest.raises(ResolutionError, match=expected):
132
+        with pytest.raises(ResolutionError, match=expected) as exc_info:
133133
             check_multi_gpu_refusals(caps, world_size=2)
134
+        assert "Sprint" not in str(exc_info.value)
134135
 
135136
     def test_homogeneous_cuda_accepts_unknown_or_matching_sms(self) -> None:
136137
         assert_homogeneous_cuda([(8, 0), (8, 0), None])
tests/unit/store/test_manifest.py (modified)
@@ -195,6 +195,8 @@ class TestCorruptHandling:
195195
             load_manifest(path)
196196
         assert exc_info.value.found_version == 999
197197
         assert exc_info.value.expected_version == CURRENT_MANIFEST_SCHEMA_VERSION
198
+        assert "requires migration to" in str(exc_info.value)
199
+        assert "Sprint" not in str(exc_info.value)
198200
         # Still catchable as the parent class:
199201
         assert isinstance(exc_info.value, ManifestCorruptError)
200202
 
tests/unit/templates/test_cli.py (modified)
@@ -47,6 +47,8 @@ def test_templates_list_refresh_falls_back_to_bundled() -> None:
4747
     # Refresh is currently a no-op that warns + falls back; the bundled
4848
     # list still renders and the exit code is 0.
4949
     assert result.exit_code == 0, result.output
50
+    assert "not available yet" in result.output
51
+    assert "Sprint" not in result.output
5052
     assert "coding-tutor" in result.output
5153
 
5254