"""S25 — gradient_ghost integration tests. Two flavors: 1. **Real-store (skipped on CI):** runs against a known-undertrained adapter at ``~/.dlm/store/01KPPFAB2Z6DWCWY0QV702TSTX/`` if present. This is the prove-the-value test the sprint DoD requires on a real dlm-trained adapter. Skipped cleanly when the store is absent so CI without local dlm install still passes. 2. **Synthetic-converged (runs everywhere):** writes a fully-formed converged training_state.pt + matching safetensors fixture and asserts PASS. Pairs with the real-store FAIL case to give end- to-end "FAIL on undertrained, PASS on converged" coverage in CI. Marked ``slow + online`` because building a synthetic converged training_state.pt requires torch-pickle round-tripping a real-shape optimizer state — heavier than a unit test should be. """ from __future__ import annotations from pathlib import Path import numpy as np import pytest torch = pytest.importorskip("torch", reason="needs the [hf] extra (torch)") safetensors_numpy = pytest.importorskip( "safetensors.numpy", reason="needs the [hf] extra (safetensors)" ) from dlm_sway.core.result import Verdict # noqa: E402 from dlm_sway.probes.base import RunContext, build_probe # noqa: E402 pytestmark = [pytest.mark.slow, pytest.mark.online] _REAL_STORE_PATH = ( Path.home() / ".dlm" / "store" / "01KPPFAB2Z6DWCWY0QV702TSTX" / "adapter" / "versions" / "v0001" ) def test_real_undertrained_dlm_store_fails(tmp_path: Path) -> None: """If a known dlm-trained undertrained adapter is on disk, the probe must FAIL on it. Skipped on machines without the local fixture (CI). The store was the ground-truth artifact that drove the sprint design — it was a real ``--max-steps 2`` smoke-test run. """ if not (_REAL_STORE_PATH / "training_state.pt").exists(): pytest.skip( f"no dlm store fixture at {_REAL_STORE_PATH} — skipping the " "real-adapter prove-the-value test (synthetic test below " "still runs)" ) probe, spec = build_probe( { "name": "gg_real", "kind": "gradient_ghost", "adapter_path": str(_REAL_STORE_PATH), } ) result = probe.run(spec, RunContext()) assert result.verdict == Verdict.FAIL, ( f"expected FAIL on a known-undertrained dlm store, got {result.verdict}: {result.message}" ) # The real fixture is global_step=2 — a clean primary-signal hit. assert result.evidence["global_step"] < 50 assert result.evidence["primary_signal"] in ( "global_step_below_threshold", "all_optimizer_state_nan", ) def _build_converged_fixture(adapter_dir: Path) -> int: """Write a synthetic 'converged' adapter pair. - safetensors with realistic per-layer LoRA tensor names - training_state.pt with global_step=500 (well above threshold) and a flat per-param exp_avg_sq distribution (no layer crosses the per-layer ratio). """ adapter_dir.mkdir(parents=True, exist_ok=True) num_layers = 4 target_modules = ("q_proj", "v_proj") rank = 8 in_features = 64 weights: dict[str, np.ndarray] = {} for layer_idx in range(num_layers): for mod in target_modules: base = f"base_model.model.model.layers.{layer_idx}.self_attn.{mod}" weights[f"{base}.lora_A.weight"] = np.zeros((rank, in_features), dtype=np.float32) weights[f"{base}.lora_B.weight"] = np.zeros((in_features, rank), dtype=np.float32) safetensors_numpy.save_file(weights, str(adapter_dir / "adapter_model.safetensors")) num_keys = len(weights) # Flat distribution: every param's exp_avg_sq is 0.1 (a small but # finite value typical of a converged Adam state). 
    state_dict: dict[int, dict[str, object]] = {}
    for pid in range(num_keys):
        state_dict[pid] = {
            "step": torch.tensor(500.0),
            "exp_avg": torch.zeros((4,), dtype=torch.float32),
            "exp_avg_sq": torch.full((4,), 0.1, dtype=torch.float32),
        }

    payload = {
        "optimizer_state_dict": {
            "state": state_dict,
            "param_groups": [{"lr": 1e-4, "params": list(range(num_keys))}],
        },
        "scheduler_state_dict": {},
        "scaler_state_dict": None,
        "torch_rng_state": torch.zeros(8, dtype=torch.uint8),
        "cuda_rng_state": None,
        "numpy_rng_state": None,
        "python_random_state": None,
        "global_step": 500,
        "epoch": 5.0,
        "best_val_loss": 0.42,
        "dlm_manifest_hash": None,
        "base_model_revision": "synthetic-test-fixture",
        "pinned_versions": {"torch": "2.11.0"},
        "use_qlora": False,
    }
    torch.save(payload, str(adapter_dir / "training_state.pt"))
    return num_keys


def test_synthetic_converged_adapter_passes(tmp_path: Path) -> None:
    """A hand-rolled converged training_state (global_step=500, flat
    exp_avg_sq distribution) must PASS.

    Together with the real-store FAIL test above, covers the sprint's
    prove-the-value: 'undertrained → FAIL, converged → PASS'.
    """
    adapter_dir = tmp_path / "synthetic-converged"
    _build_converged_fixture(adapter_dir)

    probe, spec = build_probe(
        {
            "name": "gg_synth",
            "kind": "gradient_ghost",
            "adapter_path": str(adapter_dir),
        }
    )
    result = probe.run(spec, RunContext())

    assert result.verdict == Verdict.PASS, (
        f"expected PASS on a synthetic converged adapter, "
        f"got {result.verdict}: {result.message}"
    )
    assert result.evidence["global_step"] == 500
    assert result.evidence["frac_layers_undertrained"] == 0.0
    assert result.evidence["num_layers"] == 4


def test_runner_skips_backend_for_pure_pre_run_suite(tmp_path: Path) -> None:
    """End-to-end: a suite containing only gradient_ghost runs successfully
    with backend=None.

    Confirms the S25 P5 runner contract holds end-to-end (not just at the
    probe level).
    """
    from dlm_sway.core.model import ModelSpec
    from dlm_sway.suite.runner import run as run_suite
    from dlm_sway.suite.spec import SuiteDefaults, SuiteModels, SwaySpec

    adapter_dir = tmp_path / "synthetic-converged"
    _build_converged_fixture(adapter_dir)

    spec = SwaySpec(
        version=1,
        models=SuiteModels(
            base=ModelSpec(base="dummy", kind="dummy"),
            ft=ModelSpec(base="dummy", kind="dummy", adapter=adapter_dir),
        ),
        defaults=SuiteDefaults(seed=0),
        suite=[
            {
                "name": "gg",
                "kind": "gradient_ghost",
                "adapter_path": str(adapter_dir),
            },
        ],
    )
    result = run_suite(spec, backend=None, spec_path="")

    assert len(result.probes) == 1
    assert result.probes[0].verdict == Verdict.PASS
    # No backend, no backend stats.
    assert result.backend_stats == {}
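
# Local usage note (assumed, based only on the markers declared in ``pytestmark``
# above): a fast run that skips these integration tests can deselect them with a
# standard pytest marker expression, e.g. ``pytest -m "not slow and not online"``.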