`c15794c`

tests/integration: gradient_ghost real-store FAIL + synthetic converged PASS + runner skip-backend e2e (S25 P8)

Authored by mfwolffe <wolffemf@dukes.jmu.edu> 2 weeks ago

SHA: c15794cf28952551bad07c0460419b48e2ef882d
Parents: 67dd8d0
Tree: 4c3d091

1 changed file

Status	File	+	-
A	`tests/integration/test_probe_gradient_ghost.py`	191	0

tests/integration/test_probe_gradient_ghost.py (added)

 +"""S25 — gradient_ghost integration tests.
++
 +Two flavors:
++
 +1. **Real-store (skipped on CI):** runs against a known-undertrained
 +   adapter at ``~/.dlm/store/01KPPFAB2Z6DWCWY0QV702TSTX/`` if
 +   present. This is the prove-the-value test the sprint DoD requires
 +   on a real dlm-trained adapter. Skipped cleanly when the store is
 +   absent so CI without local dlm install still passes.
 +2. **Synthetic-converged (runs everywhere):** writes a fully-formed
 +   converged training_state.pt + matching safetensors fixture and
 +   asserts PASS. Pairs with the real-store FAIL case to give
 +   end-to-end "FAIL on undertrained, PASS on converged" coverage;
 +   only the PASS half runs in CI, since the FAIL half needs the store.
++
 +Marked ``slow + online`` because building a synthetic converged
 +training_state.pt requires torch-pickle round-tripping a real-shape
 +optimizer state — heavier than a unit test should be.
 +"""
++
 +from __future__ import annotations
++
 +from pathlib import Path
++
 +import numpy as np
 +import pytest
++
 +torch = pytest.importorskip("torch", reason="needs the [hf] extra (torch)")
 +safetensors_numpy = pytest.importorskip(
 +    "safetensors.numpy", reason="needs the [hf] extra (safetensors)"
 +)
++
 +from dlm_sway.core.result import Verdict  # noqa: E402
 +from dlm_sway.probes.base import RunContext, build_probe  # noqa: E402
++
 +pytestmark = [pytest.mark.slow, pytest.mark.online]
++
++
 +_REAL_STORE_PATH = (
 +    Path.home() / ".dlm" / "store" / "01KPPFAB2Z6DWCWY0QV702TSTX" / "adapter" / "versions" / "v0001"
 +)
++
++
 +def test_real_undertrained_dlm_store_fails(tmp_path: Path) -> None:
 +    """If a known dlm-trained undertrained adapter is on disk, the
 +    probe must FAIL on it.
++
 +    Skipped on machines without the local fixture (CI). The store
 +    was the ground-truth artifact that drove the sprint design — it
 +    was a real ``--max-steps 2`` smoke-test run.
 +    """
 +    if not (_REAL_STORE_PATH / "training_state.pt").exists():
 +        pytest.skip(
 +            f"no dlm store fixture at {_REAL_STORE_PATH} — skipping the "
 +            "real-adapter prove-the-value test (synthetic test below "
 +            "still runs)"
 +        )
++
 +    probe, spec = build_probe(
 +        {
 +            "name": "gg_real",
 +            "kind": "gradient_ghost",
 +            "adapter_path": str(_REAL_STORE_PATH),
 +        }
 +    )
 +    result = probe.run(spec, RunContext())
++
 +    assert result.verdict == Verdict.FAIL, (
 +        f"expected FAIL on a known-undertrained dlm store, got {result.verdict}: {result.message}"
 +    )
 +    # The real fixture is global_step=2 — a clean primary-signal hit.
 +    assert result.evidence["global_step"] < 50
 +    assert result.evidence["primary_signal"] in (
 +        "global_step_below_threshold",
 +        "all_optimizer_state_nan",
 +    )
++
++
 +def _build_converged_fixture(adapter_dir: Path) -> int:
 +    """Write a synthetic 'converged' adapter pair.
++
 +    - safetensors with realistic per-layer LoRA tensor names
 +    - training_state.pt with global_step=500 (well above threshold)
 +      and a flat per-param exp_avg_sq distribution (no layer
 +      crosses the per-layer ratio).
 +    """
 +    adapter_dir.mkdir(parents=True, exist_ok=True)
 +    num_layers = 4
 +    target_modules = ("q_proj", "v_proj")
 +    rank = 8
 +    in_features = 64
++
 +    weights: dict[str, np.ndarray] = {}
 +    for layer_idx in range(num_layers):
 +        for mod in target_modules:
 +            base = f"base_model.model.model.layers.{layer_idx}.self_attn.{mod}"
 +            weights[f"{base}.lora_A.weight"] = np.zeros((rank, in_features), dtype=np.float32)
 +            weights[f"{base}.lora_B.weight"] = np.zeros((in_features, rank), dtype=np.float32)
 +    safetensors_numpy.save_file(weights, str(adapter_dir / "adapter_model.safetensors"))
 +    num_keys = len(weights)
++
 +    # Flat distribution: every param's exp_avg_sq is 0.1 (a small but
 +    # finite value typical of a converged Adam state).
 +    state_dict: dict[int, dict[str, object]] = {}
 +    for pid in range(num_keys):
 +        state_dict[pid] = {
 +            "step": torch.tensor(500.0),
 +            "exp_avg": torch.zeros((4,), dtype=torch.float32),
 +            "exp_avg_sq": torch.full((4,), 0.1, dtype=torch.float32),
 +        }
++
 +    payload = {
 +        "optimizer_state_dict": {
 +            "state": state_dict,
 +            "param_groups": [{"lr": 1e-4, "params": list(range(num_keys))}],
 +        },
 +        "scheduler_state_dict": {},
 +        "scaler_state_dict": None,
 +        "torch_rng_state": torch.zeros(8, dtype=torch.uint8),
 +        "cuda_rng_state": None,
 +        "numpy_rng_state": None,
 +        "python_random_state": None,
 +        "global_step": 500,
 +        "epoch": 5.0,
 +        "best_val_loss": 0.42,
 +        "dlm_manifest_hash": None,
 +        "base_model_revision": "synthetic-test-fixture",
 +        "pinned_versions": {"torch": "2.11.0"},
 +        "use_qlora": False,
 +    }
 +    torch.save(payload, str(adapter_dir / "training_state.pt"))
 +    return num_keys
++
++
 +def test_synthetic_converged_adapter_passes(tmp_path: Path) -> None:
 +    """A hand-rolled converged training_state (global_step=500, flat
 +    exp_avg_sq distribution) must PASS.
++
 +    Together with the real-store FAIL test above, covers the
 +    sprint's prove-the-value: 'undertrained → FAIL, converged → PASS'.
 +    """
 +    adapter_dir = tmp_path / "synthetic-converged"
 +    _build_converged_fixture(adapter_dir)
++
 +    probe, spec = build_probe(
 +        {
 +            "name": "gg_synth",
 +            "kind": "gradient_ghost",
 +            "adapter_path": str(adapter_dir),
 +        }
 +    )
 +    result = probe.run(spec, RunContext())
++
 +    assert result.verdict == Verdict.PASS, (
 +        f"expected PASS on a synthetic converged adapter, got {result.verdict}: {result.message}"
 +    )
 +    assert result.evidence["global_step"] == 500
 +    assert result.evidence["frac_layers_undertrained"] == 0.0
 +    assert result.evidence["num_layers"] == 4
++
++
 +def test_runner_skips_backend_for_pure_pre_run_suite(tmp_path: Path) -> None:
 +    """End-to-end: a suite containing only gradient_ghost runs
 +    successfully with backend=None. Confirms the S25 P5 runner
 +    contract holds end-to-end (not just at the probe level)."""
 +    from dlm_sway.core.model import ModelSpec
 +    from dlm_sway.suite.runner import run as run_suite
 +    from dlm_sway.suite.spec import SuiteDefaults, SuiteModels, SwaySpec
++
 +    adapter_dir = tmp_path / "synthetic-converged"
 +    _build_converged_fixture(adapter_dir)
++
 +    spec = SwaySpec(
 +        version=1,
 +        models=SuiteModels(
 +            base=ModelSpec(base="dummy", kind="dummy"),
 +            ft=ModelSpec(base="dummy", kind="dummy", adapter=adapter_dir),
 +        ),
 +        defaults=SuiteDefaults(seed=0),
 +        suite=[
 +            {
 +                "name": "gg",
 +                "kind": "gradient_ghost",
 +                "adapter_path": str(adapter_dir),
 +            },
 +        ],
 +    )
 +    result = run_suite(spec, backend=None, spec_path="<integration>")
 +    assert len(result.probes) == 1
 +    assert result.probes[0].verdict == Verdict.PASS
 +    # No backend, no backend stats.
 +    assert result.backend_stats == {}