`88c59e5`

tests/integration: HF multi-rank null calibration — correctness + no-reload timing

Authored by

espadonne 3 weeks ago

SHA: 88c59e5500ec5572a4f7e39fc9e72d9c9faa8b0c
Parents: c5e9de2
Tree: c151bc1

1 changed file

Status	File	+	-
A	`tests/integration/test_hf_multi_rank_null.py`	137	0

tests/integration/test_hf_multi_rank_null.py (added)

 +"""Integration test: HF backend under multi-rank null calibration (S10 / F4).
++
 +Two contracts:
++
 +1. **Correctness.** Three ``rank_multipliers`` against SmolLM2-135M
 +   produce three per-rank null-stats groups; each rank's ``delta_kl``
 +   stats cover both runs (``n == 2``) and have a strictly positive
 +   std. The std is expected to rise with rank (larger rank_scale →
 +   larger effective noise by the sqrt(r) scaling the backend uses),
 +   but that ordering is not asserted here.
 +2. **Performance.** Wall time for ``rank_multipliers=[0.5, 1.0, 2.0]``
 +   stays under ``4×`` the single-rank baseline (three ranks × two
 +   runs is 3× the iterations, plus headroom) — i.e., we don't
 +   reload the base model per multiplier. The noise-scaling approach
 +   makes rank switches free (no tensor reshape, no reload).
++
 +Marked ``slow+online``.
 +"""
++
 +from __future__ import annotations
++
 +import time
 +from collections.abc import Iterator
 +from pathlib import Path
++
 +import pytest
++
 +from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
 +from dlm_sway.core.model import ModelSpec
 +from dlm_sway.core.result import Verdict
 +from dlm_sway.probes.base import RunContext, build_probe
++
 +pytestmark = [pytest.mark.slow, pytest.mark.online]
++
++
 +def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None:
 +    import torch
 +    from peft import LoraConfig, get_peft_model
 +    from transformers import AutoModelForCausalLM, AutoTokenizer
++
 +    torch.manual_seed(0)
 +    tokenizer = AutoTokenizer.from_pretrained(str(base_dir))
 +    if tokenizer.pad_token_id is None:
 +        tokenizer.pad_token = tokenizer.eos_token
 +    base = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32)
 +    cfg = LoraConfig(
 +        r=8,
 +        lora_alpha=16,
 +        target_modules=["q_proj", "v_proj"],
 +        lora_dropout=0.0,
 +        bias="none",
 +        task_type="CAUSAL_LM",
 +    )
 +    peft_model = get_peft_model(base, cfg)
 +    with torch.no_grad():
 +        for name, param in peft_model.named_parameters():
 +            if "lora_B" in name:
 +                param.copy_(torch.randn_like(param) * 0.05)
 +    peft_model.save_pretrained(str(out_dir))
 +    tokenizer.save_pretrained(str(out_dir))
++
++
 +@pytest.fixture(scope="module")
 +def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
 +    adapter_dir = tmp_path_factory.mktemp("multi-rank-random-adapter")
 +    _build_random_lora_adapter(tiny_model_dir, adapter_dir)
 +    return adapter_dir
++
++
 +@pytest.fixture(scope="module")
 +def hf_backend(
 +    tiny_model_dir: Path, random_adapter: Path
 +) -> Iterator[HuggingFaceDifferentialBackend]:
 +    backend = HuggingFaceDifferentialBackend(
 +        base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"),
 +        adapter_path=random_adapter,
 +    )
 +    yield backend
 +    backend.close()
++
++
 +def _run_null(
 +    backend: HuggingFaceDifferentialBackend, rank_multipliers: list[float]
 +) -> tuple[float, dict[str, dict[str, dict[str, float]]]]:
 +    """Run null_adapter once and return (wall_seconds, null_stats_by_rank)."""
 +    probe, spec = build_probe(
 +        {
 +            "name": "null",
 +            "kind": "null_adapter",
 +            "runs": 2,
 +            "rank_multipliers": rank_multipliers,
 +            "calibrate_kinds": ["delta_kl"],
 +            "cache": False,  # force real compute for the timing comparison
 +        }
 +    )
 +    ctx = RunContext(backend=backend)
 +    t0 = time.perf_counter()
 +    result = probe.run(spec, ctx)
 +    wall = time.perf_counter() - t0
 +    assert result.verdict == Verdict.PASS, result.message
 +    return wall, dict(result.evidence["null_stats_by_rank"])
++
++
 +def test_three_ranks_produce_three_stats_groups(
 +    hf_backend: HuggingFaceDifferentialBackend,
 +) -> None:
 +    _, by_rank = _run_null(hf_backend, [0.5, 1.0, 2.0])
 +    assert set(by_rank) == {"rank_0.50", "rank_1.00", "rank_2.00"}
 +    for rkey, kind_stats in by_rank.items():
 +        delta_kl = kind_stats.get("delta_kl")
 +        assert delta_kl is not None, f"{rkey} missing delta_kl stats"
 +        assert delta_kl["n"] == 2.0
 +        assert delta_kl["std"] > 0.0
++
++
 +def test_multi_rank_does_not_reload_base(
 +    hf_backend: HuggingFaceDifferentialBackend,
 +) -> None:
 +    """Three ranks must scale ~linearly with probe iterations, *not*
 +    incur a per-rank base-model reload.
++
 +    Three ranks × two seeds = 6 calibration iterations vs single-rank's
 +    2 iterations — so a linear-compute upper bound is ≈3×. The S07
 +    forward-pass cache on the base view can save more, but doesn't
 +    always (null-side view_ids are distinct per rank and seed). We
 +    assert < 4× as the clear "no reload" ceiling: a per-rank base
 +    reload would blow this past 10× on a 135M model.
 +    """
 +    # Warmup: first call amortizes the base-model load. Without this
 +    # the single-rank baseline absorbs the load cost and the ratio
 +    # becomes uninformative.
 +    _run_null(hf_backend, [1.0])
++
 +    single_wall, _ = _run_null(hf_backend, [1.0])
 +    multi_wall, _ = _run_null(hf_backend, [0.5, 1.0, 2.0])
 +    ratio = multi_wall / max(single_wall, 0.01)
 +    assert ratio < 4.0, (
 +        f"multi-rank wall {multi_wall:.2f}s is {ratio:.2f}× single-rank {single_wall:.2f}s "
 +        "(threshold: < 4× — a true base-model reload would exceed 10×)"
 +    )