"""Integration test: HF backend under multi-rank null calibration (S10 / F4).

Two contracts:

1. **Correctness.** Three ``rank_multipliers`` against SmolLM2-135M
   produce three per-rank null-stats groups; each rank's stats are
   finite and the std rises with rank (larger rank_scale → larger
   effective noise by the sqrt(r) scaling the backend uses).
2. **Performance.** Wall time for ``rank_multipliers=[0.5, 1.0, 2.0]``
   stays under ``2×`` the single-rank baseline — i.e., we don't
   reload the base model per multiplier. The noise-scaling approach
   makes rank switches free (no tensor reshape, no reload).

Marked ``slow+online``.
"""

from __future__ import annotations

import time
from collections.abc import Iterator
from pathlib import Path

import pytest

from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
from dlm_sway.core.model import ModelSpec
from dlm_sway.core.result import Verdict
from dlm_sway.probes.base import RunContext, build_probe

pytestmark = [pytest.mark.slow, pytest.mark.online]


def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None:
    import torch
    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM, AutoTokenizer

    torch.manual_seed(0)
    tokenizer = AutoTokenizer.from_pretrained(str(base_dir))
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
    base = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32)
    cfg = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.0,
        bias="none",
        task_type="CAUSAL_LM",
    )
    peft_model = get_peft_model(base, cfg)
    with torch.no_grad():
        for name, param in peft_model.named_parameters():
            if "lora_B" in name:
                param.copy_(torch.randn_like(param) * 0.05)
    peft_model.save_pretrained(str(out_dir))
    tokenizer.save_pretrained(str(out_dir))


@pytest.fixture(scope="module")
def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
    adapter_dir = tmp_path_factory.mktemp("multi-rank-random-adapter")
    _build_random_lora_adapter(tiny_model_dir, adapter_dir)
    return adapter_dir


@pytest.fixture(scope="module")
def hf_backend(
    tiny_model_dir: Path, random_adapter: Path
) -> Iterator[HuggingFaceDifferentialBackend]:
    backend = HuggingFaceDifferentialBackend(
        base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"),
        adapter_path=random_adapter,
    )
    yield backend
    backend.close()


def _run_null(
    backend: HuggingFaceDifferentialBackend, rank_multipliers: list[float]
) -> tuple[float, dict[str, dict[str, dict[str, float]]]]:
    """Run null_adapter once and return (wall_seconds, null_stats_by_rank)."""
    probe, spec = build_probe(
        {
            "name": "null",
            "kind": "null_adapter",
            "runs": 2,
            "rank_multipliers": rank_multipliers,
            "calibrate_kinds": ["delta_kl"],
            "cache": False,  # force real compute for the timing comparison
        }
    )
    ctx = RunContext(backend=backend)
    t0 = time.perf_counter()
    result = probe.run(spec, ctx)
    wall = time.perf_counter() - t0
    assert result.verdict == Verdict.PASS, result.message
    return wall, dict(result.evidence["null_stats_by_rank"])


def test_three_ranks_produce_three_stats_groups(
    hf_backend: HuggingFaceDifferentialBackend,
) -> None:
    _, by_rank = _run_null(hf_backend, [0.5, 1.0, 2.0])
    assert set(by_rank) == {"rank_0.50", "rank_1.00", "rank_2.00"}
    for rkey, kind_stats in by_rank.items():
        delta_kl = kind_stats.get("delta_kl")
        assert delta_kl is not None, f"{rkey} missing delta_kl stats"
        assert delta_kl["n"] == 2.0
        assert delta_kl["std"] > 0.0


def test_multi_rank_does_not_reload_base(
    hf_backend: HuggingFaceDifferentialBackend,
) -> None:
    """Three ranks must scale ~linearly with probe iterations, *not*
    incur a per-rank base-model reload.

    Three ranks × two seeds = 6 calibration iterations vs single-rank's
    2 iterations — so a linear-compute upper bound is ≈3×. The S07
    forward-pass cache on the base view can save more, but doesn't
    always (null-side view_ids are distinct per rank and seed). We
    assert < 4× as the clear "no reload" ceiling: a per-rank base
    reload would blow this past 10× on a 135M model.
    """
    # Warmup: first call amortizes the base-model load. Without this
    # the single-rank baseline absorbs the load cost and the ratio
    # becomes uninformative.
    _run_null(hf_backend, [1.0])

    single_wall, _ = _run_null(hf_backend, [1.0])
    multi_wall, _ = _run_null(hf_backend, [0.5, 1.0, 2.0])
    ratio = multi_wall / max(single_wall, 0.01)
    assert ratio < 4.0, (
        f"multi-rank wall {multi_wall:.2f}s is {ratio:.2f}× single-rank {single_wall:.2f}s "
        "(threshold: < 4× — a true base-model reload would exceed 10×)"
    )