`332e32f`

tests/integration: end-to-end multi-turn smoke on a tiny LoRA

Authored by mfwolffe <wolffemf@dukes.jmu.edu> 2 weeks ago

SHA: 332e32f298f2d628e36d10db406e968777dd9ba8
Parents: d955dfd
Tree: 15180a1

1 changed file

Status	File	+	-
A	`tests/integration/test_probe_multi_turn_coherence.py`	109	0

tests/integration/test_probe_multi_turn_coherence.py added

 +"""Integration test: multi_turn_coherence_decay end-to-end on a tiny LoRA.
++
 +Builds a tiny random LoRA on SmolLM2-135M-Instruct (which has a real
 +chat_template) and runs the probe through 3 turns of synthetic
 +dialogue. The intent isn't to assert specific KL values — they
 +depend on the random adapter — but to exercise the full code path
 +on a real backend so a regression in the chat-template wiring,
 +turn-loop, or curve-fit plumbing surfaces in slow CI.
++
 +Marked ``slow + online``.
 +"""
++
 +from __future__ import annotations
++
 +from pathlib import Path
++
 +import pytest
++
 +# Module-level marks: pytest applies both ``slow`` and ``online`` to every
 +# test in this file, so runs that deselect either marker skip it.
 +pytestmark = [pytest.mark.slow, pytest.mark.online]
++
++
 +def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None:
 +    """Same shape as the other slow-lane backend tests."""
 +    import torch
 +    from peft import LoraConfig, get_peft_model
 +    from transformers import AutoModelForCausalLM, AutoTokenizer
++
 +    torch.manual_seed(0)
 +    tokenizer = AutoTokenizer.from_pretrained(str(base_dir))
 +    if tokenizer.pad_token_id is None:
 +        tokenizer.pad_token = tokenizer.eos_token
 +    base = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32)
 +    cfg = LoraConfig(
 +        r=8,
 +        lora_alpha=16,
 +        target_modules=["q_proj", "v_proj"],
 +        lora_dropout=0.0,
 +        bias="none",
 +        task_type="CAUSAL_LM",
 +    )
 +    peft_model = get_peft_model(base, cfg)
 +    with torch.no_grad():
 +        for name, param in peft_model.named_parameters():
 +            if "lora_B" in name:
 +                # Tiny perturbation — base != ft, but generations stay sane
 +                # enough to thread through 4 dialogue turns without
 +                # collapsing to junk.
 +                param.copy_(torch.randn_like(param) * 0.02)
 +    peft_model.save_pretrained(str(out_dir))
 +    tokenizer.save_pretrained(str(out_dir))
++
++
 +@pytest.fixture(scope="module")
 +def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
 +    out = tmp_path_factory.mktemp("multi-turn-coherence-adapter")
 +    _build_random_lora_adapter(tiny_model_dir, out)
 +    return out
++
++
 +def test_probe_runs_end_to_end_on_real_adapter(tiny_model_dir: Path, random_adapter: Path) -> None:
 +    """Smoke: HF backend + chat_template-equipped tokenizer + real
 +    multi-turn dialogue produces a finalized result with the
 +    documented evidence keys + finite per-turn KLs."""
 +    from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
 +    from dlm_sway.core.model import ModelSpec
 +    from dlm_sway.core.result import Verdict
 +    from dlm_sway.probes.base import RunContext, build_probe
++
 +    backend = HuggingFaceDifferentialBackend(
 +        base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"),
 +        adapter_path=random_adapter,
 +    )
 +    try:
 +        probe, spec = build_probe(
 +            {
 +                "name": "mtc_smoke",
 +                "kind": "multi_turn_coherence_decay",
 +                "prompts": [
 +                    "What's the difference between TCP and UDP?",
 +                    "Explain how a neural network learns.",
 +                ],
 +                "max_turns": 3,
 +                "max_new_tokens": 32,  # keep CPU runtime under control
 +            }
 +        )
 +        ctx = RunContext(backend=backend, seed=0, top_k=64)
 +        result = probe.run(spec, ctx)
 +    finally:
 +        backend.close()
++
 +    # Shape: any verdict that isn't ERROR is fine. We don't pin
 +    # PASS/FAIL because the random adapter's actual decay shape isn't
 +    # under our control.
 +    assert result.verdict in {
 +        Verdict.PASS,
 +        Verdict.FAIL,
 +        Verdict.WARN,
 +    }, result.message
 +    assert result.evidence["max_turns"] == 3
 +    assert result.evidence["num_prompts"] == 2
 +    per_turn = result.evidence["per_turn_kls"]
 +    assert len(per_turn) == 2  # max_turns - 1
 +    for kl in per_turn:
 +        assert isinstance(kl, float)
 +        assert kl >= 0.0  # KL is non-negative
 +    assert result.evidence["fit_status"] in {"ok", "stable", "non_monotonic", "degenerate"}
 +    sparkline = result.evidence["sparkline"]
 +    assert isinstance(sparkline, str)
 +    assert len(sparkline) == 2