`d9dcb5d`

tests/integration: cluster_kl end-to-end on SmolLM2-135M + random LoRA (F02)

Authored by

espadonne 3 weeks ago

SHA: d9dcb5dc1c63700e6af9691e686aa0bd20312a78
Parents: 5cf3724
Tree: 0f39782

1 changed file

Status	File	+	-
A	`tests/integration/test_cluster_kl_e2e.py`	174	0

tests/integration/test_cluster_kl_e2e.py added

 +"""Integration test: ``cluster_kl`` end-to-end on a real tiny model.
++
 +Mirrors ``test_external_perplexity_e2e`` — the sprint file for S16
 +explicitly lists this as a DoD item but the fixture was never shipped.
++
 +The test:
++
 +1. Builds a small random LoRA on SmolLM2-135M (same template as
 +   ``test_external_perplexity_e2e``).
 +2. Runs ``cluster_kl`` with a 16-prompt two-topic set (animals +
 +   programming), splitting the fine-tuned (ft) signal across topics so the
 +   specificity ratio has a chance to differ meaningfully from 0.5.
 +3. Asserts the probe terminates in a non-ERROR verdict, the specificity
 +   is finite and in ``[0, 1]``, and when preceded by ``null_adapter`` in
 +   a suite the z-score field is populated.
++
 +Needs the ``[semsim]`` extra at runtime (sentence-transformers +
 +scikit-learn). We assume integration runners install those; skip
 +gracefully when they don't.
 +"""
++
 +from __future__ import annotations
++
 +import math
 +from collections.abc import Iterator
 +from pathlib import Path
++
 +import pytest
++
 +from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
 +from dlm_sway.core.model import ModelSpec
 +from dlm_sway.core.result import Verdict
 +from dlm_sway.probes.base import RunContext, build_probe
 +from dlm_sway.suite.runner import run as run_suite
 +from dlm_sway.suite.spec import SwaySpec
++
 +# Module-level pytest marks: every test in this file carries ``slow`` and ``online``.
 +pytestmark = [pytest.mark.slow, pytest.mark.online]
++
++
 +# 16 prompts split 8/8 across two obvious topics.
 +# Both tests in this module pass the list as the ``prompts`` field of a
 +# ``cluster_kl`` probe configured with ``num_clusters=2`` and ``min_prompts=16``.
 +_PROMPTS = [
 +    # Animals (topic A)
 +    "The cat chased the mouse around the house.",
 +    "Dogs wag their tails when they are happy.",
 +    "Elephants never forget a face they have seen.",
 +    "Lions hunt in packs called prides.",
 +    "Horses gallop across open fields.",
 +    "Sharks have rows of sharp teeth.",
 +    "Bees pollinate flowers as they gather nectar.",
 +    "Owls hunt small rodents at night.",
 +    # Programming (topic B)
 +    "Write a Python decorator that logs every call.",
 +    "Implement binary search in Rust.",
 +    "Debug a segmentation fault in C++ pointer arithmetic.",
 +    "Explain ownership semantics in Rust.",
 +    "Refactor this JavaScript callback hell into promises.",
 +    "Optimize the SQL query by adding an index.",
 +    "Profile the memory usage of a Go program.",
 +    "Write unit tests for a REST API endpoint.",
 +]
++
++
 +def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None:
 +    import torch
 +    from peft import LoraConfig, get_peft_model
 +    from transformers import AutoModelForCausalLM, AutoTokenizer
++
 +    torch.manual_seed(0)
 +    tokenizer = AutoTokenizer.from_pretrained(str(base_dir))
 +    if tokenizer.pad_token_id is None:
 +        tokenizer.pad_token = tokenizer.eos_token
 +    base = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32)
 +    cfg = LoraConfig(
 +        r=8,
 +        lora_alpha=16,
 +        target_modules=["q_proj", "v_proj"],
 +        lora_dropout=0.0,
 +        bias="none",
 +        task_type="CAUSAL_LM",
 +    )
 +    peft_model = get_peft_model(base, cfg)
 +    with torch.no_grad():
 +        for name, param in peft_model.named_parameters():
 +            if "lora_B" in name:
 +                param.copy_(torch.randn_like(param) * 0.05)
 +    peft_model.save_pretrained(str(out_dir))
 +    tokenizer.save_pretrained(str(out_dir))
++
++
 +@pytest.fixture(scope="module")
 +def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
 +    adapter_dir = tmp_path_factory.mktemp("cluster-kl-random-adapter")
 +    _build_random_lora_adapter(tiny_model_dir, adapter_dir)
 +    return adapter_dir
++
++
 +@pytest.fixture(scope="module")
 +def hf_backend(
 +    tiny_model_dir: Path, random_adapter: Path
 +) -> Iterator[HuggingFaceDifferentialBackend]:
 +    backend = HuggingFaceDifferentialBackend(
 +        base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"),
 +        adapter_path=random_adapter,
 +    )
 +    yield backend
 +    backend.close()
++
++
 +def test_probe_runs_on_real_backend(hf_backend: HuggingFaceDifferentialBackend) -> None:
 +    pytest.importorskip("sklearn")
 +    pytest.importorskip("sentence_transformers")
++
 +    probe, spec = build_probe(
 +        {
 +            "name": "ck",
 +            "kind": "cluster_kl",
 +            "prompts": _PROMPTS,
 +            "num_clusters": 2,
 +            "min_prompts": 16,
 +        }
 +    )
 +    ctx = RunContext(backend=hf_backend)
 +    result = probe.run(spec, ctx)
++
 +    assert result.verdict != Verdict.ERROR, f"probe errored: {result.message}"
 +    # Under a small random LoRA we don't know the specificity sign;
 +    # just pin that it's finite and in [0, 1].
 +    assert result.raw is not None
 +    assert math.isfinite(result.raw)
 +    assert 0.0 <= result.raw <= 1.0
 +    assert result.evidence["num_clusters"] == 2
 +    assert result.evidence["num_prompts"] == 16
 +    per_cluster = result.evidence["per_cluster_mean_kl"]
 +    assert len(per_cluster) == 2
++
++
 +def test_null_calibration_lights_up_zscore(
 +    hf_backend: HuggingFaceDifferentialBackend,
 +) -> None:
 +    """null_adapter → cluster_kl produces a z_score end-to-end."""
 +    pytest.importorskip("sklearn")
 +    pytest.importorskip("sentence_transformers")
++
 +    raw_spec = SwaySpec.model_validate(
 +        {
 +            "version": 1,
 +            "models": {
 +                "base": {"base": "placeholder"},
 +                "ft": {"base": "placeholder", "adapter": "/tmp/placeholder"},
 +            },
 +            "suite": [
 +                {"name": "null", "kind": "null_adapter", "runs": 2, "cache": False},
 +                {
 +                    "name": "ck",
 +                    "kind": "cluster_kl",
 +                    "prompts": _PROMPTS,
 +                    "num_clusters": 2,
 +                    "min_prompts": 16,
 +                    "assert_z_gte": -100.0,  # permissive — just want z populated
 +                },
 +            ],
 +        }
 +    )
 +    result = run_suite(raw_spec, hf_backend)
 +    assert len(result.probes) == 2
 +    null_result = result.probes[0]
 +    ck_result = result.probes[1]
 +    assert null_result.verdict == Verdict.PASS
 +    assert ck_result.verdict != Verdict.ERROR
 +    assert ck_result.z_score is not None, (
 +        f"cluster_kl should have z-scored against null baseline; "
 +        f"evidence={ck_result.evidence}, message={ck_result.message}"
 +    )
 +    assert math.isfinite(ck_result.z_score)