"""Integration test: ``cluster_kl`` end-to-end on a real tiny model. Mirrors ``test_external_perplexity_e2e`` — the sprint file for S16 explicitly lists this as a DoD item but the fixture was never shipped. The test: 1. Builds a small random LoRA on SmolLM2-135M (same template as ``test_external_perplexity_e2e``). 2. Runs ``cluster_kl`` with a 16-prompt two-topic set (animals + programming) — split the ft signal across topics so the specificity ratio has a chance to be meaningfully non-0.5. 3. Asserts the probe terminates in a non-ERROR verdict, the specificity is finite and in ``[0, 1]``, and when preceded by ``null_adapter`` in a suite the z-score field is populated. Needs the ``[semsim]`` extra at runtime (sentence-transformers + scikit-learn). We assume integration runners install those; skip gracefully when they don't. """ from __future__ import annotations import math from collections.abc import Iterator from pathlib import Path import pytest from dlm_sway.backends.hf import HuggingFaceDifferentialBackend from dlm_sway.core.model import ModelSpec from dlm_sway.core.result import Verdict from dlm_sway.probes.base import RunContext, build_probe from dlm_sway.suite.runner import run as run_suite from dlm_sway.suite.spec import SwaySpec pytestmark = [pytest.mark.slow, pytest.mark.online] # 16 prompts split 8/8 across two obvious topics. 
_PROMPTS = [
    # Animals (topic A)
    "The cat chased the mouse around the house.",
    "Dogs wag their tails when they are happy.",
    "Elephants never forget a face they have seen.",
    "Lions hunt in packs called prides.",
    "Horses gallop across open fields.",
    "Sharks have rows of sharp teeth.",
    "Bees pollinate flowers as they gather nectar.",
    "Owls hunt small rodents at night.",
    # Programming (topic B)
    "Write a Python decorator that logs every call.",
    "Implement binary search in Rust.",
    "Debug a segmentation fault in C++ pointer arithmetic.",
    "Explain ownership semantics in Rust.",
    "Refactor this JavaScript callback hell into promises.",
    "Optimize the SQL query by adding an index.",
    "Profile the memory usage of a Go program.",
    "Write unit tests for a REST API endpoint.",
]


def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None:
    """Save a randomly perturbed LoRA adapter (plus tokenizer) into ``out_dir``.

    Loads the tokenizer and fp32 causal-LM weights found in ``base_dir``,
    wraps the model with a rank-8 LoRA on ``q_proj`` / ``v_proj``, overwrites
    every ``lora_B`` parameter with Gaussian noise scaled by 0.05, and writes
    the adapter and tokenizer to ``out_dir``.
    """
    # Heavy ML dependencies are imported inside the helper rather than at
    # module import time.
    import torch
    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Seed before anything that may draw random numbers so the adapter is
    # reproducible across runs.
    torch.manual_seed(0)

    base_path = str(base_dir)
    tokenizer = AutoTokenizer.from_pretrained(base_path)
    if tokenizer.pad_token_id is None:
        # Fall back to EOS as the padding token when the tokenizer has none.
        tokenizer.pad_token = tokenizer.eos_token
    base_model = AutoModelForCausalLM.from_pretrained(base_path, torch_dtype=torch.float32)

    lora_cfg = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.0,
        bias="none",
        task_type="CAUSAL_LM",
    )
    peft_model = get_peft_model(base_model, lora_cfg)

    # NOTE(review): presumably needed because a freshly created LoRA starts
    # with all-zero ``lora_B`` and would leave the base model unchanged —
    # confirm against the PEFT docs.
    with torch.no_grad():
        for param_name, weight in peft_model.named_parameters():
            if "lora_B" not in param_name:
                continue
            weight.copy_(torch.randn_like(weight) * 0.05)

    save_path = str(out_dir)
    peft_model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)


@pytest.fixture(scope="module")
def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
    """Build the random LoRA adapter once per module and return its directory.

    ``tiny_model_dir`` is a fixture supplied from outside this file.
    """
    target_dir = tmp_path_factory.mktemp("cluster-kl-random-adapter")
    _build_random_lora_adapter(tiny_model_dir, target_dir)
    return target_dir


@pytest.fixture(scope="module")
def hf_backend(
    tiny_model_dir: Path, random_adapter: Path
) -> Iterator[HuggingFaceDifferentialBackend]:
    """Yield a CPU / fp32 differential backend for the tiny model and its adapter.

    The backend is shared by every test in the module and closed on teardown.
    """
    base_spec = ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu")
    backend = HuggingFaceDifferentialBackend(base_spec=base_spec, adapter_path=random_adapter)
    yield backend
    backend.close()


def test_probe_runs_on_real_backend(hf_backend: HuggingFaceDifferentialBackend) -> None:
    """``cluster_kl`` run on its own finishes without ERROR and reports sane numbers."""
    pytest.importorskip("sklearn")
    pytest.importorskip("sentence_transformers")

    probe_cfg = {
        "name": "ck",
        "kind": "cluster_kl",
        "prompts": _PROMPTS,
        "num_clusters": 2,
        "min_prompts": 16,
    }
    probe, spec = build_probe(probe_cfg)
    result = probe.run(spec, RunContext(backend=hf_backend))

    assert result.verdict != Verdict.ERROR, f"probe errored: {result.message}"

    # Under a small random LoRA we don't know the specificity sign;
    # just pin that it's finite and in [0, 1].
    specificity = result.raw
    assert specificity is not None
    assert math.isfinite(specificity)
    assert 0.0 <= specificity <= 1.0

    evidence = result.evidence
    assert evidence["num_clusters"] == 2
    assert evidence["num_prompts"] == 16
    assert len(evidence["per_cluster_mean_kl"]) == 2


def test_null_calibration_lights_up_zscore(
    hf_backend: HuggingFaceDifferentialBackend,
) -> None:
    """null_adapter → cluster_kl produces a z_score end-to-end."""
    pytest.importorskip("sklearn")
    pytest.importorskip("sentence_transformers")

    # NOTE(review): the model entries look like schema filler, since the live
    # backend is handed to ``run_suite`` directly — confirm they are unused.
    models = {
        "base": {"base": "placeholder"},
        "ft": {"base": "placeholder", "adapter": "/tmp/placeholder"},
    }
    null_step = {"name": "null", "kind": "null_adapter", "runs": 2, "cache": False}
    cluster_step = {
        "name": "ck",
        "kind": "cluster_kl",
        "prompts": _PROMPTS,
        "num_clusters": 2,
        "min_prompts": 16,
        "assert_z_gte": -100.0,  # permissive — just want z populated
    }
    raw_spec = SwaySpec.model_validate(
        {
            "version": 1,
            "models": models,
            "suite": [null_step, cluster_step],
        }
    )

    result = run_suite(raw_spec, hf_backend)

    assert len(result.probes) == 2
    null_result, ck_result = result.probes

    assert null_result.verdict == Verdict.PASS
    assert ck_result.verdict != Verdict.ERROR
    assert ck_result.z_score is not None, (
        f"cluster_kl should have z-scored against null baseline; "
        f"evidence={ck_result.evidence}, message={ck_result.message}"
    )
    assert math.isfinite(ck_result.z_score)