Python · 6224 bytes Raw Blame History
1 """Integration test: ``cluster_kl`` end-to-end on a real tiny model.
2
3 Mirrors ``test_external_perplexity_e2e`` — the sprint file for S16
4 explicitly lists this as a DoD item but the fixture was never shipped.
5
6 The test:
7
8 1. Builds a small random LoRA on SmolLM2-135M (same template as
9 ``test_external_perplexity_e2e``).
2. Runs ``cluster_kl`` with a 16-prompt two-topic set (animals +
   programming), splitting the ft signal across topics so that the
   specificity ratio has a chance to be meaningfully non-0.5.
13 3. Asserts the probe terminates in a non-ERROR verdict, the specificity
14 is finite and in ``[0, 1]``, and when preceded by ``null_adapter`` in
15 a suite the z-score field is populated.
16
17 Needs the ``[semsim]`` extra at runtime (sentence-transformers +
18 scikit-learn). We assume integration runners install those; skip
19 gracefully when they don't.
20 """
21
22 from __future__ import annotations
23
24 import math
25 from collections.abc import Iterator
26 from pathlib import Path
27
28 import pytest
29
30 from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
31 from dlm_sway.core.model import ModelSpec
32 from dlm_sway.core.result import Verdict
33 from dlm_sway.probes.base import RunContext, build_probe
34 from dlm_sway.suite.runner import run as run_suite
35 from dlm_sway.suite.spec import SwaySpec
36
# Module-level pytest marks: every test collected from this file is tagged
# both ``slow`` and ``online``.
pytestmark = [
    pytest.mark.slow,
    pytest.mark.online,
]
38
39
# Topic A: eight prompts about animals.
_ANIMAL_PROMPTS = (
    "The cat chased the mouse around the house.",
    "Dogs wag their tails when they are happy.",
    "Elephants never forget a face they have seen.",
    "Lions hunt in packs called prides.",
    "Horses gallop across open fields.",
    "Sharks have rows of sharp teeth.",
    "Bees pollinate flowers as they gather nectar.",
    "Owls hunt small rodents at night.",
)

# Topic B: eight prompts about programming.
_PROGRAMMING_PROMPTS = (
    "Write a Python decorator that logs every call.",
    "Implement binary search in Rust.",
    "Debug a segmentation fault in C++ pointer arithmetic.",
    "Explain ownership semantics in Rust.",
    "Refactor this JavaScript callback hell into promises.",
    "Optimize the SQL query by adding an index.",
    "Profile the memory usage of a Go program.",
    "Write unit tests for a REST API endpoint.",
)

# The full 16-prompt set: an 8/8 split across two obvious topics, animals
# first and programming second.
_PROMPTS = [*_ANIMAL_PROMPTS, *_PROGRAMMING_PROMPTS]
61
62
def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None:
    """Create a LoRA adapter with random ``lora_B`` weights and save it.

    Loads the tokenizer and causal-LM found in ``base_dir``, wraps the model
    in a rank-8 LoRA targeting ``q_proj`` / ``v_proj``, overwrites every
    ``lora_B`` parameter with seeded Gaussian noise scaled by 0.05, and
    writes both the adapter and the tokenizer into ``out_dir``.

    Args:
        base_dir: Directory holding the base model and tokenizer.
        out_dir: Directory that receives the saved adapter and tokenizer.
    """
    # Heavy dependencies are imported lazily, inside the helper.
    import torch
    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM, AutoTokenizer

    base_path = str(base_dir)
    out_path = str(out_dir)

    # Seed before any weights are created so the adapter is reproducible.
    torch.manual_seed(0)

    tok = AutoTokenizer.from_pretrained(base_path)
    if tok.pad_token_id is None:
        # No pad token defined: reuse the end-of-sequence token.
        tok.pad_token = tok.eos_token

    lora_model = get_peft_model(
        AutoModelForCausalLM.from_pretrained(base_path, torch_dtype=torch.float32),
        LoraConfig(
            r=8,
            lora_alpha=16,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=0.0,
            bias="none",
            task_type="CAUSAL_LM",
        ),
    )

    # Replace each ``lora_B`` matrix with small random values.
    # NOTE(review): presumably needed because PEFT's default init leaves
    # ``lora_B`` at zero, making a fresh adapter a no-op; confirm against
    # the PEFT docs.
    with torch.no_grad():
        lora_b_weights = (
            weight
            for param_name, weight in lora_model.named_parameters()
            if "lora_B" in param_name
        )
        for weight in lora_b_weights:
            weight.copy_(torch.randn_like(weight) * 0.05)

    lora_model.save_pretrained(out_path)
    tok.save_pretrained(out_path)
88
89
@pytest.fixture(scope="module")
def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
    """Module-scoped path to a freshly built random LoRA adapter.

    The adapter is written into a new temporary directory created through
    ``tmp_path_factory`` and built from the model in ``tiny_model_dir``.
    """
    target = tmp_path_factory.mktemp("cluster-kl-random-adapter")
    _build_random_lora_adapter(base_dir=tiny_model_dir, out_dir=target)
    return target
95
96
@pytest.fixture(scope="module")
def hf_backend(
    tiny_model_dir: Path, random_adapter: Path
) -> Iterator[HuggingFaceDifferentialBackend]:
    """Module-scoped differential backend over the tiny model plus adapter.

    The base model is described as an ``hf`` model in fp32 on CPU; the
    adapter comes from the ``random_adapter`` fixture. The backend is
    closed once the fixture is torn down.
    """
    base_model_spec = ModelSpec(
        base=str(tiny_model_dir),
        kind="hf",
        dtype="fp32",
        device="cpu",
    )
    differential = HuggingFaceDifferentialBackend(
        base_spec=base_model_spec,
        adapter_path=random_adapter,
    )
    yield differential
    # Teardown: release whatever the backend holds.
    differential.close()
107
108
def test_probe_runs_on_real_backend(hf_backend: HuggingFaceDifferentialBackend) -> None:
    """A standalone ``cluster_kl`` probe completes on the real HF backend.

    Skipped when ``sklearn`` or ``sentence_transformers`` is not importable.
    """
    for optional_module in ("sklearn", "sentence_transformers"):
        pytest.importorskip(optional_module)

    probe_config = {
        "name": "ck",
        "kind": "cluster_kl",
        "prompts": _PROMPTS,
        "num_clusters": 2,
        "min_prompts": 16,
    }
    probe, spec = build_probe(probe_config)
    outcome = probe.run(spec, RunContext(backend=hf_backend))

    assert outcome.verdict != Verdict.ERROR, f"probe errored: {outcome.message}"

    # With a small random LoRA the specificity's direction is unknown, so
    # only require a finite value inside [0, 1].
    specificity = outcome.raw
    assert specificity is not None
    assert math.isfinite(specificity)
    assert 0.0 <= specificity <= 1.0

    evidence = outcome.evidence
    assert evidence["num_clusters"] == 2
    assert evidence["num_prompts"] == 16
    assert len(evidence["per_cluster_mean_kl"]) == 2
135
136
def test_null_calibration_lights_up_zscore(
    hf_backend: HuggingFaceDifferentialBackend,
) -> None:
    """Running ``null_adapter`` before ``cluster_kl`` yields a finite z-score.

    Skipped when ``sklearn`` or ``sentence_transformers`` is not importable.
    """
    for optional_module in ("sklearn", "sentence_transformers"):
        pytest.importorskip(optional_module)

    null_probe = {"name": "null", "kind": "null_adapter", "runs": 2, "cache": False}
    cluster_probe = {
        "name": "ck",
        "kind": "cluster_kl",
        "prompts": _PROMPTS,
        "num_clusters": 2,
        "min_prompts": 16,
        # Deliberately permissive threshold: only z-score population matters.
        "assert_z_gte": -100.0,
    }
    suite_payload = {
        "version": 1,
        "models": {
            "base": {"base": "placeholder"},
            "ft": {"base": "placeholder", "adapter": "/tmp/placeholder"},
        },
        "suite": [null_probe, cluster_probe],
    }
    suite_spec = SwaySpec.model_validate(suite_payload)

    report = run_suite(suite_spec, hf_backend)

    assert len(report.probes) == 2
    null_outcome = report.probes[0]
    ck_outcome = report.probes[1]

    assert null_outcome.verdict == Verdict.PASS
    assert ck_outcome.verdict != Verdict.ERROR
    assert ck_outcome.z_score is not None, (
        "cluster_kl should have z-scored against null baseline; "
        f"evidence={ck_outcome.evidence}, message={ck_outcome.message}"
    )
    assert math.isfinite(ck_outcome.z_score)