tenseleyflow/sway / d9dcb5d

Browse files

tests/integration: cluster_kl end-to-end on SmolLM2-135M + random LoRA (F02)

Authored by espadonne
SHA
d9dcb5dc1c63700e6af9691e686aa0bd20312a78
Parents
5cf3724
Tree
0f39782

1 changed file

Status | File | + | -
A tests/integration/test_cluster_kl_e2e.py 174 0
tests/integration/test_cluster_kl_e2e.py (added)
@@ -0,0 +1,174 @@
1
+"""Integration test: ``cluster_kl`` end-to-end on a real tiny model.
2
+
3
+Mirrors ``test_external_perplexity_e2e`` — the sprint file for S16
4
+explicitly lists this as a DoD item but the fixture was never shipped.
5
+
6
+The test:
7
+
8
+1. Builds a small random LoRA on SmolLM2-135M (same template as
9
+   ``test_external_perplexity_e2e``).
10
+2. Runs ``cluster_kl`` with a 16-prompt two-topic set (animals +
11
+   programming) — split the ft signal across topics so the specificity
12
+   ratio has a chance to be meaningfully non-0.5.
13
+3. Asserts the probe terminates in a non-ERROR verdict, the specificity
14
+   is finite and in ``[0, 1]``, and when preceded by ``null_adapter`` in
15
+   a suite the z-score field is populated.
16
+
17
+Needs the ``[semsim]`` extra at runtime (sentence-transformers +
18
+scikit-learn). We assume integration runners install those; skip
19
+gracefully when they don't.
20
+"""
21
+
22
+from __future__ import annotations
23
+
24
+import math
25
+from collections.abc import Iterator
26
+from pathlib import Path
27
+
28
+import pytest
29
+
30
+from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
31
+from dlm_sway.core.model import ModelSpec
32
+from dlm_sway.core.result import Verdict
33
+from dlm_sway.probes.base import RunContext, build_probe
34
+from dlm_sway.suite.runner import run as run_suite
35
+from dlm_sway.suite.spec import SwaySpec
36
+
37
# Module-wide markers: every test collected from this file is tagged
# ``slow`` and ``online`` so runners can select or deselect it with ``-m``.
pytestmark = [
    pytest.mark.slow,
    pytest.mark.online,
]
38
+
39
+
40
# Topic A: eight short statements about animals.
_ANIMAL_PROMPTS = [
    "The cat chased the mouse around the house.",
    "Dogs wag their tails when they are happy.",
    "Elephants never forget a face they have seen.",
    "Lions hunt in packs called prides.",
    "Horses gallop across open fields.",
    "Sharks have rows of sharp teeth.",
    "Bees pollinate flowers as they gather nectar.",
    "Owls hunt small rodents at night.",
]

# Topic B: eight programming tasks.
_PROGRAMMING_PROMPTS = [
    "Write a Python decorator that logs every call.",
    "Implement binary search in Rust.",
    "Debug a segmentation fault in C++ pointer arithmetic.",
    "Explain ownership semantics in Rust.",
    "Refactor this JavaScript callback hell into promises.",
    "Optimize the SQL query by adding an index.",
    "Profile the memory usage of a Go program.",
    "Write unit tests for a REST API endpoint.",
]

# 16 prompts split 8/8 across two obvious topics: animals first, then
# programming, in the order listed above.
_PROMPTS = [*_ANIMAL_PROMPTS, *_PROGRAMMING_PROMPTS]
61
+
62
+
63
def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None:
    """Write a randomly perturbed LoRA adapter for the model in *base_dir*.

    Loads the tokenizer and causal-LM weights from ``base_dir`` in float32,
    wraps the model in a rank-8 LoRA over ``q_proj`` / ``v_proj``, overwrites
    every ``lora_B`` parameter with seeded Gaussian noise scaled by 0.05, and
    saves both the adapter and the tokenizer into ``out_dir``.

    Args:
        base_dir: Directory holding the base model and its tokenizer.
        out_dir: Directory that receives the adapter and tokenizer files.
    """
    import torch
    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Seed first so the noise written below is the same on every run.
    torch.manual_seed(0)

    model_path = str(base_dir)
    tok = AutoTokenizer.from_pretrained(model_path)
    if tok.pad_token_id is None:
        # No pad token configured: reuse EOS for padding.
        tok.pad_token = tok.eos_token

    base_model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float32)
    lora_cfg = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.0,
        bias="none",
        task_type="CAUSAL_LM",
    )
    wrapped = get_peft_model(base_model, lora_cfg)

    # Only the ``lora_B`` matrices are randomised; everything else keeps
    # whatever initialisation ``get_peft_model`` gave it.
    # NOTE(review): presumably needed because a fresh LoRA is a no-op until
    # lora_B is non-zero — confirm against PEFT's default initialisation.
    with torch.no_grad():
        lora_b_weights = (
            weight for pname, weight in wrapped.named_parameters() if "lora_B" in pname
        )
        for weight in lora_b_weights:
            weight.copy_(torch.randn_like(weight) * 0.05)

    save_path = str(out_dir)
    wrapped.save_pretrained(save_path)
    tok.save_pretrained(save_path)
88
+
89
+
90
@pytest.fixture(scope="module")
def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
    """Build the random LoRA adapter once per module and return its directory.

    The adapter is written into a fresh ``cluster-kl-random-adapter``
    temporary directory created through ``tmp_path_factory``.
    ``tiny_model_dir`` is a fixture defined outside this file (presumably a
    local copy of the tiny base model — confirm in the shared conftest).
    """
    target_dir = tmp_path_factory.mktemp("cluster-kl-random-adapter")
    _build_random_lora_adapter(base_dir=tiny_model_dir, out_dir=target_dir)
    return target_dir
95
+
96
+
97
@pytest.fixture(scope="module")
def hf_backend(
    tiny_model_dir: Path, random_adapter: Path
) -> Iterator[HuggingFaceDifferentialBackend]:
    """Yield one differential HF backend shared by every test in this module.

    The backend pairs the base model in ``tiny_model_dir`` (fp32, CPU) with
    the adapter directory from ``random_adapter``. It is closed once the
    module's tests have finished with it.
    """
    base_model_spec = ModelSpec(
        base=str(tiny_model_dir),
        kind="hf",
        dtype="fp32",
        device="cpu",
    )
    differential = HuggingFaceDifferentialBackend(
        base_spec=base_model_spec,
        adapter_path=random_adapter,
    )
    yield differential
    # Teardown: runs after the last test that uses this module-scoped fixture.
    differential.close()
107
+
108
+
109
def test_probe_runs_on_real_backend(hf_backend: HuggingFaceDifferentialBackend) -> None:
    """Run a standalone ``cluster_kl`` probe and sanity-check its output.

    Skips when ``sklearn`` or ``sentence_transformers`` cannot be imported.
    Note that pytest resolves ``hf_backend`` before this body executes, so
    the skip fires only after the backend fixture has been built.
    """
    for extra_module in ("sklearn", "sentence_transformers"):
        pytest.importorskip(extra_module)

    probe_config = {
        "name": "ck",
        "kind": "cluster_kl",
        "prompts": _PROMPTS,
        "num_clusters": 2,
        "min_prompts": 16,
    }
    probe, spec = build_probe(probe_config)
    result = probe.run(spec, RunContext(backend=hf_backend))

    assert result.verdict != Verdict.ERROR, f"probe errored: {result.message}"

    # With a small random LoRA the direction of the specificity is unknown,
    # so only pin that the raw value exists, is finite, and lies in [0, 1].
    specificity = result.raw
    assert specificity is not None
    assert math.isfinite(specificity)
    assert 0.0 <= specificity <= 1.0

    # The evidence must echo the requested clustering shape.
    evidence = result.evidence
    assert evidence["num_clusters"] == 2
    assert evidence["num_prompts"] == 16
    assert len(evidence["per_cluster_mean_kl"]) == 2
135
+
136
+
137
def test_null_calibration_lights_up_zscore(
    hf_backend: HuggingFaceDifferentialBackend,
) -> None:
    """null_adapter → cluster_kl produces a z_score end-to-end.

    Runs a two-probe suite (``null_adapter`` followed by ``cluster_kl``) and
    checks that the null probe passes, the cluster probe does not error, and
    the cluster probe's ``z_score`` is populated and finite.
    """
    for extra_module in ("sklearn", "sentence_transformers"):
        pytest.importorskip(extra_module)

    null_probe = {"name": "null", "kind": "null_adapter", "runs": 2, "cache": False}
    cluster_probe = {
        "name": "ck",
        "kind": "cluster_kl",
        "prompts": _PROMPTS,
        "num_clusters": 2,
        "min_prompts": 16,
        # Permissive threshold: the goal is only to get z populated.
        "assert_z_gte": -100.0,
    }
    # NOTE(review): the model entries are placeholders; presumably they are
    # unused because a ready-made backend is handed to the runner — confirm.
    model_entries = {
        "base": {"base": "placeholder"},
        "ft": {"base": "placeholder", "adapter": "/tmp/placeholder"},
    }
    suite_spec = SwaySpec.model_validate(
        {
            "version": 1,
            "models": model_entries,
            "suite": [null_probe, cluster_probe],
        }
    )

    outcome = run_suite(suite_spec, hf_backend)
    assert len(outcome.probes) == 2
    # Results come back in suite order: null baseline first, then cluster_kl.
    null_result, ck_result = outcome.probes

    assert null_result.verdict == Verdict.PASS
    assert ck_result.verdict != Verdict.ERROR
    assert ck_result.z_score is not None, (
        f"cluster_kl should have z-scored against null baseline; "
        f"evidence={ck_result.evidence}, message={ck_result.message}"
    )
    assert math.isfinite(ck_result.z_score)