sway Public

Watch 0 Fork 0 Star 0

Python · 6224 bytes Raw Blame History

  
        1
        """Integration test: ``cluster_kl`` end-to-end on a real tiny model.
      
        2
        
        3
        Mirrors ``test_external_perplexity_e2e`` — the sprint file for S16
      
        4
        explicitly lists this as a DoD item but the fixture was never shipped.
      
        5
        
        6
        The test:
      
        7
        
        8
        1. Builds a small random LoRA on SmolLM2-135M (same template as
      
        9
           ``test_external_perplexity_e2e``).
      
        10
        2. Runs ``cluster_kl`` with a 16-prompt two-topic set (animals +
      
        11
           programming) — split the ft signal across topics so the specificity
      
        12
           ratio has a chance to be meaningfully non-0.5.
      
        13
        3. Asserts the probe terminates in a non-ERROR verdict, the specificity
      
        14
           is finite and in ``[0, 1]``, and when preceded by ``null_adapter`` in
      
        15
           a suite the z-score field is populated.
      
        16
        
        17
        Needs the ``[semsim]`` extra at runtime (sentence-transformers +
      
        18
        scikit-learn). We assume integration runners install those; skip
      
        19
        gracefully when they don't.
      
        20
        """
      
        21
        
        22
        from __future__ import annotations
      
        23
        
        24
        import math
      
        25
        from collections.abc import Iterator
      
        26
        from pathlib import Path
      
        27
        
        28
        import pytest
      
        29
        
        30
        from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
      
        31
        from dlm_sway.core.model import ModelSpec
      
        32
        from dlm_sway.core.result import Verdict
      
        33
        from dlm_sway.probes.base import RunContext, build_probe
      
        34
        from dlm_sway.suite.runner import run as run_suite
      
        35
        from dlm_sway.suite.spec import SwaySpec
      
        36
        
        37
        pytestmark = [pytest.mark.slow, pytest.mark.online]
      
        38
        
        39
        
        40
        # Two clearly separated topics, 8 prompts each (16 total), so a
        # two-cluster split of the prompt set has an obvious answer.
        _ANIMAL_PROMPTS = [
            "The cat chased the mouse around the house.",
            "Dogs wag their tails when they are happy.",
            "Elephants never forget a face they have seen.",
            "Lions hunt in packs called prides.",
            "Horses gallop across open fields.",
            "Sharks have rows of sharp teeth.",
            "Bees pollinate flowers as they gather nectar.",
            "Owls hunt small rodents at night.",
        ]
        _PROGRAMMING_PROMPTS = [
            "Write a Python decorator that logs every call.",
            "Implement binary search in Rust.",
            "Debug a segmentation fault in C++ pointer arithmetic.",
            "Explain ownership semantics in Rust.",
            "Refactor this JavaScript callback hell into promises.",
            "Optimize the SQL query by adding an index.",
            "Profile the memory usage of a Go program.",
            "Write unit tests for a REST API endpoint.",
        ]
        # Topic A (animals) first, then topic B (programming).
        _PROMPTS = _ANIMAL_PROMPTS + _PROGRAMMING_PROMPTS
      
        61
        
        62
        
        63
        def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None:
            """Write a LoRA adapter with randomised ``lora_B`` weights to *out_dir*.

            The tokenizer and causal LM are loaded from *base_dir* (fp32), the model
            is wrapped with a rank-8 LoRA on ``q_proj``/``v_proj``, and every
            parameter whose name contains ``lora_B`` is overwritten with standard
            normal noise scaled by 0.05. Both the adapter and the tokenizer are then
            saved into *out_dir*.

            Args:
                base_dir: Directory holding the base model and tokenizer.
                out_dir: Directory that receives the adapter and tokenizer files.
            """
            # Heavy dependencies are imported locally rather than at module level.
            import torch
            from peft import LoraConfig, get_peft_model
            from transformers import AutoModelForCausalLM, AutoTokenizer

            # Fixed seed so the random draws below are repeatable.
            torch.manual_seed(0)
            model_path = str(base_dir)
            save_path = str(out_dir)

            tok = AutoTokenizer.from_pretrained(model_path)
            if tok.pad_token_id is None:
                # No pad token configured: reuse EOS as padding.
                tok.pad_token = tok.eos_token

            lora_model = get_peft_model(
                AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float32),
                LoraConfig(
                    r=8,
                    lora_alpha=16,
                    target_modules=["q_proj", "v_proj"],
                    lora_dropout=0.0,
                    bias="none",
                    task_type="CAUSAL_LM",
                ),
            )

            # NOTE(review): presumably needed because a freshly initialised LoRA is a
            # no-op until ``lora_B`` is non-zero — confirm against the PEFT defaults.
            with torch.no_grad():
                lora_b_weights = (
                    weight
                    for param_name, weight in lora_model.named_parameters()
                    if "lora_B" in param_name
                )
                for weight in lora_b_weights:
                    weight.copy_(torch.randn_like(weight) * 0.05)

            lora_model.save_pretrained(save_path)
            tok.save_pretrained(save_path)
      
        88
        
        89
        
        90
        @pytest.fixture(scope="module")
        def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
            """Build the random LoRA adapter once per module and return its directory.

            The adapter is written into a fresh temporary directory obtained from
            ``tmp_path_factory``; ``tiny_model_dir`` is the base model it is built on.
            """
            target = tmp_path_factory.mktemp("cluster-kl-random-adapter")
            _build_random_lora_adapter(base_dir=tiny_model_dir, out_dir=target)
            return target
      
        95
        
        96
        
        97
        @pytest.fixture(scope="module")
        def hf_backend(
            tiny_model_dir: Path, random_adapter: Path
        ) -> Iterator[HuggingFaceDifferentialBackend]:
            """Yield a module-scoped differential backend and close it afterwards.

            The backend pairs the tiny base model (HF kind, fp32, CPU) with the
            adapter directory produced by ``random_adapter``.
            """
            base_model = ModelSpec(
                base=str(tiny_model_dir),
                kind="hf",
                dtype="fp32",
                device="cpu",
            )
            differential = HuggingFaceDifferentialBackend(
                base_spec=base_model,
                adapter_path=random_adapter,
            )
            yield differential
            # Teardown: runs once the tests using this fixture have finished.
            differential.close()
      
        107
        
        108
        
        109
        def test_probe_runs_on_real_backend(hf_backend: HuggingFaceDifferentialBackend) -> None:
            """Run a standalone ``cluster_kl`` probe on the real backend and sanity-check it.

            Skipped when scikit-learn or sentence-transformers is not importable.
            """
            for optional_dep in ("sklearn", "sentence_transformers"):
                pytest.importorskip(optional_dep)

            probe_config = {
                "name": "ck",
                "kind": "cluster_kl",
                "prompts": _PROMPTS,
                "num_clusters": 2,
                "min_prompts": 16,
            }
            probe, spec = build_probe(probe_config)
            result = probe.run(spec, RunContext(backend=hf_backend))

            assert result.verdict != Verdict.ERROR, f"probe errored: {result.message}"

            # With a small random LoRA the direction of the specificity is not
            # predictable, so only its presence, finiteness and [0, 1] range are pinned.
            score = result.raw
            assert score is not None
            assert math.isfinite(score)
            assert 0.0 <= score <= 1.0

            # The evidence payload should echo the requested shape of the run.
            evidence = result.evidence
            assert evidence["num_clusters"] == 2
            assert evidence["num_prompts"] == 16
            assert len(evidence["per_cluster_mean_kl"]) == 2
      
        135
        
        136
        
        137
        def test_null_calibration_lights_up_zscore(
            hf_backend: HuggingFaceDifferentialBackend,
        ) -> None:
            """A suite of ``null_adapter`` then ``cluster_kl`` yields a finite z-score.

            Skipped when scikit-learn or sentence-transformers is not importable.
            """
            for optional_dep in ("sklearn", "sentence_transformers"):
                pytest.importorskip(optional_dep)

            # The model entries are placeholder strings; the backend built by the
            # fixture is handed to the runner directly.
            models = {
                "base": {"base": "placeholder"},
                "ft": {"base": "placeholder", "adapter": "/tmp/placeholder"},
            }
            null_probe = {"name": "null", "kind": "null_adapter", "runs": 2, "cache": False}
            cluster_probe = {
                "name": "ck",
                "kind": "cluster_kl",
                "prompts": _PROMPTS,
                "num_clusters": 2,
                "min_prompts": 16,
                "assert_z_gte": -100.0,  # permissive — just want z populated
            }
            suite_spec = SwaySpec.model_validate(
                {"version": 1, "models": models, "suite": [null_probe, cluster_probe]}
            )

            outcome = run_suite(suite_spec, hf_backend)
            assert len(outcome.probes) == 2

            # Results come back in suite order: null baseline first, then cluster_kl.
            null_result, ck_result = outcome.probes
            assert null_result.verdict == Verdict.PASS
            assert ck_result.verdict != Verdict.ERROR
            assert ck_result.z_score is not None, (
                f"cluster_kl should have z-scored against null baseline; "
                f"evidence={ck_result.evidence}, message={ck_result.message}"
            )
            assert math.isfinite(ck_result.z_score)

1	"""Integration test: ``cluster_kl`` end-to-end on a real tiny model.
2
3	Mirrors ``test_external_perplexity_e2e`` — the sprint file for S16
4	explicitly lists this as a DoD item but the fixture was never shipped.
5
6	The test:
7
8	1. Builds a small random LoRA on SmolLM2-135M (same template as
9	``test_external_perplexity_e2e``).
10	2. Runs ``cluster_kl`` with a 16-prompt two-topic set (animals +
11	programming) — split the ft signal across topics so the specificity
12	ratio has a chance to be meaningfully non-0.5.
13	3. Asserts the probe terminates in a non-ERROR verdict, the specificity
14	is finite and in ``[0, 1]``, and when preceded by ``null_adapter`` in
15	a suite the z-score field is populated.
16
17	Needs the ``[semsim]`` extra at runtime (sentence-transformers +
18	scikit-learn). We assume integration runners install those; skip
19	gracefully when they don't.
20	"""
21
22	from __future__ import annotations
23
24	import math
25	from collections.abc import Iterator
26	from pathlib import Path
27
28	import pytest
29
30	from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
31	from dlm_sway.core.model import ModelSpec
32	from dlm_sway.core.result import Verdict
33	from dlm_sway.probes.base import RunContext, build_probe
34	from dlm_sway.suite.runner import run as run_suite
35	from dlm_sway.suite.spec import SwaySpec
36
37	pytestmark = [pytest.mark.slow, pytest.mark.online]
38
39
# Two clearly separated topics, 8 prompts each (16 total), so a
# two-cluster split of the prompt set has an obvious answer.
_ANIMAL_PROMPTS = [
    "The cat chased the mouse around the house.",
    "Dogs wag their tails when they are happy.",
    "Elephants never forget a face they have seen.",
    "Lions hunt in packs called prides.",
    "Horses gallop across open fields.",
    "Sharks have rows of sharp teeth.",
    "Bees pollinate flowers as they gather nectar.",
    "Owls hunt small rodents at night.",
]
_PROGRAMMING_PROMPTS = [
    "Write a Python decorator that logs every call.",
    "Implement binary search in Rust.",
    "Debug a segmentation fault in C++ pointer arithmetic.",
    "Explain ownership semantics in Rust.",
    "Refactor this JavaScript callback hell into promises.",
    "Optimize the SQL query by adding an index.",
    "Profile the memory usage of a Go program.",
    "Write unit tests for a REST API endpoint.",
]
# Topic A (animals) first, then topic B (programming).
_PROMPTS = _ANIMAL_PROMPTS + _PROGRAMMING_PROMPTS
61
62
def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None:
    """Write a LoRA adapter with randomised ``lora_B`` weights to *out_dir*.

    The tokenizer and causal LM are loaded from *base_dir* (fp32), the model
    is wrapped with a rank-8 LoRA on ``q_proj``/``v_proj``, and every
    parameter whose name contains ``lora_B`` is overwritten with standard
    normal noise scaled by 0.05. Both the adapter and the tokenizer are then
    saved into *out_dir*.

    Args:
        base_dir: Directory holding the base model and tokenizer.
        out_dir: Directory that receives the adapter and tokenizer files.
    """
    # Heavy dependencies are imported locally rather than at module level.
    import torch
    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Fixed seed so the random draws below are repeatable.
    torch.manual_seed(0)
    model_path = str(base_dir)
    save_path = str(out_dir)

    tok = AutoTokenizer.from_pretrained(model_path)
    if tok.pad_token_id is None:
        # No pad token configured: reuse EOS as padding.
        tok.pad_token = tok.eos_token

    lora_model = get_peft_model(
        AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float32),
        LoraConfig(
            r=8,
            lora_alpha=16,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=0.0,
            bias="none",
            task_type="CAUSAL_LM",
        ),
    )

    # NOTE(review): presumably needed because a freshly initialised LoRA is a
    # no-op until ``lora_B`` is non-zero — confirm against the PEFT defaults.
    with torch.no_grad():
        lora_b_weights = (
            weight
            for param_name, weight in lora_model.named_parameters()
            if "lora_B" in param_name
        )
        for weight in lora_b_weights:
            weight.copy_(torch.randn_like(weight) * 0.05)

    lora_model.save_pretrained(save_path)
    tok.save_pretrained(save_path)
88
89
@pytest.fixture(scope="module")
def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
    """Build the random LoRA adapter once per module and return its directory.

    The adapter is written into a fresh temporary directory obtained from
    ``tmp_path_factory``; ``tiny_model_dir`` is the base model it is built on.
    """
    target = tmp_path_factory.mktemp("cluster-kl-random-adapter")
    _build_random_lora_adapter(base_dir=tiny_model_dir, out_dir=target)
    return target
95
96
@pytest.fixture(scope="module")
def hf_backend(
    tiny_model_dir: Path, random_adapter: Path
) -> Iterator[HuggingFaceDifferentialBackend]:
    """Yield a module-scoped differential backend and close it afterwards.

    The backend pairs the tiny base model (HF kind, fp32, CPU) with the
    adapter directory produced by ``random_adapter``.
    """
    base_model = ModelSpec(
        base=str(tiny_model_dir),
        kind="hf",
        dtype="fp32",
        device="cpu",
    )
    differential = HuggingFaceDifferentialBackend(
        base_spec=base_model,
        adapter_path=random_adapter,
    )
    yield differential
    # Teardown: runs once the tests using this fixture have finished.
    differential.close()
107
108
def test_probe_runs_on_real_backend(hf_backend: HuggingFaceDifferentialBackend) -> None:
    """Run a standalone ``cluster_kl`` probe on the real backend and sanity-check it.

    Skipped when scikit-learn or sentence-transformers is not importable.
    """
    for optional_dep in ("sklearn", "sentence_transformers"):
        pytest.importorskip(optional_dep)

    probe_config = {
        "name": "ck",
        "kind": "cluster_kl",
        "prompts": _PROMPTS,
        "num_clusters": 2,
        "min_prompts": 16,
    }
    probe, spec = build_probe(probe_config)
    result = probe.run(spec, RunContext(backend=hf_backend))

    assert result.verdict != Verdict.ERROR, f"probe errored: {result.message}"

    # With a small random LoRA the direction of the specificity is not
    # predictable, so only its presence, finiteness and [0, 1] range are pinned.
    score = result.raw
    assert score is not None
    assert math.isfinite(score)
    assert 0.0 <= score <= 1.0

    # The evidence payload should echo the requested shape of the run.
    evidence = result.evidence
    assert evidence["num_clusters"] == 2
    assert evidence["num_prompts"] == 16
    assert len(evidence["per_cluster_mean_kl"]) == 2
135
136
def test_null_calibration_lights_up_zscore(
    hf_backend: HuggingFaceDifferentialBackend,
) -> None:
    """A suite of ``null_adapter`` then ``cluster_kl`` yields a finite z-score.

    Skipped when scikit-learn or sentence-transformers is not importable.
    """
    for optional_dep in ("sklearn", "sentence_transformers"):
        pytest.importorskip(optional_dep)

    # The model entries are placeholder strings; the backend built by the
    # fixture is handed to the runner directly.
    models = {
        "base": {"base": "placeholder"},
        "ft": {"base": "placeholder", "adapter": "/tmp/placeholder"},
    }
    null_probe = {"name": "null", "kind": "null_adapter", "runs": 2, "cache": False}
    cluster_probe = {
        "name": "ck",
        "kind": "cluster_kl",
        "prompts": _PROMPTS,
        "num_clusters": 2,
        "min_prompts": 16,
        "assert_z_gte": -100.0,  # permissive — just want z populated
    }
    suite_spec = SwaySpec.model_validate(
        {"version": 1, "models": models, "suite": [null_probe, cluster_probe]}
    )

    outcome = run_suite(suite_spec, hf_backend)
    assert len(outcome.probes) == 2

    # Results come back in suite order: null baseline first, then cluster_kl.
    null_result, ck_result = outcome.probes
    assert null_result.verdict == Verdict.PASS
    assert ck_result.verdict != Verdict.ERROR
    assert ck_result.z_score is not None, (
        f"cluster_kl should have z-scored against null baseline; "
        f"evidence={ck_result.evidence}, message={ck_result.message}"
    )
    assert math.isfinite(ck_result.z_score)