| 1 | """Integration test: HF backend scoring methods on a real tiny model. |
| 2 | |
| 3 | Covers ``logprob_of`` / ``rolling_logprob`` / ``next_token_dist`` for |
| 4 | both base and ft views — the surface area sway probes hammer hardest |
| 5 | and the area Audit 01 flagged as 21% covered (C2). |
| 6 | |
The zero-token-completion path of ``logprob_of`` (which raises
``ProbeError``) is exercised here directly; otherwise its only coverage
would be the full CLI integration test, which can catch it for the
wrong reason.
| 10 | |
Marked ``slow`` and ``online``.
| 12 | """ |
| 13 | |
| 14 | from __future__ import annotations |
| 15 | |
import math
from collections.abc import Iterator
from pathlib import Path
| 18 | |
| 19 | import numpy as np |
| 20 | import pytest |
| 21 | |
| 22 | from dlm_sway.backends.hf import HuggingFaceDifferentialBackend |
| 23 | from dlm_sway.core.errors import ProbeError |
| 24 | from dlm_sway.core.model import ModelSpec |
| 25 | |
| 26 | pytestmark = [pytest.mark.slow, pytest.mark.online] |
| 27 | |
| 28 | |
| 29 | def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None: |
| 30 | """Same shape as the toggle-test adapter.""" |
| 31 | import torch |
| 32 | from peft import LoraConfig, get_peft_model |
| 33 | from transformers import AutoModelForCausalLM, AutoTokenizer |
| 34 | |
| 35 | torch.manual_seed(0) |
| 36 | tokenizer = AutoTokenizer.from_pretrained(str(base_dir)) |
| 37 | if tokenizer.pad_token_id is None: |
| 38 | tokenizer.pad_token = tokenizer.eos_token |
| 39 | base = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32) |
| 40 | cfg = LoraConfig( |
| 41 | r=8, |
| 42 | lora_alpha=16, |
| 43 | target_modules=["q_proj", "v_proj"], |
| 44 | lora_dropout=0.0, |
| 45 | bias="none", |
| 46 | task_type="CAUSAL_LM", |
| 47 | ) |
| 48 | peft_model = get_peft_model(base, cfg) |
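    # PEFT zero-initializes lora_B, so a fresh adapter leaves the logits
    # untouched; overwrite it with small random values to get a real
    # base-vs-ft delta without any training.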
| 49 | with torch.no_grad(): |
| 50 | for name, param in peft_model.named_parameters(): |
| 51 | if "lora_B" in name: |
| 52 | param.copy_(torch.randn_like(param) * 0.05) |
| 53 | peft_model.save_pretrained(str(out_dir)) |
| 54 | tokenizer.save_pretrained(str(out_dir)) |
| 55 | |
| 56 | |
| 57 | @pytest.fixture(scope="module") |
| 58 | def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path: |
| 59 | adapter_dir = tmp_path_factory.mktemp("scoring-random-adapter") |
| 60 | _build_random_lora_adapter(tiny_model_dir, adapter_dir) |
| 61 | return adapter_dir |
| 62 | |
| 63 | |
| 64 | @pytest.fixture(scope="module") |
def hf_backend(
    tiny_model_dir: Path, random_adapter: Path
) -> Iterator[HuggingFaceDifferentialBackend]:
| 66 | backend = HuggingFaceDifferentialBackend( |
| 67 | base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"), |
| 68 | adapter_path=random_adapter, |
| 69 | ) |
| 70 | yield backend |
| 71 | backend.close() |
| 72 | |
| 73 | |
| 74 | _PROMPTS_AND_COMPLETIONS = [ |
| 75 | ("The capital of France is", " Paris"), |
| 76 | ("Two plus two equals", " four"), |
| 77 | ("The quick brown fox jumps over the", " lazy dog"), |
| 78 | ] |
| 79 | |
| 80 | |
| 81 | class TestLogprobOf: |
| 82 | @pytest.mark.parametrize(("prompt", "completion"), _PROMPTS_AND_COMPLETIONS) |
| 83 | def test_finite_negative_for_real_completions( |
| 84 | self, |
| 85 | hf_backend: HuggingFaceDifferentialBackend, |
| 86 | prompt: str, |
| 87 | completion: str, |
| 88 | ) -> None: |
| 89 | with hf_backend.as_base() as b: |
| 90 | lp_base = b.logprob_of(prompt, completion) |
| 91 | with hf_backend.as_finetuned() as f: |
| 92 | lp_ft = f.logprob_of(prompt, completion) |
| 93 | assert math.isfinite(lp_base) |
| 94 | assert lp_base < 0.0 |
| 95 | assert math.isfinite(lp_ft) |
| 96 | assert lp_ft < 0.0 |
| 97 | |
| 98 | def test_zero_token_completion_raises_probe_error( |
| 99 | self, hf_backend: HuggingFaceDifferentialBackend |
| 100 | ) -> None: |
| 101 | """Empty completion tokenizes to zero new tokens — the entry |
| 102 | point must reject it loudly so a probe can route to ERROR.""" |
| 103 | with hf_backend.as_base() as b: |
| 104 | with pytest.raises(ProbeError, match="completion tokenized to zero"): |
| 105 | b.logprob_of("hello", "") |
| 106 | |
| 107 | def test_longer_completion_is_more_negative( |
| 108 | self, hf_backend: HuggingFaceDifferentialBackend |
| 109 | ) -> None: |
| 110 | """Sanity: extending a completion can only add negative logprob.""" |
| 111 | with hf_backend.as_base() as b: |
| 112 | short = b.logprob_of("the prefix is", " short") |
| 113 | longer = b.logprob_of("the prefix is", " short and gets longer here") |
| 114 | assert longer < short, f"longer={longer}, short={short}" |
| 115 | |
| 116 | |
| 117 | class TestRollingLogprob: |
| 118 | def test_returns_per_position_logprobs_and_finite_summary( |
| 119 | self, hf_backend: HuggingFaceDifferentialBackend |
| 120 | ) -> None: |
| 121 | with hf_backend.as_base() as b: |
| 122 | r = b.rolling_logprob("Hello world. This is a sentence.") |
| 123 | assert r.num_tokens >= 2 |
| 124 | assert r.logprobs.size == r.num_tokens - 1 |
| 125 | assert math.isfinite(r.total_logprob) |
| 126 | assert math.isfinite(r.mean_logprob) |
| 127 | assert math.isfinite(r.perplexity) |
        assert r.perplexity > 1.0  # PPL exceeds 1 unless the model is perfectly certain
| 129 | |
| 130 | def test_short_text_under_two_tokens_returns_empty( |
| 131 | self, hf_backend: HuggingFaceDifferentialBackend |
| 132 | ) -> None: |
| 133 | """Single-token text has no per-position predictions to gather.""" |
| 134 | with hf_backend.as_base() as b: |
| 135 | r = b.rolling_logprob("a") |
| 136 | assert r.logprobs.size == 0 |
| 137 | assert r.total_logprob == 0.0 |
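
    def test_summary_stats_are_internally_consistent(
        self, hf_backend: HuggingFaceDifferentialBackend
    ) -> None:
        """Hedged cross-check, assuming the conventional definitions:
        ``total = sum(logprobs)``, ``mean = total / (num_tokens - 1)``,
        and ``ppl = exp(-mean)``. If the backend defines these
        differently, relax or drop these assertions."""
        with hf_backend.as_base() as b:
            r = b.rolling_logprob("Hello world. This is a sentence.")
        assert r.total_logprob == pytest.approx(float(r.logprobs.sum()), rel=1e-5)
        assert r.mean_logprob == pytest.approx(r.total_logprob / (r.num_tokens - 1), rel=1e-5)
        assert r.perplexity == pytest.approx(math.exp(-r.mean_logprob), rel=1e-5)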
| 138 | |
| 139 | |
| 140 | class TestGenerate: |
| 141 | def test_greedy_generation_returns_string( |
| 142 | self, hf_backend: HuggingFaceDifferentialBackend |
| 143 | ) -> None: |
| 144 | with hf_backend.as_base() as b: |
| 145 | out = b.generate("Hello", max_new_tokens=8, seed=0) |
| 146 | assert isinstance(out, str) |
| 147 | assert len(out) > 0 |
| 148 | |
| 149 | def test_sampled_generation_obeys_seed( |
| 150 | self, hf_backend: HuggingFaceDifferentialBackend |
| 151 | ) -> None: |
| 152 | """``temperature > 0`` engages the sampling path (do_sample=True).""" |
        with hf_backend.as_base() as b:
            first = b.generate("The future of AI is", max_new_tokens=8, temperature=0.7, seed=7)
            second = b.generate("The future of AI is", max_new_tokens=8, temperature=0.7, seed=7)
        assert first == second, (
            f"sampled generation not deterministic at seed=7: {first!r} vs {second!r}"
        )
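
    def test_greedy_generation_is_deterministic(
        self, hf_backend: HuggingFaceDifferentialBackend
    ) -> None:
        """Hedged: assumes the no-``temperature`` call path is greedy
        decoding, as the first test's name implies, so repeated calls
        must match byte-for-byte."""
        with hf_backend.as_base() as b:
            first = b.generate("Hello", max_new_tokens=8, seed=0)
            second = b.generate("Hello", max_new_tokens=8, seed=0)
        assert first == second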
| 157 | |
| 158 | |
| 159 | class TestNextTokenDist: |
| 160 | def test_top_k_dist_finite_and_sorted(self, hf_backend: HuggingFaceDifferentialBackend) -> None: |
| 161 | with hf_backend.as_base() as b: |
| 162 | d = b.next_token_dist("The capital of France is", top_k=64) |
| 163 | assert d.token_ids.shape == (64,) |
| 164 | assert d.logprobs.shape == (64,) |
| 165 | assert np.all(np.isfinite(d.logprobs)) |
| 166 | # Top-k must arrive in descending probability order. |
| 167 | assert np.all(np.diff(d.logprobs) <= 1e-7) |
| 168 | assert d.vocab_size > 64 |
| 169 | # B6: tail_logprob is None (k covers vocab — won't happen here), |
| 170 | # 0.0 (underflow), or a finite negative log-prob. |
| 171 | assert d.tail_logprob is None or math.isfinite(d.tail_logprob) |
| 172 | |
| 173 | def test_dist_changes_under_adapter(self, hf_backend: HuggingFaceDifferentialBackend) -> None: |
| 174 | prompt = "the adapter influences" |
| 175 | with hf_backend.as_base() as b: |
| 176 | base_dist = b.next_token_dist(prompt, top_k=32) |
| 177 | with hf_backend.as_finetuned() as f: |
| 178 | ft_dist = f.next_token_dist(prompt, top_k=32) |
| 179 | # Either the top-32 token IDs reordered, or at least one logprob |
| 180 | # moved by more than fp32 noise. |
| 181 | same_ids = np.array_equal(base_dist.token_ids, ft_dist.token_ids) |
| 182 | if same_ids: |
| 183 | assert not np.allclose(base_dist.logprobs, ft_dist.logprobs, atol=1e-5) |
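
    def test_top_k_mass_is_at_most_one(
        self, hf_backend: HuggingFaceDifferentialBackend
    ) -> None:
        """Hedged cross-check: assumes ``logprobs`` are log-probabilities
        over the full-vocab softmax (as the field name suggests), so the
        top-64 probabilities sum to at most 1."""
        with hf_backend.as_base() as b:
            d = b.next_token_dist("The capital of France is", top_k=64)
        top_mass = float(np.exp(d.logprobs).sum())
        assert 0.0 < top_mass <= 1.0 + 1e-5  # small fp32 slack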