| 1 | """Integration test: ``external_perplexity`` end-to-end on a real tiny model. |
| 2 | |
| 3 | Runs the probe against SmolLM2-135M with a small random LoRA so both |
| 4 | sides produce real rolling-logprob values. The test asserts three |
| 5 | contracts: |
| 6 | |
| 7 | 1. The probe terminates in a non-ERROR verdict (the real backend's |
| 8 | ``rolling_logprob`` returns finite logprobs on natural English prose). |
| 9 | 2. The per-chunk delta array has the requested length and no NaNs. |
| 10 | 3. The null-calibration path lights up the ``z_score`` field in a |
| 11 | two-probe suite (``null_adapter`` first, then ``external_perplexity``). |
| 12 | |
| 13 | Marked ``slow+online``. |
| 14 | """ |
| 15 | |
| 16 | from __future__ import annotations |
| 17 | |
| 18 | import math |
| 19 | from collections.abc import Iterator |
| 20 | from pathlib import Path |
| 21 | |
| 22 | import numpy as np |
| 23 | import pytest |
| 24 | |
| 25 | from dlm_sway.backends.hf import HuggingFaceDifferentialBackend |
| 26 | from dlm_sway.core.model import ModelSpec |
| 27 | from dlm_sway.core.result import Verdict |
| 28 | from dlm_sway.probes.base import RunContext, build_probe |
| 29 | from dlm_sway.suite.runner import run as run_suite |
| 30 | from dlm_sway.suite.spec import SwaySpec |
| 31 | |
# Every test in this module loads a real model: opt in with -m "slow and online".
pytestmark = [pytest.mark.slow, pytest.mark.online]
| 33 | |
| 34 | |
def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None:
    """Build and save a tiny LoRA adapter with randomised B matrices.

    Loads the base model from ``base_dir``, attaches a rank-8 LoRA on the
    attention q/v projections, fills the ``lora_B`` weights with small random
    values (seeded for reproducibility), and writes adapter + tokenizer to
    ``out_dir``.
    """
    # Heavy third-party imports stay function-local so collecting this module
    # does not pull in torch/peft/transformers.
    import torch
    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM, AutoTokenizer

    torch.manual_seed(0)
    tok = AutoTokenizer.from_pretrained(str(base_dir))
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token
    model = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32)
    lora_cfg = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.0,
        bias="none",
        task_type="CAUSAL_LM",
    )
    wrapped = get_peft_model(model, lora_cfg)
    # Randomise the lora_B weights so the adapter measurably perturbs the base
    # model's outputs (small scale keeps perplexity deltas finite).
    with torch.no_grad():
        for pname, tensor in wrapped.named_parameters():
            if "lora_B" in pname:
                tensor.copy_(torch.randn_like(tensor) * 0.05)
    wrapped.save_pretrained(str(out_dir))
    tok.save_pretrained(str(out_dir))
| 60 | |
| 61 | |
@pytest.fixture(scope="module")
def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
    """Module-scoped temp directory containing a freshly built random LoRA adapter."""
    target = tmp_path_factory.mktemp("ext-ppl-random-adapter")
    _build_random_lora_adapter(tiny_model_dir, target)
    return target
| 67 | |
| 68 | |
@pytest.fixture(scope="module")
def hf_backend(
    tiny_model_dir: Path, random_adapter: Path
) -> Iterator[HuggingFaceDifferentialBackend]:
    """Yield a CPU/fp32 differential backend over the tiny base + random adapter."""
    base_spec = ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu")
    backend = HuggingFaceDifferentialBackend(base_spec=base_spec, adapter_path=random_adapter)
    yield backend
    # Module teardown: let the backend release whatever it holds open.
    backend.close()
| 79 | |
| 80 | |
def test_probe_runs_on_real_backend(hf_backend: HuggingFaceDifferentialBackend) -> None:
    """Contracts 1 & 2: non-ERROR verdict, finite raw score, full finite delta array.

    Fix: the original asserted finiteness of ``per_chunk_delta`` twice — once
    with a pure-Python ``math.isfinite`` loop and again with a redundant numpy
    re-check of the exact same array. One check suffices.
    """
    probe, spec = build_probe(
        {
            "name": "ext_ppl",
            "kind": "external_perplexity",
            "max_chunks": 2,
            "chunk_chars": 512,
        }
    )
    ctx = RunContext(backend=hf_backend)
    result = probe.run(spec, ctx)
    assert result.verdict != Verdict.ERROR, f"probe errored: {result.message}"
    assert result.raw is not None
    assert math.isfinite(result.raw)
    per_chunk = result.evidence["per_chunk_delta"]
    # One delta per requested chunk (max_chunks=2 above), and no NaN/inf values.
    assert len(per_chunk) == 2
    assert all(math.isfinite(d) for d in per_chunk)
| 99 | |
| 100 | |
def test_null_calibration_lights_up_zscore(hf_backend: HuggingFaceDifferentialBackend) -> None:
    """null_adapter → external_perplexity produces a z_score end-to-end."""
    suite_config = {
        "version": 1,
        "models": {
            "base": {"base": "placeholder"},
            "ft": {"base": "placeholder", "adapter": "/tmp/placeholder"},
        },
        "suite": [
            # Two null seeds keep runtime bounded; std just has to be
            # non-zero for the z-score path to engage.
            {"name": "null", "kind": "null_adapter", "runs": 2, "cache": False},
            {
                "name": "ext",
                "kind": "external_perplexity",
                "max_chunks": 2,
                "chunk_chars": 512,
                "assert_z_gte": -100.0,  # permissive — sign/magnitude is adapter-specific
            },
        ],
    }
    result = run_suite(SwaySpec.model_validate(suite_config), hf_backend)
    assert len(result.probes) == 2
    null_result, ext_result = result.probes
    assert null_result.verdict == Verdict.PASS
    assert ext_result.verdict != Verdict.ERROR
    assert ext_result.z_score is not None, (
        f"external_perplexity should have z-scored against null baseline; "
        f"evidence={ext_result.evidence}, message={ext_result.message}"
    )
    assert math.isfinite(ext_result.z_score)