1 """Integration test: multi_turn_coherence_decay end-to-end on a tiny LoRA.
2
3 Builds a tiny random LoRA on SmolLM2-135M-Instruct (which has a real
4 chat_template) and runs the probe through 4 turns of synthetic
5 dialogue. The intent isn't to assert specific KL values — they
6 depend on the random adapter — but to exercise the full code path
7 on a real backend so a regression in the chat-template wiring,
8 turn-loop, or curve-fit plumbing surfaces in slow CI.
9
10 Marked ``slow + online``.
11 """

from __future__ import annotations

import math
from pathlib import Path

import pytest

pytestmark = [pytest.mark.slow, pytest.mark.online]


def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None:
    """Same shape as the other slow-lane backend tests."""
    import torch
    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM, AutoTokenizer

    torch.manual_seed(0)
    tokenizer = AutoTokenizer.from_pretrained(str(base_dir))
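    # Instruct tokenizers often ship without a pad token; reusing EOS keeps
    # padded generation from erroring out.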
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
    base = AutoModelForCausalLM.from_pretrained(str(base_dir), torch_dtype=torch.float32)
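    # Rank-8 LoRA over the attention query/value projections: the classic
    # minimal target set, enough to shift the output distribution measurably.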
    cfg = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.0,
        bias="none",
        task_type="CAUSAL_LM",
    )
    peft_model = get_peft_model(base, cfg)
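    # PEFT zero-initializes lora_B, so the wrapped model is a no-op adapter
    # until the B matrices are perturbed below.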
    with torch.no_grad():
        for name, param in peft_model.named_parameters():
            if "lora_B" in name:
                # Tiny perturbation — base != ft, but generations stay sane
                # enough to thread through the dialogue turns without
                # collapsing to junk.
                param.copy_(torch.randn_like(param) * 0.02)
    peft_model.save_pretrained(str(out_dir))
    tokenizer.save_pretrained(str(out_dir))


@pytest.fixture(scope="module")
def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
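    # Module scope: build the adapter once and share it across tests here.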
    out = tmp_path_factory.mktemp("multi-turn-coherence-adapter")
    _build_random_lora_adapter(tiny_model_dir, out)
    return out


def test_probe_runs_end_to_end_on_real_adapter(tiny_model_dir: Path, random_adapter: Path) -> None:
    """Smoke: HF backend + chat_template-equipped tokenizer + real
    multi-turn dialogue produces a finalized result with the
    documented evidence keys + finite per-turn KLs."""
    from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
    from dlm_sway.core.model import ModelSpec
    from dlm_sway.core.result import Verdict
    from dlm_sway.probes.base import RunContext, build_probe

    backend = HuggingFaceDifferentialBackend(
        base_spec=ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu"),
        adapter_path=random_adapter,
    )
    try:
        probe, spec = build_probe(
            {
                "name": "mtc_smoke",
                "kind": "multi_turn_coherence_decay",
                "prompts": [
                    "What's the difference between TCP and UDP?",
                    "Explain how a neural network learns.",
                ],
                "max_turns": 3,
                "max_new_tokens": 32,  # keep CPU runtime under control
            }
        )
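        # seed pins generation; top_k presumably truncates the logit support
        # used for the KL comparison. This test asserts shape, not values, so
        # the exact semantics don't matter here.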
        ctx = RunContext(backend=backend, seed=0, top_k=64)
        result = probe.run(spec, ctx)
    finally:
        backend.close()

    # Shape: any verdict that isn't ERROR is fine. We don't pin
    # PASS/FAIL because the random adapter's actual decay shape isn't
    # under our control.
    assert result.verdict in {
        Verdict.PASS,
        Verdict.FAIL,
        Verdict.WARN,
    }, result.message
    assert result.evidence["max_turns"] == 3
    assert result.evidence["num_prompts"] == 2
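    # With max_turns=3 there are presumably two turn-over-turn comparisons,
    # hence max_turns - 1 per-turn KL values below.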
    per_turn = result.evidence["per_turn_kls"]
    assert len(per_turn) == 2  # max_turns - 1
    for kl in per_turn:
        assert isinstance(kl, float)
        assert math.isfinite(kl)  # the docstring promises finite per-turn KLs
        assert kl >= 0.0  # KL divergence is non-negative
    assert result.evidence["fit_status"] in {"ok", "stable", "non_monotonic", "degenerate"}
    sparkline = result.evidence["sparkline"]
    assert isinstance(sparkline, str)
    assert len(sparkline) == 2  # presumably one glyph per per-turn KL value