sway Public

Watch 0 Fork 0 Star 0

Python · 5087 bytes Raw Blame History

  
        1
        """Integration test: tool_use_fidelity end-to-end on a real tiny adapter.
      
        2
        
        3
        Builds a tiny random LoRA on SmolLM2-135M-Instruct and runs the probe
      
        4
        against a single tool-use case. The intent isn't to assert that the
      
        5
        135M base produces useful tool calls — it almost certainly won't —
      
        6
        but to exercise the full code path on a real backend so a regression
      
        7
        in the JSON-extraction / schema-check / hallucination plumbing
      
        8
        surfaces in slow CI rather than only at user-fix time.
      
        9
        
        10
        Marked ``slow + online``.
      
        11
        """
      
        12
        
        13
        from __future__ import annotations
      
        14
        
        15
        from pathlib import Path
      
        16
        
        17
        import pytest
      
        18
        
        19
        pytestmark = [pytest.mark.slow, pytest.mark.online]
      
        20
        
        21
        
        22
        def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None:
            """Write a lightly randomized q_proj/v_proj LoRA adapter into ``out_dir``.

            The tokenizer and fp32 causal-LM weights are loaded from ``base_dir``,
            the model is wrapped in a rank-8 LoRA, every ``lora_B`` parameter is
            overwritten with small seeded Gaussian noise, and the adapter plus
            the tokenizer are saved to ``out_dir``. Same adapter shape as the
            other slow-lane backend tests.
            """
            import torch
            from peft import LoraConfig, get_peft_model
            from transformers import AutoModelForCausalLM, AutoTokenizer

            base_path = str(base_dir)
            save_path = str(out_dir)

            # Seed first so the noise written below is reproducible across runs.
            torch.manual_seed(0)

            tok = AutoTokenizer.from_pretrained(base_path)
            if tok.pad_token_id is None:
                # No pad token configured: reuse EOS as the pad token.
                tok.pad_token = tok.eos_token

            base_model = AutoModelForCausalLM.from_pretrained(base_path, torch_dtype=torch.float32)
            lora_model = get_peft_model(
                base_model,
                LoraConfig(
                    r=8,
                    lora_alpha=16,
                    target_modules=["q_proj", "v_proj"],
                    lora_dropout=0.0,
                    bias="none",
                    task_type="CAUSAL_LM",
                ),
            )

            with torch.no_grad():
                for param_name, weight in lora_model.named_parameters():
                    if "lora_B" not in param_name:
                        continue
                    # Small noise: large enough that base.generate differs from
                    # ft.generate, small enough to keep generations finite and
                    # sensible.
                    weight.copy_(torch.randn_like(weight) * 0.01)

            lora_model.save_pretrained(save_path)
            tok.save_pretrained(save_path)
      
        51
        
        52
        
        53
        @pytest.fixture(scope="module")
        def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
            """Build the random LoRA adapter once per module and return its directory."""
            adapter_dir = tmp_path_factory.mktemp("tool-use-fidelity-adapter")
            _build_random_lora_adapter(base_dir=tiny_model_dir, out_dir=adapter_dir)
            return adapter_dir
      
        58
        
        59
        
        60
        def test_probe_runs_end_to_end_on_real_adapter(tiny_model_dir: Path, random_adapter: Path) -> None:
            """Smoke-run ``tool_use_fidelity`` through the HF backend with a real adapter.

            SmolLM2-135M can't reliably emit OpenAI-shape calls, so the probe will
            likely FAIL the validity floor. The pass/fail outcome is therefore not
            asserted; only that a structured verdict came back, every evidence key
            is present, and the rate metrics sit inside their expected ranges.
            """
            from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
            from dlm_sway.core.model import ModelSpec
            from dlm_sway.core.result import Verdict
            from dlm_sway.probes.base import RunContext, build_probe

            # One tool-use case: a prompt, the tool schema, and the expected tool.
            search_tool_spec = {
                "name": "search_web",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "query": {"type": "string"},
                        "max_results": {"type": "integer"},
                    },
                    "required": ["query"],
                },
            }
            smoke_case = {
                "prompt": (
                    "You are a tool-using assistant. The user asks: "
                    "search the web for cats. Reply with ONLY a JSON "
                    'object of the form {"name": ..., "arguments": {...}}.'
                ),
                "tool_spec": search_tool_spec,
                "gold_tool_name": "search_web",
                "max_new_tokens": 64,
            }
            probe_config = {
                "name": "tuf_smoke",
                "kind": "tool_use_fidelity",
                "cases": [smoke_case],
                "allowed_tools": ["search_web"],
            }

            base_spec = ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu")
            backend = HuggingFaceDifferentialBackend(base_spec=base_spec, adapter_path=random_adapter)
            try:
                probe, spec = build_probe(probe_config)
                run_ctx = RunContext(backend=backend, seed=0)
                result = probe.run(spec, run_ctx)
            finally:
                # Close the backend even when building or running the probe raises.
                backend.close()

            # End-to-end shape: a verdict came back, evidence carries every
            # documented key, and the rates are within their bounds.
            assert result.verdict in {Verdict.PASS, Verdict.FAIL, Verdict.WARN}, result.message
            evidence = result.evidence
            assert evidence["num_cases"] == 1

            documented_keys = (
                "json_valid_rate_base",
                "json_valid_rate_ft",
                "validity_delta",
                "mean_arg_disagreement",
                "hallucination_rate",
            )
            for required_key in documented_keys:
                assert required_key in evidence, f"missing evidence key {required_key}"

            # (metric, lower bound); every metric shares the upper bound 1.0.
            lower_bounds = (
                ("json_valid_rate_ft", 0.0),
                ("json_valid_rate_base", 0.0),
                ("validity_delta", -1.0),
                ("hallucination_rate", 0.0),
            )
            for metric, lower in lower_bounds:
                assert lower <= evidence[metric] <= 1.0

1	"""Integration test: tool_use_fidelity end-to-end on a real tiny adapter.
2
3	Builds a tiny random LoRA on SmolLM2-135M-Instruct and runs the probe
4	against a single tool-use case. The intent isn't to assert that the
5	135M base produces useful tool calls — it almost certainly won't —
6	but to exercise the full code path on a real backend so a regression
7	in the JSON-extraction / schema-check / hallucination plumbing
8	surfaces in slow CI rather than only at user-fix time.
9
10	Marked ``slow + online``.
11	"""
12
13	from __future__ import annotations
14
15	from pathlib import Path
16
17	import pytest
18
19	pytestmark = [pytest.mark.slow, pytest.mark.online]
20
21
def _build_random_lora_adapter(base_dir: Path, out_dir: Path) -> None:
    """Write a lightly randomized q_proj/v_proj LoRA adapter into ``out_dir``.

    The tokenizer and fp32 causal-LM weights are loaded from ``base_dir``,
    the model is wrapped in a rank-8 LoRA, every ``lora_B`` parameter is
    overwritten with small seeded Gaussian noise, and the adapter plus
    the tokenizer are saved to ``out_dir``. Same adapter shape as the
    other slow-lane backend tests.
    """
    import torch
    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM, AutoTokenizer

    base_path = str(base_dir)
    save_path = str(out_dir)

    # Seed first so the noise written below is reproducible across runs.
    torch.manual_seed(0)

    tok = AutoTokenizer.from_pretrained(base_path)
    if tok.pad_token_id is None:
        # No pad token configured: reuse EOS as the pad token.
        tok.pad_token = tok.eos_token

    base_model = AutoModelForCausalLM.from_pretrained(base_path, torch_dtype=torch.float32)
    lora_model = get_peft_model(
        base_model,
        LoraConfig(
            r=8,
            lora_alpha=16,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=0.0,
            bias="none",
            task_type="CAUSAL_LM",
        ),
    )

    with torch.no_grad():
        for param_name, weight in lora_model.named_parameters():
            if "lora_B" not in param_name:
                continue
            # Small noise: large enough that base.generate differs from
            # ft.generate, small enough to keep generations finite and
            # sensible.
            weight.copy_(torch.randn_like(weight) * 0.01)

    lora_model.save_pretrained(save_path)
    tok.save_pretrained(save_path)
51
52
@pytest.fixture(scope="module")
def random_adapter(tiny_model_dir: Path, tmp_path_factory: pytest.TempPathFactory) -> Path:
    """Build the random LoRA adapter once per module and return its directory."""
    adapter_dir = tmp_path_factory.mktemp("tool-use-fidelity-adapter")
    _build_random_lora_adapter(base_dir=tiny_model_dir, out_dir=adapter_dir)
    return adapter_dir
58
59
def test_probe_runs_end_to_end_on_real_adapter(tiny_model_dir: Path, random_adapter: Path) -> None:
    """Smoke-run ``tool_use_fidelity`` through the HF backend with a real adapter.

    SmolLM2-135M can't reliably emit OpenAI-shape calls, so the probe will
    likely FAIL the validity floor. The pass/fail outcome is therefore not
    asserted; only that a structured verdict came back, every evidence key
    is present, and the rate metrics sit inside their expected ranges.
    """
    from dlm_sway.backends.hf import HuggingFaceDifferentialBackend
    from dlm_sway.core.model import ModelSpec
    from dlm_sway.core.result import Verdict
    from dlm_sway.probes.base import RunContext, build_probe

    # One tool-use case: a prompt, the tool schema, and the expected tool.
    search_tool_spec = {
        "name": "search_web",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {"type": "string"},
                "max_results": {"type": "integer"},
            },
            "required": ["query"],
        },
    }
    smoke_case = {
        "prompt": (
            "You are a tool-using assistant. The user asks: "
            "search the web for cats. Reply with ONLY a JSON "
            'object of the form {"name": ..., "arguments": {...}}.'
        ),
        "tool_spec": search_tool_spec,
        "gold_tool_name": "search_web",
        "max_new_tokens": 64,
    }
    probe_config = {
        "name": "tuf_smoke",
        "kind": "tool_use_fidelity",
        "cases": [smoke_case],
        "allowed_tools": ["search_web"],
    }

    base_spec = ModelSpec(base=str(tiny_model_dir), kind="hf", dtype="fp32", device="cpu")
    backend = HuggingFaceDifferentialBackend(base_spec=base_spec, adapter_path=random_adapter)
    try:
        probe, spec = build_probe(probe_config)
        run_ctx = RunContext(backend=backend, seed=0)
        result = probe.run(spec, run_ctx)
    finally:
        # Close the backend even when building or running the probe raises.
        backend.close()

    # End-to-end shape: a verdict came back, evidence carries every
    # documented key, and the rates are within their bounds.
    assert result.verdict in {Verdict.PASS, Verdict.FAIL, Verdict.WARN}, result.message
    evidence = result.evidence
    assert evidence["num_cases"] == 1

    documented_keys = (
        "json_valid_rate_base",
        "json_valid_rate_ft",
        "validity_delta",
        "mean_arg_disagreement",
        "hallucination_rate",
    )
    for required_key in documented_keys:
        assert required_key in evidence, f"missing evidence key {required_key}"

    # (metric, lower bound); every metric shares the upper bound 1.0.
    lower_bounds = (
        ("json_valid_rate_ft", 0.0),
        ("json_valid_rate_base", 0.0),
        ("validity_delta", -1.0),
        ("hallucination_rate", 0.0),
    )
    for metric, lower in lower_bounds:
        assert lower <= evidence[metric] <= 1.0