1 """Smoke test for the two-model differential wrapper.
2
3 Covers the ``defaults.differential: false`` code path: the runner
4 routes ``as_base()`` through one independent backend and
5 ``as_finetuned()`` through another. Proper integration in S04.
6 """

from __future__ import annotations

import numpy as np

from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
from dlm_sway.backends.two_model import TwoModelDifferential
from dlm_sway.core.result import Verdict
from dlm_sway.core.scoring import TokenDist
from dlm_sway.suite.runner import run as run_suite
from dlm_sway.suite.spec import SwaySpec


def _backend_with(token: str, prob: float) -> DummyDifferentialBackend:
    """A dummy backend whose next-token dist puts ``prob`` mass on the first
    token; ``token`` is only a human-readable label here."""
    other = (1.0 - prob) / 2.0
    dist = TokenDist(
        token_ids=np.array([1, 2, 3], dtype=np.int64),
        logprobs=np.log(np.asarray([prob, other, other], dtype=np.float32)),
        vocab_size=100,
    )
    responses = DummyResponses(token_dists={"q1": dist, "q2": dist})
    return DummyDifferentialBackend(base=responses, ft=responses)


class TestRoutesBaseAndFTToTwoBackends:
    def test_two_backends_produce_distinct_dists(self) -> None:
        """Each side of the wrapper yields its own independent dist."""
        base_backend = _backend_with("a", 0.9)
        ft_backend = _backend_with("a", 0.1)
        wrapper = TwoModelDifferential(base=base_backend, ft=ft_backend)

        with wrapper.as_base() as v:
            base_dist = v.next_token_dist("q1")
        with wrapper.as_finetuned() as v:
            ft_dist = v.next_token_dist("q1")

        # The base backend puts 0.9 on the first token while the ft backend
        # puts only 0.1, so the base first-token logprob must be the larger.
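        # Concretely: log(0.9) ≈ -0.105 vs log(0.1) ≈ -2.303.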
        assert base_dist.logprobs[0] > ft_dist.logprobs[0], (
            f"expected base[0] > ft[0]; got base={base_dist.logprobs[0]}, ft={ft_dist.logprobs[0]}"
        )

    def test_runner_routes_through_wrapper_end_to_end(self) -> None:
        """A full suite run picks up divergence between the two backends."""
        base_backend = _backend_with("a", 0.9)
        ft_backend = _backend_with("a", 0.1)
        wrapper = TwoModelDifferential(base=base_backend, ft=ft_backend)

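        # Minimal spec: a single delta_kl probe over the two prompts the dummy
        # responses define. assert_mean_gte is presumably the pass threshold on
        # the mean delta-KL; 0.0 keeps the verdict from failing spuriously.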
        spec = SwaySpec.model_validate(
            {
                "version": 1,
                "models": {
                    "base": {"base": "b"},
                    "ft": {"base": "b", "adapter": "/tmp/a"},
                },
                "defaults": {"differential": False},
                "suite": [
                    {
                        "name": "dk",
                        "kind": "delta_kl",
                        "prompts": ["q1", "q2"],
                        "assert_mean_gte": 0.0,
                    }
                ],
            }
        )
        result = run_suite(spec, wrapper)
        assert len(result.probes) == 1
        dk = result.probes[0]
        assert dk.verdict in (Verdict.PASS, Verdict.FAIL)
        # The two backends genuinely diverge, so the raw score must be > 0.
        assert dk.raw is not None
        assert dk.raw > 0.0


class TestPreflightPassthrough:
    def test_preflight_delegates_to_ft_backend(self) -> None:
        base = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
        ft = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
        wrapper = TwoModelDifferential(base=base, ft=ft)
        ok, _ = wrapper.preflight_finite_check()
        assert ok is True


class TestConcurrencyFlagComposition:
    """F06 regression: the wrapper's ``safe_for_concurrent_views`` is the
    AND of the two inner backends' flags, defaulting to ``False`` when
    either flag is absent. Before F06 the attribute was missing entirely,
    so the runner fell back to ``False`` even when both inners set ``True``.
    """

    def test_missing_on_both_defaults_false(self) -> None:
        base = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
        ft = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
        wrapper = TwoModelDifferential(base=base, ft=ft)
        # Dummy has safe_for_concurrent_views=False by class default.
        assert wrapper.safe_for_concurrent_views is False

    def test_both_true_composes_true(self) -> None:
        class SafeDummy(DummyDifferentialBackend):
            safe_for_concurrent_views = True

        base = SafeDummy(base=DummyResponses(), ft=DummyResponses())
        ft = SafeDummy(base=DummyResponses(), ft=DummyResponses())
        wrapper = TwoModelDifferential(base=base, ft=ft)
        assert wrapper.safe_for_concurrent_views is True

    def test_one_true_one_false_composes_false(self) -> None:
        class SafeDummy(DummyDifferentialBackend):
            safe_for_concurrent_views = True

        base = SafeDummy(base=DummyResponses(), ft=DummyResponses())
        ft = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
        wrapper = TwoModelDifferential(base=base, ft=ft)
        assert wrapper.safe_for_concurrent_views is False


class TestSpecAcceptsDifferentialFalse:
    def test_loader_accepts_false_then_uses_two_separate(self, tmp_path) -> None:
        """The CLI path: ``spec.defaults.differential=False`` routes through
        ``build_two_separate``. We don't exercise the actual HF backend here;
        the unit smoke is that the spec parses, and the end-to-end run on the
        dummy-backed wrapper is covered by the tests above."""
        spec = SwaySpec.model_validate(
            {
                "version": 1,
                "models": {
                    "base": {"base": "b"},
                    "ft": {"base": "b", "adapter": "/tmp/a"},
                },
                "defaults": {"differential": False},
                "suite": [],
            }
        )
        assert spec.defaults.differential is False