1 """Smoke test for the two-model differential wrapper.
2
3 Covers the ``defaults.differential: false`` code path: the runner
4 routes ``as_base()`` through one independent backend and
5 ``as_finetuned()`` through another. Proper integration in S04.
6 """

from __future__ import annotations

import numpy as np

from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
from dlm_sway.backends.two_model import TwoModelDifferential
from dlm_sway.core.result import Verdict
from dlm_sway.core.scoring import TokenDist
from dlm_sway.suite.runner import run as run_suite
from dlm_sway.suite.spec import SwaySpec


def _backend_with(token: str, prob: float) -> DummyDifferentialBackend:
    """A dummy backend whose next-token dist puts ``prob`` mass on the first
    token; ``token`` is only a human-readable label here."""
    other = (1.0 - prob) / 2.0
    dist = TokenDist(
        token_ids=np.array([1, 2, 3], dtype=np.int64),
        logprobs=np.log(np.asarray([prob, other, other], dtype=np.float32)),
        vocab_size=100,
    )
    responses = DummyResponses(token_dists={"q1": dist, "q2": dist})
    return DummyDifferentialBackend(base=responses, ft=responses)


class TestRoutesBaseAndFTToTwoBackends:
    def test_two_backends_produce_distinct_dists(self) -> None:
        """Each side of the wrapper yields its own independent dist."""
        base_backend = _backend_with("a", 0.9)
        ft_backend = _backend_with("a", 0.1)
        wrapper = TwoModelDifferential(base=base_backend, ft=ft_backend)

        with wrapper.as_base() as v:
            base_dist = v.next_token_dist("q1")
        with wrapper.as_finetuned() as v:
            ft_dist = v.next_token_dist("q1")

        # The base backend puts 0.9 on the first token while the ft backend
        # puts only 0.1, so the base first-token logprob must be the larger.
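        # Concretely: log(0.9) ≈ -0.105 vs log(0.1) ≈ -2.303.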
        assert base_dist.logprobs[0] > ft_dist.logprobs[0], (
            f"expected base[0] > ft[0]; got base={base_dist.logprobs[0]}, ft={ft_dist.logprobs[0]}"
        )

    def test_runner_routes_through_wrapper_end_to_end(self) -> None:
        """A full suite run picks up divergence between the two backends."""
        base_backend = _backend_with("a", 0.9)
        ft_backend = _backend_with("a", 0.1)
        wrapper = TwoModelDifferential(base=base_backend, ft=ft_backend)

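        # Minimal spec: a single delta_kl probe over the two prompts the dummy
        # responses define. assert_mean_gte is presumably the pass threshold on
        # the mean delta-KL; 0.0 keeps the verdict from failing spuriously.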
        spec = SwaySpec.model_validate(
            {
                "version": 1,
                "models": {
                    "base": {"base": "b"},
                    "ft": {"base": "b", "adapter": "/tmp/a"},
                },
                "defaults": {"differential": False},
                "suite": [
                    {
                        "name": "dk",
                        "kind": "delta_kl",
                        "prompts": ["q1", "q2"],
                        "assert_mean_gte": 0.0,
                    }
                ],
            }
        )
        result = run_suite(spec, wrapper)
        assert len(result.probes) == 1
        dk = result.probes[0]
        assert dk.verdict in (Verdict.PASS, Verdict.FAIL)
        # The two backends genuinely diverge, so the raw score must be > 0.
        assert dk.raw is not None
        assert dk.raw > 0.0


class TestPreflightPassthrough:
    def test_preflight_delegates_to_ft_backend(self) -> None:
        base = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
        ft = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
        wrapper = TwoModelDifferential(base=base, ft=ft)
        ok, _ = wrapper.preflight_finite_check()
        assert ok is True


class TestConcurrencyFlagComposition:
    """F06 regression: the wrapper's ``safe_for_concurrent_views`` is the
    AND of the two inner backends' flags, defaulting to ``False`` when
    either flag is absent. Before F06 the attribute was missing entirely,
    so the runner fell back to ``False`` even when both inners set ``True``.
    """

    def test_missing_on_both_defaults_false(self) -> None:
        base = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
        ft = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
        wrapper = TwoModelDifferential(base=base, ft=ft)
        # Dummy has safe_for_concurrent_views=False by class default.
        assert wrapper.safe_for_concurrent_views is False

    def test_both_true_composes_true(self) -> None:
        class SafeDummy(DummyDifferentialBackend):
            safe_for_concurrent_views = True

        base = SafeDummy(base=DummyResponses(), ft=DummyResponses())
        ft = SafeDummy(base=DummyResponses(), ft=DummyResponses())
        wrapper = TwoModelDifferential(base=base, ft=ft)
        assert wrapper.safe_for_concurrent_views is True

    def test_one_true_one_false_composes_false(self) -> None:
        class SafeDummy(DummyDifferentialBackend):
            safe_for_concurrent_views = True

        base = SafeDummy(base=DummyResponses(), ft=DummyResponses())
        ft = DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())
        wrapper = TwoModelDifferential(base=base, ft=ft)
        assert wrapper.safe_for_concurrent_views is False


class TestSpecAcceptsDifferentialFalse:
    def test_loader_accepts_false_then_uses_two_separate(self, tmp_path) -> None:
        """The CLI path: ``spec.defaults.differential=False`` routes through
        ``build_two_separate``. We don't exercise the actual HF backend here;
        the unit smoke is that the spec parses, and the end-to-end run on the
        dummy-backed wrapper is covered by the tests above."""
        spec = SwaySpec.model_validate(
            {
                "version": 1,
                "models": {
                    "base": {"base": "b"},
                    "ft": {"base": "b", "adapter": "/tmp/a"},
                },
                "defaults": {"differential": False},
                "suite": [],
            }
        )
        assert spec.defaults.differential is False