Python · 6559 bytes Raw Blame History
1 """Tests for :func:`dlm_sway.core.result.safe_finalize`.
2
3 This helper is the shared guardrail S01 installs against NaN-flows-through
4 bugs. It must:
5
6 - Route critical non-finite fields to :attr:`Verdict.ERROR` with score nulled
7 - Defensively null non-critical non-finite fields without changing the verdict
8 - Leave all-finite inputs untouched
9 - Preserve the original non-finite values in evidence for postmortem
10 """
11
12 from __future__ import annotations
13
14 import math
15
16 from dlm_sway.core.result import ProbeResult, Verdict, safe_finalize
17
18
class TestAllFinite:
    """Finite inputs must come back from ``safe_finalize`` exactly as given."""

    def test_passthrough_preserves_all_fields(self) -> None:
        """Every explicitly supplied field is readable, unmodified, on the result."""
        result = safe_finalize(
            name="p1",
            kind="delta_kl",
            verdict=Verdict.PASS,
            score=0.75,
            raw=0.08,
            z_score=3.2,
            base_value=0.0,
            ft_value=0.08,
            evidence={"num_prompts": 4},
            message="looks fine",
            duration_s=1.2,
        )

        # Non-numeric bookkeeping is carried over verbatim.
        assert result.verdict == Verdict.PASS
        assert result.message == "looks fine"
        assert result.duration_s == 1.2
        assert result.evidence == {"num_prompts": 4}

        # Numeric fields are all finite, so none of them may be altered.
        assert result.score == 0.75
        assert result.raw == 0.08
        assert result.z_score == 3.2
        assert result.base_value == 0.0
        assert result.ft_value == 0.08

    def test_defaults(self) -> None:
        """Optional fields that are omitted fall back to their empty defaults."""
        result = safe_finalize(name="p", kind="k", verdict=Verdict.PASS, score=1.0)

        assert result.raw is None
        assert result.z_score is None
        assert result.evidence == {}
        assert result.duration_s == 0.0
50
51
class TestCriticalNonFinite:
    """A non-finite ``raw`` (critical by default) must turn the result into an ERROR."""

    def test_nan_raw_routes_to_error(self) -> None:
        """NaN ``raw`` yields ERROR, nulls the numeric fields and is kept in evidence."""
        result = safe_finalize(
            name="p",
            kind="delta_kl",
            verdict=Verdict.PASS,
            score=1.0,
            raw=math.nan,
            z_score=3.0,
        )

        assert result.verdict == Verdict.ERROR

        # The numeric fields are cleared, including the finite z_score.
        assert result.score is None
        assert result.raw is None
        assert result.z_score is None

        # The message names both the failure class and the offending field.
        assert "non-finite critical" in result.message
        assert "raw" in result.message

        # The original NaN is still available for postmortem inspection.
        captured = result.evidence["non_finite_inputs"]
        assert "raw" in captured
        assert math.isnan(captured["raw"])

    def test_inf_raw_routes_to_error(self) -> None:
        """Positive infinity in ``raw`` is handled the same way and recorded as-is."""
        result = safe_finalize(
            name="p", kind="delta_kl", verdict=Verdict.PASS, score=1.0, raw=math.inf
        )

        assert result.verdict == Verdict.ERROR
        assert result.evidence["non_finite_inputs"]["raw"] == math.inf

    def test_negative_inf_raw_routes_to_error(self) -> None:
        """Negative infinity in ``raw`` also produces an ERROR verdict."""
        result = safe_finalize(
            name="p", kind="delta_kl", verdict=Verdict.PASS, score=1.0, raw=-math.inf
        )

        assert result.verdict == Verdict.ERROR

    def test_error_capture_includes_all_non_finite_fields(self) -> None:
        """Non-critical non-finite fields are recorded alongside the critical one."""
        result = safe_finalize(
            name="p",
            kind="delta_kl",
            verdict=Verdict.PASS,
            score=1.0,
            raw=math.nan,
            z_score=math.inf,
            base_value=math.nan,
        )

        assert result.verdict == Verdict.ERROR
        recorded_fields = set(result.evidence["non_finite_inputs"])
        assert recorded_fields == {"raw", "z_score", "base_value"}

    def test_error_preserves_caller_evidence_keys(self) -> None:
        """Caller-supplied evidence survives next to the added ``non_finite_inputs``."""
        result = safe_finalize(
            name="p",
            kind="delta_kl",
            verdict=Verdict.PASS,
            score=1.0,
            raw=math.nan,
            evidence={"per_prompt": [1, 2, 3], "num_prompts": 3},
        )

        assert result.verdict == Verdict.ERROR

        evidence = result.evidence
        assert evidence["per_prompt"] == [1, 2, 3]
        assert evidence["num_prompts"] == 3
        assert "non_finite_inputs" in evidence
120
121
class TestNonCriticalNonFinite:
    """Non-finite values in non-critical fields are nulled; the verdict stands."""

    def test_nan_z_score_is_nulled_silently(self) -> None:
        """A NaN ``z_score`` becomes None while verdict, score and raw are kept."""
        result = safe_finalize(
            name="p",
            kind="delta_kl",
            verdict=Verdict.PASS,
            score=0.7,
            raw=0.05,
            z_score=math.nan,
        )

        # Only the offending field is touched, and the nulling is recorded.
        assert result.z_score is None
        assert "z_score" in result.evidence["defensively_nulled"]

        # The finite fields and the caller's verdict are left alone.
        assert result.verdict == Verdict.PASS
        assert result.score == 0.7
        assert result.raw == 0.05

    def test_nan_base_and_ft_nulled_preserves_passing_score(self) -> None:
        """NaN ``base_value`` and infinite ``ft_value`` are both nulled without an ERROR."""
        result = safe_finalize(
            name="p",
            kind="delta_kl",
            verdict=Verdict.PASS,
            score=0.9,
            raw=0.1,
            base_value=math.nan,
            ft_value=math.inf,
        )

        assert result.verdict == Verdict.PASS
        assert result.base_value is None
        assert result.ft_value is None

        nulled_fields = sorted(result.evidence["defensively_nulled"])
        assert nulled_fields == ["base_value", "ft_value"]
152
153
class TestCriticalFieldsOverride:
    """The ``critical_fields`` argument decides which fields can force an ERROR."""

    def test_z_score_critical_triggers_error_on_nan(self) -> None:
        """Listing ``z_score`` as critical makes a NaN ``z_score`` an ERROR."""
        result = safe_finalize(
            name="p",
            kind="adapter_ablation",
            verdict=Verdict.PASS,
            score=1.0,
            raw=0.9,
            z_score=math.nan,
            critical_fields=("raw", "z_score"),
        )

        assert result.verdict == Verdict.ERROR
        assert "z_score" in result.message

    def test_critical_fields_empty_allows_all_through(self) -> None:
        """With no critical fields, a NaN ``raw`` is only defensively nulled."""
        result = safe_finalize(
            name="p",
            kind="delta_kl",
            verdict=Verdict.PASS,
            score=1.0,
            raw=math.nan,
            critical_fields=(),
        )

        assert result.verdict == Verdict.PASS
        assert result.raw is None
        assert "raw" in result.evidence["defensively_nulled"]
181
182
class TestBoolFieldsNotMistakenForFloat:
    """Booleans in a numeric slot must not be flagged as non-finite.

    Pydantic sometimes wraps bools as ints, and ``isinstance(True, int)`` is
    True, so a boolean can end up where a number is expected. It should not
    be treated as a failed numeric check.
    """

    def test_true_in_a_numeric_slot_is_not_non_finite(self) -> None:
        """Passing ``True`` as ``raw`` neither crashes nor changes the verdict."""
        # Pins behavior: a caller passing True must not cause a crash, and
        # True must not be classified as a non-finite value.
        result = safe_finalize(
            name="p",
            kind="test",
            verdict=Verdict.PASS,
            score=1.0,
            raw=True,  # type: ignore[arg-type]
        )

        assert result.verdict == Verdict.PASS  # bool is finite
199
200
class TestResultTypeReturned:
    """``safe_finalize`` hands back a ``ProbeResult`` instance."""

    def test_returns_probe_result(self) -> None:
        """A minimal call produces an object of type ``ProbeResult``."""
        result = safe_finalize(name="p", kind="k", verdict=Verdict.PASS, score=1.0)

        assert isinstance(result, ProbeResult)