tenseleyflow/sway / 88bbdf6

Browse files

tests/golden_comparator: update tolerance boundaries for 1e-4 default

Authored by espadonne
SHA
88bbdf623c0f196c6a9aac10010e12b3e3cca5a7
Parents
c6463ce
Tree
8a2cd73

1 changed file

Status File + -
M tests/unit/test_golden_comparator.py 16 14
tests/unit/test_golden_comparator.py modified
@@ -90,21 +90,20 @@ class TestCompareGoldensIdentical:
9090
 class TestCompareGoldensTolerance:
9191
     def test_floats_within_logprob_tol_pass(self) -> None:
9292
         actual = {"probes": [{"raw": 0.12345}]}
93
-        expected = {"probes": [{"raw": 0.12345 + 5e-7}]}  # well under 1e-6
93
+        expected = {"probes": [{"raw": 0.12345 + 5e-5}]}  # well under 1e-4
9494
         assert compare_goldens(actual, expected) == []
9595
 
9696
     def test_floats_just_above_logprob_tol_fail(self) -> None:
9797
         actual = {"probes": [{"raw": 0.12345}]}
98
-        expected = {"probes": [{"raw": 0.12345 + 2e-6}]}  # double the tol
98
+        expected = {"probes": [{"raw": 0.12345 + 2e-4}]}  # double the tol
9999
         diffs = compare_goldens(actual, expected)
100100
         assert len(diffs) == 1
101101
         assert "raw" in diffs[0].path
102102
         assert "Δ" in diffs[0].reason
103103
 
104
-    def test_scores_use_looser_tol(self) -> None:
105
-        """Score fields get ``score_tol`` (1e-4), not ``logprob_tol``.
106
-        A 5e-5 drift on a score field passes; the same drift on a
107
-        non-score field would fail at default logprob tol."""
104
+    def test_scores_match_logprob_tol_default(self) -> None:
105
+        """Score fields use ``score_tol`` (1e-4) — same as ``logprob_tol``
106
+        after S18's first-week tuning. A 5e-5 drift passes on both."""
108107
         actual = {"overall": 0.85}
109108
         expected = {"overall": 0.85 + 5e-5}
110109
         assert compare_goldens(actual, expected) == []
@@ -120,10 +119,13 @@ class TestCompareGoldensTolerance:
120119
         """Callers can tighten or loosen both tolerances."""
121120
         actual = {"probes": [{"raw": 0.1}]}
122121
         expected = {"probes": [{"raw": 0.1 + 5e-4}]}
123
-        # Default tol (1e-6) → fail.
122
+        # Default tol (1e-4) → fail.
124123
         assert compare_goldens(actual, expected) != []
125124
         # Loosened to 1e-3 → pass.
126125
         assert compare_goldens(actual, expected, logprob_tol=1e-3) == []
126
+        # Tightened to 1e-6 → same fail, but also a regression guard
127
+        # if we ever tighten the default back.
128
+        assert compare_goldens(actual, expected, logprob_tol=1e-6) != []
127129
 
128130
     def test_nan_vs_nan_treated_equal(self) -> None:
129131
         actual = {"z_score": float("nan")}
@@ -226,7 +228,7 @@ class TestRealisticPayload:
226228
             "probes": [
227229
                 {
228230
                     "name": "dk",
229
-                    "raw": 0.4561 + 5e-7,  # within logprob_tol
231
+                    "raw": 0.4561 + 5e-5,  # within logprob_tol (1e-4)
230232
                     "score": 0.87,
231233
                     "duration_s": 0.789,  # different duration
232234
                 },
@@ -237,15 +239,15 @@ class TestRealisticPayload:
237239
         assert compare_goldens(masked_actual, masked_expected) == []
238240
 
239241
     def test_simulated_silent_algorithm_change_is_caught(self) -> None:
240
-        """Prove-the-value sanity: a 1e-3 drift on a probe's raw is
241
-        flagged, even when every variable field differs."""
242
+        """Prove-the-value sanity: a 1e-2 drift on a probe's raw is
243
+        flagged — well above the 1e-4 default tolerance. Real
244
+        algorithm changes (e.g. flipping ``top_k=256`` → 128) shift
245
+        raws by this order of magnitude."""
242246
         expected = {"probes": [{"raw": 0.4561}]}
243
-        # Simulate an algorithm change: someone edited delta_kl's
244
-        # top_k default and raw shifted by 1e-3.
245
-        actual = {"probes": [{"raw": 0.4571}]}
247
+        actual = {"probes": [{"raw": 0.4561 + 1e-2}]}
246248
         diffs = compare_goldens(actual, expected)
247249
         assert len(diffs) == 1
248250
         assert "raw" in diffs[0].path
249251
         assert math.isclose(
250
-            abs(float(diffs[0].actual) - float(diffs[0].expected)), 1e-3, abs_tol=1e-9
252
+            abs(float(diffs[0].actual) - float(diffs[0].expected)), 1e-2, abs_tol=1e-9
251253
         )