@@ -90,21 +90,20 @@ class TestCompareGoldensIdentical: |
| 90 | 90 | class TestCompareGoldensTolerance: |
| 91 | 91 | def test_floats_within_logprob_tol_pass(self) -> None: |
| 92 | 92 | actual = {"probes": [{"raw": 0.12345}]} |
| 93 | | - expected = {"probes": [{"raw": 0.12345 + 5e-7}]} # well under 1e-6 |
| 93 | + expected = {"probes": [{"raw": 0.12345 + 5e-5}]} # half the 1e-4 tol |
| 94 | 94 | assert compare_goldens(actual, expected) == [] |
| 95 | 95 | |
| 96 | 96 | def test_floats_just_above_logprob_tol_fail(self) -> None: |
| 97 | 97 | actual = {"probes": [{"raw": 0.12345}]} |
| 98 | | - expected = {"probes": [{"raw": 0.12345 + 2e-6}]} # double the tol |
| 98 | + expected = {"probes": [{"raw": 0.12345 + 2e-4}]} # double the tol |
| 99 | 99 | diffs = compare_goldens(actual, expected) |
| 100 | 100 | assert len(diffs) == 1 |
| 101 | 101 | assert "raw" in diffs[0].path |
| 102 | 102 | assert "Δ" in diffs[0].reason |
| 103 | 103 | |
| 104 | | - def test_scores_use_looser_tol(self) -> None: |
| 105 | | - """Score fields get ``score_tol`` (1e-4), not ``logprob_tol``. |
| 106 | | - A 5e-5 drift on a score field passes; the same drift on a |
| 107 | | - non-score field would fail at default logprob tol.""" |
| 104 | + def test_scores_match_logprob_tol_default(self) -> None: |
| 105 | + """Score fields use ``score_tol`` (1e-4) — same as ``logprob_tol`` |
| 106 | + after S18's first-week tuning. A 5e-5 drift passes on both.""" |
| 108 | 107 | actual = {"overall": 0.85} |
| 109 | 108 | expected = {"overall": 0.85 + 5e-5} |
| 110 | 109 | assert compare_goldens(actual, expected) == [] |
@@ -120,10 +119,13 @@ class TestCompareGoldensTolerance: |
| 120 | 119 | """Callers can tighten or loosen both tolerances.""" |
| 121 | 120 | actual = {"probes": [{"raw": 0.1}]} |
| 122 | 121 | expected = {"probes": [{"raw": 0.1 + 5e-4}]} |
| 123 | | - # Default tol (1e-6) → fail. |
| 122 | + # Default tol (1e-4) → fail. |
| 124 | 123 | assert compare_goldens(actual, expected) != [] |
| 125 | 124 | # Loosened to 1e-3 → pass. |
| 126 | 125 | assert compare_goldens(actual, expected, logprob_tol=1e-3) == [] |
| 126 | + # Tightened to 1e-6 → still fails; doubles as a regression guard |
| 127 | + # if we ever tighten the default back. |
| 128 | + assert compare_goldens(actual, expected, logprob_tol=1e-6) != [] |
| 127 | 129 | |
| 128 | 130 | def test_nan_vs_nan_treated_equal(self) -> None: |
| 129 | 131 | actual = {"z_score": float("nan")} |
@@ -226,7 +228,7 @@ class TestRealisticPayload: |
| 226 | 228 | "probes": [ |
| 227 | 229 | { |
| 228 | 230 | "name": "dk", |
| 229 | | - "raw": 0.4561 + 5e-7, # within logprob_tol |
| 231 | + "raw": 0.4561 + 5e-5, # within logprob_tol (1e-4) |
| 230 | 232 | "score": 0.87, |
| 231 | 233 | "duration_s": 0.789, # different duration |
| 232 | 234 | }, |
@@ -237,15 +239,15 @@ class TestRealisticPayload: |
| 237 | 239 | assert compare_goldens(masked_actual, masked_expected) == [] |
| 238 | 240 | |
| 239 | 241 | def test_simulated_silent_algorithm_change_is_caught(self) -> None: |
| 240 | | - """Prove-the-value sanity: a 1e-3 drift on a probe's raw is |
| 241 | | - flagged, even when every variable field differs.""" |
| 242 | + """Prove-the-value sanity: a 1e-2 drift on a probe's raw is |
| 243 | + flagged — well above the 1e-4 default tolerance. Real |
| 244 | + algorithm changes (e.g. flipping ``top_k=256`` → 128) shift |
| 245 | + raws by this order of magnitude.""" |
| 242 | 246 | expected = {"probes": [{"raw": 0.4561}]} |
| 243 | | - # Simulate an algorithm change: someone edited delta_kl's |
| 244 | | - # top_k default and raw shifted by 1e-3. |
| 245 | | - actual = {"probes": [{"raw": 0.4571}]} |
| 247 | + actual = {"probes": [{"raw": 0.4561 + 1e-2}]} |
| 246 | 248 | diffs = compare_goldens(actual, expected) |
| 247 | 249 | assert len(diffs) == 1 |
| 248 | 250 | assert "raw" in diffs[0].path |
| 249 | 251 | assert math.isclose( |
| 250 | | - abs(float(diffs[0].actual) - float(diffs[0].expected)), 1e-3, abs_tol=1e-9 |
| 252 | + abs(float(diffs[0].actual) - float(diffs[0].expected)), 1e-2, abs_tol=1e-9 |
| 251 | 253 | ) |