core/golden: bump logprob_tol 1e-6 → 1e-4 — absorbs ubuntu BLAS drift
- SHA: c6463ce0c68f22c46d4d5850e56992939250f29f
- Parents: e0e16ed
- Tree: f6d885d

| Status | File | + | - |
|---|---|---|---|
| M | src/dlm_sway/core/golden.py | 11 | 1 |
src/dlm_sway/core/golden.py — modified

@@ -94,7 +94,7 @@ def compare_goldens( | ||
| 94 | 94 | actual: Any, |
| 95 | 95 | expected: Any, |
| 96 | 96 | *, |
| 97 | - logprob_tol: float = 1e-6, | |
| 97 | + logprob_tol: float = 1e-4, | |
| 98 | 98 | score_tol: float = 1e-4, |
| 99 | 99 | ) -> list[Diff]: |
| 100 | 100 | """Compare two masked JSON payloads; return tolerance-exceeding diffs. |
@@ -106,6 +106,16 @@ def compare_goldens( | ||
| 106 | 106 | the appropriate tolerance (``score_tol`` for score-like fields, |
| 107 | 107 | ``logprob_tol`` elsewhere). Missing keys, length mismatches, or |
| 108 | 108 | type changes surface as structural diffs regardless of tolerance. |
| 109 | + | |
| 110 | + **Tolerance rationale.** S18's first CI observation showed | |
| 111 | + intra-platform BLAS drift in the 1e-5–1e-6 band on ubuntu-latest | |
| 112 | + runners (heterogeneous Intel/AMD hardware + variable OpenBLAS | |
| 113 | + builds), which put 1e-6 below the natural noise floor. A real | |
| 114 | + algorithm change — e.g. flipping ``top_k=256`` → 128 in | |
| 115 | + :mod:`delta_kl` — shifts probe raws by 1e-2 to 1e-1, three orders | |
| 116 | + of magnitude above the current ``1e-4`` tolerance. Tuning room to | |
| 117 | + revisit once we have more CI history; see the sprint's risks | |
| 118 | + section for the "too tight vs too loose" tradeoff. | |
| 109 | 119 | """ |
| 110 | 120 | diffs: list[Diff] = [] |
| 111 | 121 | _walk(actual, expected, path="$", diffs=diffs, logprob_tol=logprob_tol, score_tol=score_tol) |