tests/golden: scaffold — 2-probe spec (delta_kl + calibration_drift) for S18 golden
- SHA: 9fe7f3258ae54fb097e1ba5931bf74a30df18860
- Parents: 50b26bf
- Tree: 750940c

| Status | File | + | - |
|---|---|---|---|
| A | tests/golden/__init__.py | 9 | 0 |
| A | tests/golden/spec.yaml | 46 | 0 |
tests/golden/__init__.py added @@ -0,0 +1,9 @@ | ||
| 1 | +"""Cross-platform determinism golden fixtures (S18). | |
| 2 | + | |
| 3 | +Contains the minimal spec + platform-pinned ``expected_<platform>.json`` | |
| 4 | +files that ``tests/integration/test_determinism_golden.py`` diffs against. | |
| 5 | + | |
| 6 | +Regeneration: set ``SWAY_UPDATE_GOLDENS=1`` when running the golden | |
| 7 | +test, or dispatch the ``determinism-golden`` CI workflow with | |
| 8 | +``regenerate_goldens=true`` and commit the resulting artifact. | |
| 9 | +""" | |
tests/golden/spec.yaml added @@ -0,0 +1,46 @@ | ||
| 1 | +# Determinism golden spec (S18). | |
| 2 | +# | |
| 3 | +# Pins a minimal 2-probe suite the CI runs on both ubuntu-latest and | |
| 4 | +# macos-latest-xlarge to catch silent algorithm drift. Intentionally | |
| 5 | +# small: one delta_kl probe (4 prompts) + one calibration_drift probe | |
| 6 | +# (items_limit: 20), no null_adapter (null adds runtime without improving the | |
| 7 | +# drift-detection signal — the golden already encodes the exact | |
| 8 | +# raw/score values). | |
| 9 | +# | |
| 10 | +# The ``base`` and ``adapter`` values under ``models`` are placeholders — | |
| 11 | +# the golden test's fixture substitutes paths at runtime so the | |
| 12 | +# checked-in spec stays portable. | |
| 13 | + | |
| 14 | +version: 1 | |
| 15 | +models: | |
| 16 | + base: | |
| 17 | + base: placeholder-base | |
| 18 | + kind: hf | |
| 19 | + adapter: placeholder-base-adapter | |
| 20 | + dtype: fp32 | |
| 21 | + device: cpu | |
| 22 | + ft: | |
| 23 | + base: placeholder-base | |
| 24 | + kind: hf | |
| 25 | + adapter: placeholder-ft-adapter | |
| 26 | + dtype: fp32 | |
| 27 | + device: cpu | |
| 28 | +defaults: | |
| 29 | + seed: 0 | |
| 30 | + differential: true | |
| 31 | +suite: | |
| 32 | + - name: dk_golden | |
| 33 | + kind: delta_kl | |
| 34 | + prompts: | |
| 35 | + - The capital of France is | |
| 36 | + - Water boils at a temperature of | |
| 37 | + - The sun rises in the | |
| 38 | + - Python decorators are useful for | |
| 39 | + divergence: js | |
| 40 | + assert_mean_gte: 0.0 | |
| 41 | + - name: cal_golden | |
| 42 | + kind: calibration_drift | |
| 43 | + items_limit: 20 | |
| 44 | + regression_nats: 1.0 | |
| 45 | + assert_mean_delta_gte: -0.5 | |
| 46 | + assert_fraction_regressed_lt: 0.15 | |