tests/golden: scaffold — 2-probe spec (delta_kl + calibration_drift) for S18 golden
- SHA: 9fe7f3258ae54fb097e1ba5931bf74a30df18860 (9fe7f32)
- Parents: 50b26bf
- Tree: 750940c

| Status | File | + | - |
|---|---|---|---|
| A | tests/golden/__init__.py | 9 | 0 |
| A | tests/golden/spec.yaml | 46 | 0 |
tests/golden/__init__.py added @@ -0,0 +1,9 @@ | |||
| 1 | +"""Cross-platform determinism golden fixtures (S18). | ||
| 2 | + | ||
| 3 | +Contains the minimal spec + platform-pinned ``expected_<platform>.json`` | ||
| 4 | +files that ``tests/integration/test_determinism_golden.py`` diffs against. | ||
| 5 | + | ||
| 6 | +Regeneration: set ``SWAY_UPDATE_GOLDENS=1`` when running the golden | ||
| 7 | +test, or dispatch the ``determinism-golden`` CI workflow with | ||
| 8 | +``regenerate_goldens=true`` and commit the resulting artifact. | ||
| 9 | +""" | ||
tests/golden/spec.yaml added @@ -0,0 +1,46 @@ | |||
| 1 | +# Determinism golden spec (S18). | ||
| 2 | +# | ||
| 3 | +# Pins a minimal 2-probe suite the CI runs on both ubuntu-latest and | ||
| 4 | +# macos-latest-xlarge to catch silent algorithm drift. Intentionally | ||
| 5 | +# small: one delta_kl probe + one calibration_drift probe, 4 prompts | ||
| 6 | +# each, no null_adapter (null adds runtime without improving the | ||
| 7 | +# drift-detection signal — the golden already encodes the exact | ||
| 8 | +# raw/score values). | ||
| 9 | +# | ||
| 10 | +# ``models.base.base`` and ``models.ft.adapter`` are placeholders — | ||
| 11 | +# the golden test's fixture substitutes paths at runtime so the | ||
| 12 | +# checked-in spec stays portable. | ||
| 13 | + | ||
| 14 | +version: 1 | ||
| 15 | +models: | ||
| 16 | + base: | ||
| 17 | + base: placeholder-base | ||
| 18 | + kind: hf | ||
| 19 | + adapter: placeholder-base-adapter | ||
| 20 | + dtype: fp32 | ||
| 21 | + device: cpu | ||
| 22 | + ft: | ||
| 23 | + base: placeholder-base | ||
| 24 | + kind: hf | ||
| 25 | + adapter: placeholder-ft-adapter | ||
| 26 | + dtype: fp32 | ||
| 27 | + device: cpu | ||
| 28 | +defaults: | ||
| 29 | + seed: 0 | ||
| 30 | + differential: true | ||
| 31 | +suite: | ||
| 32 | + - name: dk_golden | ||
| 33 | + kind: delta_kl | ||
| 34 | + prompts: | ||
| 35 | + - The capital of France is | ||
| 36 | + - Water boils at a temperature of | ||
| 37 | + - The sun rises in the | ||
| 38 | + - Python decorators are useful for | ||
| 39 | + divergence: js | ||
| 40 | + assert_mean_gte: 0.0 | ||
| 41 | + - name: cal_golden | ||
| 42 | + kind: calibration_drift | ||
| 43 | + items_limit: 20 | ||
| 44 | + regression_nats: 1.0 | ||
| 45 | + assert_mean_delta_gte: -0.5 | ||
| 46 | + assert_fraction_regressed_lt: 0.15 | ||