tenseleyflow/sway / 9fe7f32

Browse files

tests/golden: scaffold — 2-probe spec (delta_kl + calibration_drift) for S18 golden

Authored by espadonne
SHA
9fe7f3258ae54fb097e1ba5931bf74a30df18860
Parents
50b26bf
Tree
750940c

2 changed files

Status | File | + | -
A tests/golden/__init__.py 9 0
A tests/golden/spec.yaml 46 0
tests/golden/__init__.py (added)
@@ -0,0 +1,9 @@
1
+"""Cross-platform determinism golden fixtures (S18).
2
+
3
+Contains the minimal spec + platform-pinned ``expected_<platform>.json``
4
+files that ``tests/integration/test_determinism_golden.py`` diffs against.
5
+
6
+Regeneration: set ``SWAY_UPDATE_GOLDENS=1`` when running the golden
7
+test, or dispatch the ``determinism-golden`` CI workflow with
8
+``regenerate_goldens=true`` and commit the resulting artifact.
9
+"""
tests/golden/spec.yaml (added)
@@ -0,0 +1,46 @@
1
+# Determinism golden spec (S18).
2
+#
3
+# Pins a minimal 2-probe suite the CI runs on both ubuntu-latest and
4
+# macos-latest-xlarge to catch silent algorithm drift. Intentionally
5
+# small: one delta_kl probe (4 prompts) + one calibration_drift probe
6
+# (items_limit: 20), no null_adapter (null adds runtime without improving the
7
+# drift-detection signal — the golden already encodes the exact
8
+# raw/score values).
9
+#
10
+# The ``base`` and ``adapter`` values under ``models`` are placeholders —
11
+# the golden test's fixture substitutes paths at runtime so the
12
+# checked-in spec stays portable.
13
+
14
+version: 1
15
+models:
16
+  base:
17
+    base: placeholder-base
18
+    kind: hf
19
+    adapter: placeholder-base-adapter
20
+    dtype: fp32
21
+    device: cpu
22
+  ft:
23
+    base: placeholder-base
24
+    kind: hf
25
+    adapter: placeholder-ft-adapter
26
+    dtype: fp32
27
+    device: cpu
28
+defaults:
29
+  seed: 0
30
+  differential: true
31
+suite:
32
+  - name: dk_golden
33
+    kind: delta_kl
34
+    prompts:
35
+      - The capital of France is
36
+      - Water boils at a temperature of
37
+      - The sun rises in the
38
+      - Python decorators are useful for
39
+    divergence: js
40
+    assert_mean_gte: 0.0
41
+  - name: cal_golden
42
+    kind: calibration_drift
43
+    items_limit: 20
44
+    regression_nats: 1.0
45
+    assert_mean_delta_gte: -0.5
46
+    assert_fraction_regressed_lt: 0.15