tenseleyflow/sway / ee4f548

Browse files

tests/golden: expected_darwin.json — locally-generated pin (S18.5)

Authored by espadonne
SHA
ee4f54843a142679c8871ea3c170bb54a42146c5
Parents
f230789
Tree
39561c8

1 changed file

StatusFile+-
A tests/golden/expected_darwin.json 96 0
tests/golden/expected_darwin.jsonadded
@@ -0,0 +1,96 @@
1
+{
2
+  "determinism": {
3
+    "class": "best_effort",
4
+    "notes": [
5
+      "MPS: bit-identical across runs is best-effort"
6
+    ],
7
+    "seed": 0
8
+  },
9
+  "null_stats": {},
10
+  "probes": [
11
+    {
12
+      "base_value": null,
13
+      "ci_95": [
14
+        0.00845183397355073,
15
+        0.02976923823248102
16
+      ],
17
+      "evidence": {
18
+        "divergence_kind": "js",
19
+        "max": 0.036150663063872254,
20
+        "num_prompts": 4,
21
+        "per_prompt": [
22
+          0.01062496373830732,
23
+          0.006711906030413639,
24
+          0.013671617802962006,
25
+          0.036150663063872254
26
+        ],
27
+        "raw_ci_95": [
28
+          0.00845183397355073,
29
+          0.02976923823248102
30
+        ],
31
+        "weight": 1.0,
32
+        "z_by_rank": null
33
+      },
34
+      "ft_value": null,
35
+      "kind": "delta_kl",
36
+      "message": "mean js=0.0168 (\u2265 0.0) (no calibration for delta_kl)",
37
+      "name": "dk_golden",
38
+      "raw": 0.016789787658888805,
39
+      "score": 0.0242225433930576,
40
+      "verdict": "pass",
41
+      "z_score": null
42
+    },
43
+    {
44
+      "base_value": null,
45
+      "ci_95": [
46
+        0.0,
47
+        0.0
48
+      ],
49
+      "evidence": {
50
+        "fraction_regressed": 0.0,
51
+        "mean_delta_nats": -0.03518791794776917,
52
+        "raw_ci_95": [
53
+          0.0,
54
+          0.0
55
+        ],
56
+        "regressed_count": 0,
57
+        "regression_nats_threshold": 1.0,
58
+        "total_items": 20,
59
+        "weight": 1.0,
60
+        "worst_offenders": [],
61
+        "z_by_rank": null
62
+      },
63
+      "ft_value": -0.03518791794776917,
64
+      "kind": "calibration_drift",
65
+      "message": "0/20 items regressed >1.0 nats (frac=0.0%), mean_delta=-0.035 nats/tok (no calibration for calibration_drift)",
66
+      "name": "cal_golden",
67
+      "raw": 0.0,
68
+      "score": 0.8572832218805948,
69
+      "verdict": "pass",
70
+      "z_score": null
71
+    }
72
+  ],
73
+  "schema_version": 1,
74
+  "score": {
75
+    "band": "partial",
76
+    "components": {
77
+      "ablation": 0.0,
78
+      "adherence": 0.0242225433930576,
79
+      "attribution": 0.0,
80
+      "baseline": 0.0,
81
+      "calibration": 0.8572832218805948
82
+    },
83
+    "findings": [
84
+      "adherence score is 0.02 \u2014 below the noise threshold"
85
+    ],
86
+    "overall": 0.3574468147880725,
87
+    "weights": {
88
+      "ablation": 0.15,
89
+      "adherence": 0.3,
90
+      "attribution": 0.35,
91
+      "baseline": 0.0,
92
+      "calibration": 0.2
93
+    }
94
+  },
95
+  "spec_path": "<memory>"
96
+}