JSON · 2325 bytes Raw Blame History
1 {
2 "determinism": {
3 "class": "best_effort",
4 "notes": [
5 "MPS: bit-identical across runs is best-effort"
6 ],
7 "seed": 0
8 },
9 "null_stats": {},
10 "probes": [
11 {
12 "base_value": null,
13 "ci_95": [
14 0.00845183397355073,
15 0.02976923823248102
16 ],
17 "evidence": {
18 "divergence_kind": "js",
19 "max": 0.036150663063872254,
20 "num_prompts": 4,
21 "per_prompt": [
22 0.01062496373830732,
23 0.006711906030413639,
24 0.013671617802962006,
25 0.036150663063872254
26 ],
27 "raw_ci_95": [
28 0.00845183397355073,
29 0.02976923823248102
30 ],
31 "weight": 1.0,
32 "z_by_rank": null
33 },
34 "ft_value": null,
35 "kind": "delta_kl",
36 "message": "mean js=0.0168 (\u2265 0.0) (no calibration for delta_kl)",
37 "name": "dk_golden",
38 "raw": 0.016789787658888805,
39 "score": 0.0242225433930576,
40 "verdict": "pass",
41 "z_score": null
42 },
43 {
44 "base_value": null,
45 "ci_95": [
46 0.0,
47 0.0
48 ],
49 "evidence": {
50 "fraction_regressed": 0.0,
51 "mean_delta_nats": -0.03518791794776917,
52 "raw_ci_95": [
53 0.0,
54 0.0
55 ],
56 "regressed_count": 0,
57 "regression_nats_threshold": 1.0,
58 "total_items": 20,
59 "weight": 1.0,
60 "worst_offenders": [],
61 "z_by_rank": null
62 },
63 "ft_value": -0.03518791794776917,
64 "kind": "calibration_drift",
65 "message": "0/20 items regressed >1.0 nats (frac=0.0%), mean_delta=-0.035 nats/tok (no calibration for calibration_drift)",
66 "name": "cal_golden",
67 "raw": 0.0,
68 "score": 0.8572832218805948,
69 "verdict": "pass",
70 "z_score": null
71 }
72 ],
73 "schema_version": 1,
74 "score": {
75 "band": "partial",
76 "components": {
77 "ablation": 0.0,
78 "adherence": 0.0242225433930576,
79 "attribution": 0.0,
80 "baseline": 0.0,
81 "calibration": 0.8572832218805948
82 },
83 "findings": [
84 "adherence score is 0.02 \u2014 below the noise threshold"
85 ],
86 "overall": 0.3574468147880725,
87 "weights": {
88 "ablation": 0.15,
89 "adherence": 0.3,
90 "attribution": 0.35,
91 "baseline": 0.0,
92 "calibration": 0.2
93 }
94 },
95 "spec_path": "<memory>"
96 }