| 1 | { |
| 2 | "determinism": { |
| 3 | "class": "best_effort", |
| 4 | "notes": [ |
| 5 | "CPU-only backend: strict determinism depends on BLAS impl" |
| 6 | ], |
| 7 | "seed": 0 |
| 8 | }, |
| 9 | "null_stats": {}, |
| 10 | "probes": [ |
| 11 | { |
| 12 | "base_value": null, |
| 13 | "ci_95": [ |
| 14 | 0.008465445404266456, |
| 15 | 0.02977853737438285 |
| 16 | ], |
| 17 | "evidence": { |
| 18 | "divergence_kind": "js", |
| 19 | "max": 0.03615893521221293, |
| 20 | "num_prompts": 4, |
| 21 | "per_prompt": [ |
| 22 | 0.010637343860892606, |
| 23 | 0.006726111759252678, |
| 24 | 0.013683446339307789, |
| 25 | 0.03615893521221293 |
| 26 | ], |
| 27 | "raw_ci_95": [ |
| 28 | 0.008465445404266456, |
| 29 | 0.02977853737438285 |
| 30 | ], |
| 31 | "weight": 1.0, |
| 32 | "z_by_rank": null |
| 33 | }, |
| 34 | "ft_value": null, |
| 35 | "kind": "delta_kl", |
| 36 | "message": "mean js=0.0168 (\u2265 0.0) (no calibration for delta_kl)", |
| 37 | "name": "dk_golden", |
| 38 | "raw": 0.0168014592929165, |
| 39 | "score": 0.024239382001588428, |
| 40 | "verdict": "pass", |
| 41 | "z_score": null |
| 42 | }, |
| 43 | { |
| 44 | "base_value": null, |
| 45 | "ci_95": [ |
| 46 | 0.0, |
| 47 | 0.0 |
| 48 | ], |
| 49 | "evidence": { |
| 50 | "fraction_regressed": 0.0, |
| 51 | "mean_delta_nats": -0.03518710732460022, |
| 52 | "raw_ci_95": [ |
| 53 | 0.0, |
| 54 | 0.0 |
| 55 | ], |
| 56 | "regressed_count": 0, |
| 57 | "regression_nats_threshold": 1.0, |
| 58 | "total_items": 20, |
| 59 | "weight": 1.0, |
| 60 | "worst_offenders": [], |
| 61 | "z_by_rank": null |
| 62 | }, |
| 63 | "ft_value": -0.03518710732460022, |
| 64 | "kind": "calibration_drift", |
| 65 | "message": "0/20 items regressed >1.0 nats (frac=0.0%), mean_delta=-0.035 nats/tok (no calibration for calibration_drift)", |
| 66 | "name": "cal_golden", |
| 67 | "raw": 0.0, |
| 68 | "score": 0.8572834380467733, |
| 69 | "verdict": "pass", |
| 70 | "z_score": null |
| 71 | } |
| 72 | ], |
| 73 | "schema_version": 1, |
| 74 | "score": { |
| 75 | "band": "partial", |
| 76 | "components": { |
| 77 | "ablation": 0.0, |
| 78 | "adherence": 0.024239382001588428, |
| 79 | "attribution": 0.0, |
| 80 | "baseline": 0.0, |
| 81 | "calibration": 0.8572834380467733 |
| 82 | }, |
| 83 | "findings": [ |
| 84 | "adherence score is 0.02 \u2014 below the noise threshold" |
| 85 | ], |
| 86 | "overall": 0.3574570044196624, |
| 87 | "weights": { |
| 88 | "ablation": 0.15, |
| 89 | "adherence": 0.3, |
| 90 | "attribution": 0.35, |
| 91 | "baseline": 0.0, |
| 92 | "calibration": 0.2 |
| 93 | } |
| 94 | }, |
| 95 | "spec_path": "<memory>" |
| 96 | } |