@@ -0,0 +1,139 @@ |
| 1 | +"""Probe and suite result types. |
| 2 | + |
| 3 | +Every numeric probe ultimately returns a :class:`ProbeResult`. The suite |
| 4 | +runner collects them into a :class:`SuiteResult` and the scorer folds |
| 5 | +that into a single :class:`SwayScore` with transparent per-component |
| 6 | +weights. |
| 7 | + |
| 8 | +These dataclasses are deliberately plain — no pydantic — because they |
| 9 | +cross probe/backend boundaries hundreds of times per run and a free |
| 10 | +``model_validate`` on every construction would dominate the runtime of |
| 11 | +cheap probes. |
| 12 | +""" |
| 13 | + |
| 14 | +from __future__ import annotations |
| 15 | + |
| 16 | +from dataclasses import dataclass, field |
| 17 | +from datetime import UTC, datetime |
| 18 | +from enum import StrEnum |
| 19 | +from typing import Any |
| 20 | + |
| 21 | + |
class Verdict(StrEnum):
    """Outcome of a single probe against its assertion.

    As a :class:`~enum.StrEnum`, each member compares equal to — and
    serializes as — its lowercase string value, so verdicts round-trip
    through JSON as plain strings.
    """

    PASS = "pass"
    FAIL = "fail"
    WARN = "warn"  # NOTE(review): presumably a soft failure — confirm exact semantics with probe authors
    SKIP = "skip"  # probe did not run; ProbeResult.score is None for this verdict
    ERROR = "error"
| 30 | + |
| 31 | + |
@dataclass(frozen=True, slots=True)
class ProbeResult:
    """The result of running one probe.

    Frozen and slotted: instances are immutable after construction and
    carry no per-instance ``__dict__``, keeping them cheap to create in
    bulk.

    Attributes
    ----------
    name:
        User-facing name from the spec (unique within a suite).
    kind:
        Probe discriminator (``delta_kl``, ``section_internalization`` …).
    verdict:
        Pass / fail / warn / skip / error.
    score:
        Normalized [0, 1] score. ``sigmoid(z_vs_null / 3)`` for numeric
        probes; 1.0 / 0.0 for binary ones. ``None`` for :attr:`Verdict.SKIP`.
    raw:
        The raw metric value (e.g. KL=0.083). Probe-specific units.
    z_score:
        Standard deviations above the null-adapter baseline. ``None``
        when no null calibration was run.
    base_value:
        The metric evaluated on the base model, when meaningful.
    ft_value:
        The metric evaluated on the fine-tuned model, when meaningful.
    evidence:
        Small structured payload for the report — prompts, example
        completions, per-section breakdowns. Kept bounded (<10 KB) so
        suite JSON stays under a megabyte.
    message:
        One-line diagnostic. Surfaces in the terminal report.
    duration_s:
        Wall time to execute.
    """

    name: str
    kind: str
    verdict: Verdict
    score: float | None
    raw: float | None = None
    z_score: float | None = None
    base_value: float | None = None
    ft_value: float | None = None
    # default_factory gives each instance a fresh dict — never a shared
    # mutable default.
    evidence: dict[str, Any] = field(default_factory=dict)
    message: str = ""
    duration_s: float = 0.0
| 77 | + |
| 78 | + |
| 79 | +@dataclass(frozen=True, slots=True) |
| 80 | +class SuiteResult: |
| 81 | + """A full run of a sway.yaml suite.""" |
| 82 | + |
| 83 | + spec_path: str |
| 84 | + started_at: datetime |
| 85 | + finished_at: datetime |
| 86 | + base_model_id: str |
| 87 | + adapter_id: str |
| 88 | + sway_version: str |
| 89 | + probes: tuple[ProbeResult, ...] = () |
| 90 | + null_stats: dict[str, dict[str, float]] = field(default_factory=dict) |
| 91 | + """Per-primitive null-adapter baseline stats (mean, std, runs). Used |
| 92 | + to turn raw metrics into z-scores when rendering the report.""" |
| 93 | + |
| 94 | + @property |
| 95 | + def wall_seconds(self) -> float: |
| 96 | + return (self.finished_at - self.started_at).total_seconds() |
| 97 | + |
| 98 | + |
# Component weights for the composite score. Overridable in sway.yaml.
DEFAULT_COMPONENT_WEIGHTS: dict[str, float] = dict(
    adherence=0.30,
    attribution=0.35,
    calibration=0.20,
    ablation=0.15,
)
| 106 | + |
| 107 | + |
| 108 | +@dataclass(frozen=True, slots=True) |
| 109 | +class SwayScore: |
| 110 | + """Composite score with a transparent per-component breakdown.""" |
| 111 | + |
| 112 | + overall: float |
| 113 | + components: dict[str, float] |
| 114 | + weights: dict[str, float] = field(default_factory=lambda: dict(DEFAULT_COMPONENT_WEIGHTS)) |
| 115 | + band: str = "" |
| 116 | + findings: tuple[str, ...] = () |
| 117 | + |
| 118 | + @staticmethod |
| 119 | + def band_for(overall: float) -> str: |
| 120 | + """Map a score to a human-readable band. |
| 121 | + |
| 122 | + Bands (from the plan): |
| 123 | + - <0.3 : indistinguishable from noise |
| 124 | + - 0.3–0.6 : partial fit |
| 125 | + - 0.6–0.85: healthy |
| 126 | + - >0.85 : suspiciously good (possible overfit / memorization) |
| 127 | + """ |
| 128 | + if overall < 0.3: |
| 129 | + return "noise" |
| 130 | + if overall < 0.6: |
| 131 | + return "partial" |
| 132 | + if overall <= 0.85: |
| 133 | + return "healthy" |
| 134 | + return "suspicious" |
| 135 | + |
| 136 | + |
| 137 | +def utcnow() -> datetime: |
| 138 | + """Timezone-aware UTC timestamp (used by the runner).""" |
| 139 | + return datetime.now(UTC) |