tenseleyflow/sway / cd41452

Browse files

sway(core): ProbeResult, SuiteResult, SwayScore, Verdict

Authored by espadonne
SHA
cd41452a6d79ba22a64b03705a45cd7869d07f08
Parents
bec8a31
Tree
c681676

2 changed files

StatusFile+-
A src/dlm_sway/core/result.py 139 0
A tests/unit/test_result.py 82 0
src/dlm_sway/core/result.pyadded
@@ -0,0 +1,139 @@
1
+"""Probe and suite result types.
2
+
3
+Every numeric probe ultimately returns a :class:`ProbeResult`. The suite
4
+runner collects them into a :class:`SuiteResult` and the scorer folds
5
+that into a single :class:`SwayScore` with transparent per-component
6
+weights.
7
+
8
+These dataclasses are deliberately plain — no pydantic — because they
9
+cross probe/backend boundaries hundreds of times per run, and an implicit
10
+``model_validate`` on every construction would dominate the runtime of
11
+cheap probes.
12
+"""
13
+
14
+from __future__ import annotations
15
+
16
+from dataclasses import dataclass, field
17
+from datetime import UTC, datetime
18
+from enum import StrEnum
19
+from typing import Any
20
+
21
+
22
+class Verdict(StrEnum):
23
+    """Outcome of a single probe against its assertion."""
24
+
25
+    PASS = "pass"
26
+    FAIL = "fail"
27
+    WARN = "warn"
28
+    SKIP = "skip"
29
+    ERROR = "error"
30
+
31
+
32
+@dataclass(frozen=True, slots=True)
33
+class ProbeResult:
34
+    """The result of running one probe.
35
+
36
+    Attributes
37
+    ----------
38
+    name:
39
+        User-facing name from the spec (unique within a suite).
40
+    kind:
41
+        Probe discriminator (``delta_kl``, ``section_internalization`` …).
42
+    verdict:
43
+        Pass / fail / warn / skip / error.
44
+    score:
45
+        Normalized [0, 1] score. ``sigmoid(z_vs_null / 3)`` for numeric
46
+        probes; 1.0 / 0.0 for binary ones. ``None`` for :attr:`Verdict.SKIP`.
47
+    raw:
48
+        The raw metric value (e.g. KL=0.083). Probe-specific units.
49
+    z_score:
50
+        Standard deviations above the null-adapter baseline. ``None``
51
+        when no null calibration was run.
52
+    base_value:
53
+        The metric evaluated on the base model, when meaningful.
54
+    ft_value:
55
+        The metric evaluated on the fine-tuned model, when meaningful.
56
+    evidence:
57
+        Small structured payload for the report — prompts, example
58
+        completions, per-section breakdowns. Kept bounded (<10 KB) so
59
+        suite JSON stays under a megabyte.
60
+    message:
61
+        One-line diagnostic. Surfaces in the terminal report.
62
+    duration_s:
63
+        Wall time to execute.
64
+    """
65
+
66
+    name: str
67
+    kind: str
68
+    verdict: Verdict
69
+    score: float | None
70
+    raw: float | None = None
71
+    z_score: float | None = None
72
+    base_value: float | None = None
73
+    ft_value: float | None = None
74
+    evidence: dict[str, Any] = field(default_factory=dict)
75
+    message: str = ""
76
+    duration_s: float = 0.0
77
+
78
+
79
+@dataclass(frozen=True, slots=True)
80
+class SuiteResult:
81
+    """A full run of a sway.yaml suite."""
82
+
83
+    spec_path: str
84
+    started_at: datetime
85
+    finished_at: datetime
86
+    base_model_id: str
87
+    adapter_id: str
88
+    sway_version: str
89
+    probes: tuple[ProbeResult, ...] = ()
90
+    null_stats: dict[str, dict[str, float]] = field(default_factory=dict)
91
+    """Per-primitive null-adapter baseline stats (mean, std, runs). Used
92
+    to turn raw metrics into z-scores when rendering the report."""
93
+
94
+    @property
95
+    def wall_seconds(self) -> float:
96
+        return (self.finished_at - self.started_at).total_seconds()
97
+
98
+
99
+# Component weights for the composite score. Overridable in sway.yaml.
100
+DEFAULT_COMPONENT_WEIGHTS: dict[str, float] = {
101
+    "adherence": 0.30,
102
+    "attribution": 0.35,
103
+    "calibration": 0.20,
104
+    "ablation": 0.15,
105
+}
106
+
107
+
108
+@dataclass(frozen=True, slots=True)
109
+class SwayScore:
110
+    """Composite score with a transparent per-component breakdown."""
111
+
112
+    overall: float
113
+    components: dict[str, float]
114
+    weights: dict[str, float] = field(default_factory=lambda: dict(DEFAULT_COMPONENT_WEIGHTS))
115
+    band: str = ""
116
+    findings: tuple[str, ...] = ()
117
+
118
+    @staticmethod
119
+    def band_for(overall: float) -> str:
120
+        """Map a score to a human-readable band.
121
+
122
+        Bands (from the plan):
123
+          - <0.3  : indistinguishable from noise
124
+          - 0.3–0.6 : partial fit
125
+          - 0.6–0.85: healthy
126
+          - >0.85 : suspiciously good (possible overfit / memorization)
127
+        """
128
+        if overall < 0.3:
129
+            return "noise"
130
+        if overall < 0.6:
131
+            return "partial"
132
+        if overall <= 0.85:
133
+            return "healthy"
134
+        return "suspicious"
135
+
136
+
137
+def utcnow() -> datetime:
138
+    """Timezone-aware UTC timestamp (used by the runner)."""
139
+    return datetime.now(UTC)
tests/unit/test_result.pyadded
@@ -0,0 +1,82 @@
1
+"""Tests for :mod:`dlm_sway.core.result`."""
2
+
3
+from __future__ import annotations
4
+
5
+from dataclasses import FrozenInstanceError
6
+
7
+import pytest
8
+
9
+from dlm_sway.core.result import (
10
+    DEFAULT_COMPONENT_WEIGHTS,
11
+    ProbeResult,
12
+    SuiteResult,
13
+    SwayScore,
14
+    Verdict,
15
+    utcnow,
16
+)
17
+
18
+
19
class TestVerdict:
    """Checks on the :class:`Verdict` enum."""

    def test_is_str_enum(self) -> None:
        """Members must behave as strings, not merely carry string values."""
        # Assert on the members themselves: checking only ``.value`` would
        # also pass for a plain ``Enum`` and so would not catch a
        # regression away from ``StrEnum``.
        assert isinstance(Verdict.PASS, str)
        assert Verdict.PASS == "pass"
        assert str(Verdict.WARN) == "warn"
        assert Verdict.PASS.value == "pass"

    def test_all_expected_members(self) -> None:
        """The enum exposes exactly the five documented outcomes."""
        assert {v.value for v in Verdict} == {
            "pass",
            "fail",
            "warn",
            "skip",
            "error",
        }
32
+
33
+
34
class TestProbeResult:
    """Construction defaults and immutability of :class:`ProbeResult`."""

    def test_minimum_construction(self) -> None:
        """Only the required fields are supplied; the rest take defaults."""
        probe = ProbeResult(
            name="t",
            kind="delta_kl",
            verdict=Verdict.PASS,
            score=0.82,
        )
        assert probe.raw is None
        assert probe.evidence == {}
        assert probe.message == ""
        assert probe.duration_s == 0.0

    def test_frozen(self) -> None:
        """Assigning to a field of a frozen instance must raise."""
        probe = ProbeResult(name="t", kind="t", verdict=Verdict.PASS, score=0.5)
        with pytest.raises(FrozenInstanceError):
            probe.score = 0.6  # type: ignore[misc]
46
+
47
+
48
class TestSuiteResult:
    """Behaviour of :class:`SuiteResult`."""

    def test_wall_seconds(self) -> None:
        """``wall_seconds`` equals the gap between the two timestamps."""
        from datetime import timedelta

        begin = utcnow()
        elapsed = timedelta(seconds=2, milliseconds=500)
        suite = SuiteResult(
            spec_path="sway.yaml",
            started_at=begin,
            finished_at=begin + elapsed,
            base_model_id="b",
            adapter_id="a",
            sway_version="0.1.0.dev0",
        )
        assert suite.wall_seconds == pytest.approx(2.5, abs=1e-6)
63
+
64
+
65
class TestSwayScore:
    """Default weights and band thresholds of :class:`SwayScore`."""

    def test_default_weights_sum_to_one(self) -> None:
        """The default component weights form a convex combination."""
        total = sum(DEFAULT_COMPONENT_WEIGHTS.values())
        assert abs(total - 1.0) < 1e-9

    def test_band_boundaries(self) -> None:
        """Scores on either side of each threshold land in the right band."""
        expectations = (
            (0.0, "noise"),
            (0.29, "noise"),
            (0.30, "partial"),
            (0.59, "partial"),
            (0.60, "healthy"),
            (0.85, "healthy"),
            (0.851, "suspicious"),
            (0.99, "suspicious"),
        )
        for score, band in expectations:
            assert SwayScore.band_for(score) == band
78
+
79
+
80
def test_utcnow_is_tz_aware() -> None:
    """``utcnow`` must return an aware datetime, never a naive one."""
    assert utcnow().tzinfo is not None