tenseleyflow/sway / d7907f7

Browse files

tests/score_weights: spec field validation + CLI parser + composite override

Authored by espadonne
SHA
d7907f77833cb591cc27998fd2ecb4fb1b520973
Parents
4fa3f20
Tree
3339385

1 changed file

Status | File | + | -
A tests/unit/test_score_weights_override.py 130 0
tests/unit/test_score_weights_override.py (added)
@@ -0,0 +1,130 @@
1
+"""Tests for score_weights overrides via spec field + CLI flag."""
2
+
3
+from __future__ import annotations
4
+
5
+import pytest
6
+from pydantic import ValidationError
7
+
8
+from dlm_sway.cli.commands import _parse_weights_flag
9
+from dlm_sway.core.result import DEFAULT_COMPONENT_WEIGHTS
10
+from dlm_sway.suite.spec import SuiteDefaults
11
+
12
+
13
class TestSuiteDefaultsScoreWeights:
    """Checks on the ``score_weights`` field of ``SuiteDefaults``."""

    def test_none_means_use_defaults(self) -> None:
        """Leaving the field unset keeps it as ``None``."""
        defaults = SuiteDefaults()
        assert defaults.score_weights is None

    def test_partial_override_inherits_defaults(self) -> None:
        """A one-key override keeps the default weight for every other key."""
        defaults = SuiteDefaults(score_weights={"adherence": 0.5})
        merged = defaults.score_weights
        assert merged is not None
        # The explicitly supplied category takes the overridden value ...
        assert merged["adherence"] == 0.5
        # ... while the remaining categories match the default table.
        for category in ("attribution", "calibration", "ablation", "baseline"):
            assert merged[category] == DEFAULT_COMPONENT_WEIGHTS[category]

    def test_unknown_key_rejected(self) -> None:
        """A category name outside the known set fails validation."""
        payload = {"bogus": 0.5}
        with pytest.raises(ValidationError, match="unknown category keys"):
            SuiteDefaults(score_weights=payload)

    def test_negative_weight_rejected(self) -> None:
        """A weight below zero fails validation."""
        payload = {"adherence": -0.1}
        with pytest.raises(ValidationError, match="non-negative"):
            SuiteDefaults(score_weights=payload)

    def test_all_zero_weights_rejected(self) -> None:
        """Setting every category to zero fails validation."""
        categories = ("adherence", "attribution", "calibration", "ablation", "baseline")
        all_zero = dict.fromkeys(categories, 0.0)
        with pytest.raises(ValidationError, match="at least one positive weight"):
            SuiteDefaults(score_weights=all_zero)
47
+
48
+
49
class TestParseWeightsFlag:
    """Checks on ``_parse_weights_flag``, the ``key=value,...`` CLI parser."""

    def test_none_input_returns_none(self) -> None:
        """Both ``None`` and the empty string yield ``None``."""
        for absent in (None, ""):
            assert _parse_weights_flag(absent) is None

    def test_single_pair(self) -> None:
        """A lone ``key=value`` pair becomes a one-entry mapping."""
        parsed = _parse_weights_flag("adherence=0.4")
        assert parsed == {"adherence": 0.4}

    def test_multiple_pairs(self) -> None:
        """Comma-separated pairs are all collected."""
        expected = {"adherence": 0.4, "attribution": 0.3, "ablation": 0.1}
        parsed = _parse_weights_flag("adherence=0.4,attribution=0.3,ablation=0.1")
        assert parsed == expected

    def test_whitespace_tolerated(self) -> None:
        """Padding around the pairs and the separator is ignored."""
        expected = {"adherence": 0.4, "attribution": 0.3}
        parsed = _parse_weights_flag(" adherence=0.4 , attribution=0.3 ")
        assert parsed == expected

    def test_missing_equals_raises(self) -> None:
        """A token without ``=`` raises ``typer.BadParameter``."""
        import typer

        malformed = "adherence0.4"
        with pytest.raises(typer.BadParameter, match="key=value"):
            _parse_weights_flag(malformed)

    def test_non_numeric_value_raises(self) -> None:
        """A value that is not a number raises ``typer.BadParameter``."""
        import typer

        malformed = "adherence=high"
        with pytest.raises(typer.BadParameter, match="not a number"):
            _parse_weights_flag(malformed)
76
+
77
+
78
+class TestComputeWithOverride:
79
+    def test_overridden_weights_change_composite(self) -> None:
80
+        """A re-weighted compose should change the overall score."""
81
+        from datetime import UTC, datetime
82
+
83
+        from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict
84
+        from dlm_sway.suite.score import compute as compute_score
85
+
86
+        now = datetime.now(UTC)
87
+        # Two probes: one perfect adherence, one zero attribution.
88
+        probes = (
89
+            ProbeResult(
90
+                name="dk",
91
+                kind="delta_kl",
92
+                verdict=Verdict.PASS,
93
+                score=1.0,
94
+                evidence={"weight": 1.0},
95
+            ),
96
+            ProbeResult(
97
+                name="sis",
98
+                kind="section_internalization",
99
+                verdict=Verdict.FAIL,
100
+                score=0.0,
101
+                evidence={"weight": 1.0},
102
+            ),
103
+        )
104
+        suite = SuiteResult(
105
+            spec_path="<test>",
106
+            started_at=now,
107
+            finished_at=now,
108
+            base_model_id="b",
109
+            adapter_id="a",
110
+            sway_version="0.0.0",
111
+            probes=probes,
112
+        )
113
+
114
+        # Default weights skew toward attribution (0.35 vs 0.30).
115
+        default_score = compute_score(suite)
116
+        # Override to favor adherence.
117
+        skewed_score = compute_score(
118
+            suite,
119
+            weights={
120
+                "adherence": 0.9,
121
+                "attribution": 0.05,
122
+                "calibration": 0.025,
123
+                "ablation": 0.025,
124
+                "baseline": 0.0,
125
+            },
126
+        )
127
+        assert skewed_score.overall > default_score.overall, (
128
+            f"expected skewed_score > default; got skewed={skewed_score.overall:.3f}, "
129
+            f"default={default_score.overall:.3f}"
130
+        )