Python · 4747 bytes Raw Blame History
1 """Tests for score_weights overrides via spec field + CLI flag."""
2
3 from __future__ import annotations
4
5 import pytest
6 from pydantic import ValidationError
7
8 from dlm_sway.cli.commands import _parse_weights_flag
9 from dlm_sway.core.result import DEFAULT_COMPONENT_WEIGHTS
10 from dlm_sway.suite.spec import SuiteDefaults
11
12
class TestSuiteDefaultsScoreWeights:
    """Checks how ``SuiteDefaults`` accepts, merges and rejects ``score_weights``."""

    def test_none_means_use_defaults(self) -> None:
        """An unconfigured ``SuiteDefaults`` leaves ``score_weights`` as ``None``."""
        defaults = SuiteDefaults()
        assert defaults.score_weights is None

    def test_partial_override_inherits_defaults(self) -> None:
        """Overriding one category keeps the default weight for every other one."""
        defaults = SuiteDefaults(score_weights={"adherence": 0.5})
        merged = defaults.score_weights
        assert merged is not None
        # The partial override is merged with the defaults.
        assert merged["adherence"] == 0.5
        for category in ("attribution", "calibration", "ablation", "baseline"):
            assert merged[category] == DEFAULT_COMPONENT_WEIGHTS[category]

    def test_unknown_key_rejected(self) -> None:
        """A category name outside the known set fails validation."""
        bad_weights = {"bogus": 0.5}
        with pytest.raises(ValidationError, match="unknown category keys"):
            SuiteDefaults(score_weights=bad_weights)

    def test_negative_weight_rejected(self) -> None:
        """A weight below zero fails validation."""
        bad_weights = {"adherence": -0.1}
        with pytest.raises(ValidationError, match="non-negative"):
            SuiteDefaults(score_weights=bad_weights)

    def test_all_zero_weights_rejected(self) -> None:
        """Setting every category to zero fails validation."""
        all_zero = dict.fromkeys(
            ("adherence", "attribution", "calibration", "ablation", "baseline"),
            0.0,
        )
        with pytest.raises(ValidationError, match="at least one positive weight"):
            SuiteDefaults(score_weights=all_zero)
47
48
class TestParseWeightsFlag:
    """Checks how ``_parse_weights_flag`` turns a CLI string into a weights mapping."""

    def test_none_input_returns_none(self) -> None:
        """Both ``None`` and the empty string parse to ``None``."""
        for absent in (None, ""):
            assert _parse_weights_flag(absent) is None

    def test_single_pair(self) -> None:
        """A lone ``key=value`` pair yields a one-entry mapping."""
        parsed = _parse_weights_flag("adherence=0.4")
        assert parsed == {"adherence": 0.4}

    def test_multiple_pairs(self) -> None:
        """Comma-separated pairs each become an entry in the mapping."""
        expected = {"adherence": 0.4, "attribution": 0.3, "ablation": 0.1}
        assert _parse_weights_flag("adherence=0.4,attribution=0.3,ablation=0.1") == expected

    def test_whitespace_tolerated(self) -> None:
        """Spaces around pairs and separators do not affect the result."""
        expected = {"adherence": 0.4, "attribution": 0.3}
        assert _parse_weights_flag(" adherence=0.4 , attribution=0.3 ") == expected

    def test_missing_equals_raises(self) -> None:
        """A token without ``=`` is reported as a bad parameter."""
        from typer import BadParameter

        with pytest.raises(BadParameter, match="key=value"):
            _parse_weights_flag("adherence0.4")

    def test_non_numeric_value_raises(self) -> None:
        """A value that is not a number is reported as a bad parameter."""
        from typer import BadParameter

        with pytest.raises(BadParameter, match="not a number"):
            _parse_weights_flag("adherence=high")
76
77
78 class TestComputeWithOverride:
79 def test_overridden_weights_change_composite(self) -> None:
80 """A re-weighted compose should change the overall score."""
81 from datetime import UTC, datetime
82
83 from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict
84 from dlm_sway.suite.score import compute as compute_score
85
86 now = datetime.now(UTC)
87 # Two probes: one perfect adherence, one zero attribution.
88 probes = (
89 ProbeResult(
90 name="dk",
91 kind="delta_kl",
92 verdict=Verdict.PASS,
93 score=1.0,
94 evidence={"weight": 1.0},
95 ),
96 ProbeResult(
97 name="sis",
98 kind="section_internalization",
99 verdict=Verdict.FAIL,
100 score=0.0,
101 evidence={"weight": 1.0},
102 ),
103 )
104 suite = SuiteResult(
105 spec_path="<test>",
106 started_at=now,
107 finished_at=now,
108 base_model_id="b",
109 adapter_id="a",
110 sway_version="0.0.0",
111 probes=probes,
112 )
113
114 # Default weights skew toward attribution (0.35 vs 0.30).
115 default_score = compute_score(suite)
116 # Override to favor adherence.
117 skewed_score = compute_score(
118 suite,
119 weights={
120 "adherence": 0.9,
121 "attribution": 0.05,
122 "calibration": 0.025,
123 "ablation": 0.025,
124 "baseline": 0.0,
125 },
126 )
127 assert skewed_score.overall > default_score.overall, (
128 f"expected skewed_score > default; got skewed={skewed_score.overall:.3f}, "
129 f"default={default_score.overall:.3f}"
130 )