sway Public

Watch 0 Fork 0 Star 0

Python · 4747 bytes Raw Blame History

  
        1
        """Tests for score_weights overrides via spec field + CLI flag."""
      
        2
        
        3
        from __future__ import annotations
      
        4
        
        5
        import pytest
      
        6
        from pydantic import ValidationError
      
        7
        
        8
        from dlm_sway.cli.commands import _parse_weights_flag
      
        9
        from dlm_sway.core.result import DEFAULT_COMPONENT_WEIGHTS
      
        10
        from dlm_sway.suite.spec import SuiteDefaults
      
        11
        
        12
        
        13
        class TestSuiteDefaultsScoreWeights:
            """Checks on the ``score_weights`` field of ``SuiteDefaults``."""

            def test_none_means_use_defaults(self) -> None:
                """Constructing with no arguments leaves ``score_weights`` as ``None``."""
                defaults = SuiteDefaults()
                assert defaults.score_weights is None

            def test_partial_override_inherits_defaults(self) -> None:
                """One overridden key is kept; the remaining keys equal the defaults."""
                defaults = SuiteDefaults(score_weights={"adherence": 0.5})
                merged = defaults.score_weights
                assert merged is not None
                # The partial override is merged with the defaults.
                assert merged["adherence"] == 0.5
                for category in ("attribution", "calibration", "ablation", "baseline"):
                    assert merged[category] == DEFAULT_COMPONENT_WEIGHTS[category]

            def test_unknown_key_rejected(self) -> None:
                """The key ``"bogus"`` must raise a ``ValidationError``."""
                unknown = {"bogus": 0.5}
                with pytest.raises(ValidationError, match="unknown category keys"):
                    SuiteDefaults(score_weights=unknown)

            def test_negative_weight_rejected(self) -> None:
                """A weight below zero must raise a ``ValidationError``."""
                negative = {"adherence": -0.1}
                with pytest.raises(ValidationError, match="non-negative"):
                    SuiteDefaults(score_weights=negative)

            def test_all_zero_weights_rejected(self) -> None:
                """Setting every category to ``0.0`` must raise a ``ValidationError``."""
                all_zero = dict.fromkeys(
                    ("adherence", "attribution", "calibration", "ablation", "baseline"),
                    0.0,
                )
                with pytest.raises(ValidationError, match="at least one positive weight"):
                    SuiteDefaults(score_weights=all_zero)
      
        47
        
        48
        
        49
        class TestParseWeightsFlag:
            """Checks on ``_parse_weights_flag``, the parser for the weights CLI flag."""

            def test_none_input_returns_none(self) -> None:
                """Both ``None`` and the empty string yield ``None``."""
                for absent in (None, ""):
                    assert _parse_weights_flag(absent) is None

            def test_single_pair(self) -> None:
                """A lone ``key=value`` pair becomes a one-entry dict."""
                parsed = _parse_weights_flag("adherence=0.4")
                assert parsed == {"adherence": 0.4}

            def test_multiple_pairs(self) -> None:
                """Comma-separated pairs each become a dict entry."""
                expected = {"adherence": 0.4, "attribution": 0.3, "ablation": 0.1}
                parsed = _parse_weights_flag("adherence=0.4,attribution=0.3,ablation=0.1")
                assert parsed == expected

            def test_whitespace_tolerated(self) -> None:
                """Spaces around pairs and commas do not affect the result."""
                expected = {"adherence": 0.4, "attribution": 0.3}
                parsed = _parse_weights_flag(" adherence=0.4 , attribution=0.3 ")
                assert parsed == expected

            def test_missing_equals_raises(self) -> None:
                """A token without ``=`` raises ``typer.BadParameter``."""
                from typer import BadParameter

                with pytest.raises(BadParameter, match="key=value"):
                    _parse_weights_flag("adherence0.4")

            def test_non_numeric_value_raises(self) -> None:
                """A non-numeric value raises ``typer.BadParameter``."""
                from typer import BadParameter

                with pytest.raises(BadParameter, match="not a number"):
                    _parse_weights_flag("adherence=high")
      
        76
        
        77
        
        78
        class TestComputeWithOverride:
      
        79
            def test_overridden_weights_change_composite(self) -> None:
      
        80
                """A re-weighted compose should change the overall score."""
      
        81
                from datetime import UTC, datetime
      
        82
        
        83
                from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict
      
        84
                from dlm_sway.suite.score import compute as compute_score
      
        85
        
        86
                now = datetime.now(UTC)
      
        87
                # Two probes: one perfect adherence, one zero attribution.
      
        88
                probes = (
      
        89
                    ProbeResult(
      
        90
                        name="dk",
      
        91
                        kind="delta_kl",
      
        92
                        verdict=Verdict.PASS,
      
        93
                        score=1.0,
      
        94
                        evidence={"weight": 1.0},
      
        95
                    ),
      
        96
                    ProbeResult(
      
        97
                        name="sis",
      
        98
                        kind="section_internalization",
      
        99
                        verdict=Verdict.FAIL,
      
        100
                        score=0.0,
      
        101
                        evidence={"weight": 1.0},
      
        102
                    ),
      
        103
                )
      
        104
                suite = SuiteResult(
      
        105
                    spec_path="<test>",
      
        106
                    started_at=now,
      
        107
                    finished_at=now,
      
        108
                    base_model_id="b",
      
        109
                    adapter_id="a",
      
        110
                    sway_version="0.0.0",
      
        111
                    probes=probes,
      
        112
                )
      
        113
        
        114
                # Default weights skew toward attribution (0.35 vs 0.30).
      
        115
                default_score = compute_score(suite)
      
        116
                # Override to favor adherence.
      
        117
                skewed_score = compute_score(
      
        118
                    suite,
      
        119
                    weights={
      
        120
                        "adherence": 0.9,
      
        121
                        "attribution": 0.05,
      
        122
                        "calibration": 0.025,
      
        123
                        "ablation": 0.025,
      
        124
                        "baseline": 0.0,
      
        125
                    },
      
        126
                )
      
        127
                assert skewed_score.overall > default_score.overall, (
      
        128
                    f"expected skewed_score > default; got skewed={skewed_score.overall:.3f}, "
      
        129
                    f"default={default_score.overall:.3f}"
      
        130
                )

1	"""Tests for score_weights overrides via spec field + CLI flag."""
2
3	from __future__ import annotations
4
5	import pytest
6	from pydantic import ValidationError
7
8	from dlm_sway.cli.commands import _parse_weights_flag
9	from dlm_sway.core.result import DEFAULT_COMPONENT_WEIGHTS
10	from dlm_sway.suite.spec import SuiteDefaults
11
12
class TestSuiteDefaultsScoreWeights:
    """Checks on the ``score_weights`` field of ``SuiteDefaults``."""

    def test_none_means_use_defaults(self) -> None:
        """Constructing with no arguments leaves ``score_weights`` as ``None``."""
        defaults = SuiteDefaults()
        assert defaults.score_weights is None

    def test_partial_override_inherits_defaults(self) -> None:
        """One overridden key is kept; the remaining keys equal the defaults."""
        defaults = SuiteDefaults(score_weights={"adherence": 0.5})
        merged = defaults.score_weights
        assert merged is not None
        # The partial override is merged with the defaults.
        assert merged["adherence"] == 0.5
        for category in ("attribution", "calibration", "ablation", "baseline"):
            assert merged[category] == DEFAULT_COMPONENT_WEIGHTS[category]

    def test_unknown_key_rejected(self) -> None:
        """The key ``"bogus"`` must raise a ``ValidationError``."""
        unknown = {"bogus": 0.5}
        with pytest.raises(ValidationError, match="unknown category keys"):
            SuiteDefaults(score_weights=unknown)

    def test_negative_weight_rejected(self) -> None:
        """A weight below zero must raise a ``ValidationError``."""
        negative = {"adherence": -0.1}
        with pytest.raises(ValidationError, match="non-negative"):
            SuiteDefaults(score_weights=negative)

    def test_all_zero_weights_rejected(self) -> None:
        """Setting every category to ``0.0`` must raise a ``ValidationError``."""
        all_zero = dict.fromkeys(
            ("adherence", "attribution", "calibration", "ablation", "baseline"),
            0.0,
        )
        with pytest.raises(ValidationError, match="at least one positive weight"):
            SuiteDefaults(score_weights=all_zero)
47
48
class TestParseWeightsFlag:
    """Checks on ``_parse_weights_flag``, the parser for the weights CLI flag."""

    def test_none_input_returns_none(self) -> None:
        """Both ``None`` and the empty string yield ``None``."""
        for absent in (None, ""):
            assert _parse_weights_flag(absent) is None

    def test_single_pair(self) -> None:
        """A lone ``key=value`` pair becomes a one-entry dict."""
        parsed = _parse_weights_flag("adherence=0.4")
        assert parsed == {"adherence": 0.4}

    def test_multiple_pairs(self) -> None:
        """Comma-separated pairs each become a dict entry."""
        expected = {"adherence": 0.4, "attribution": 0.3, "ablation": 0.1}
        parsed = _parse_weights_flag("adherence=0.4,attribution=0.3,ablation=0.1")
        assert parsed == expected

    def test_whitespace_tolerated(self) -> None:
        """Spaces around pairs and commas do not affect the result."""
        expected = {"adherence": 0.4, "attribution": 0.3}
        parsed = _parse_weights_flag(" adherence=0.4 , attribution=0.3 ")
        assert parsed == expected

    def test_missing_equals_raises(self) -> None:
        """A token without ``=`` raises ``typer.BadParameter``."""
        from typer import BadParameter

        with pytest.raises(BadParameter, match="key=value"):
            _parse_weights_flag("adherence0.4")

    def test_non_numeric_value_raises(self) -> None:
        """A non-numeric value raises ``typer.BadParameter``."""
        from typer import BadParameter

        with pytest.raises(BadParameter, match="not a number"):
            _parse_weights_flag("adherence=high")
76
77
78	class TestComputeWithOverride:
79	def test_overridden_weights_change_composite(self) -> None:
80	"""A re-weighted compose should change the overall score."""
81	from datetime import UTC, datetime
82
83	from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict
84	from dlm_sway.suite.score import compute as compute_score
85
86	now = datetime.now(UTC)
87	# Two probes: one perfect adherence, one zero attribution.
88	probes = (
89	ProbeResult(
90	name="dk",
91	kind="delta_kl",
92	verdict=Verdict.PASS,
93	score=1.0,
94	evidence={"weight": 1.0},
95	),
96	ProbeResult(
97	name="sis",
98	kind="section_internalization",
99	verdict=Verdict.FAIL,
100	score=0.0,
101	evidence={"weight": 1.0},
102	),
103	)
104	suite = SuiteResult(
105	spec_path="<test>",
106	started_at=now,
107	finished_at=now,
108	base_model_id="b",
109	adapter_id="a",
110	sway_version="0.0.0",
111	probes=probes,
112	)
113
114	# Default weights skew toward attribution (0.35 vs 0.30).
115	default_score = compute_score(suite)
116	# Override to favor adherence.
117	skewed_score = compute_score(
118	suite,
119	weights={
120	"adherence": 0.9,
121	"attribution": 0.05,
122	"calibration": 0.025,
123	"ablation": 0.025,
124	"baseline": 0.0,
125	},
126	)
127	assert skewed_score.overall > default_score.overall, (
128	f"expected skewed_score > default; got skewed={skewed_score.overall:.3f}, "
129	f"default={default_score.overall:.3f}"
130	)