| 1 | """Tests for score_weights overrides via spec field + CLI flag.""" |
| 2 | |
| 3 | from __future__ import annotations |
| 4 | |
| 5 | import pytest |
| 6 | from pydantic import ValidationError |
| 7 | |
| 8 | from dlm_sway.cli.commands import _parse_weights_flag |
| 9 | from dlm_sway.core.result import DEFAULT_COMPONENT_WEIGHTS |
| 10 | from dlm_sway.suite.spec import SuiteDefaults |
| 11 | |
| 12 | |
class TestSuiteDefaultsScoreWeights:
    """Checks on the ``score_weights`` field of ``SuiteDefaults``."""

    def test_none_means_use_defaults(self) -> None:
        """An unset ``score_weights`` stays ``None``."""
        defaults = SuiteDefaults()
        assert defaults.score_weights is None

    def test_partial_override_inherits_defaults(self) -> None:
        """Overriding one category leaves the others at their default weight."""
        merged = SuiteDefaults(score_weights={"adherence": 0.5}).score_weights
        assert merged is not None
        # Only "adherence" was supplied; the rest must come from the defaults.
        assert merged["adherence"] == 0.5
        for category in ("attribution", "calibration", "ablation", "baseline"):
            assert merged[category] == DEFAULT_COMPONENT_WEIGHTS[category]

    def test_unknown_key_rejected(self) -> None:
        """A category name that is not recognised fails validation."""
        bad_weights = {"bogus": 0.5}
        with pytest.raises(ValidationError, match="unknown category keys"):
            SuiteDefaults(score_weights=bad_weights)

    def test_negative_weight_rejected(self) -> None:
        """A weight below zero fails validation."""
        bad_weights = {"adherence": -0.1}
        with pytest.raises(ValidationError, match="non-negative"):
            SuiteDefaults(score_weights=bad_weights)

    def test_all_zero_weights_rejected(self) -> None:
        """Setting every category to zero fails validation."""
        categories = (
            "adherence",
            "attribution",
            "calibration",
            "ablation",
            "baseline",
        )
        zeroed = dict.fromkeys(categories, 0.0)
        with pytest.raises(ValidationError, match="at least one positive weight"):
            SuiteDefaults(score_weights=zeroed)
| 47 | |
| 48 | |
class TestParseWeightsFlag:
    """Checks on ``_parse_weights_flag``, the parser for the CLI weights string."""

    def test_none_input_returns_none(self) -> None:
        """Both ``None`` and the empty string parse to ``None``."""
        for empty in (None, ""):
            assert _parse_weights_flag(empty) is None

    def test_single_pair(self) -> None:
        """One ``key=value`` pair yields a one-entry mapping."""
        parsed = _parse_weights_flag("adherence=0.4")
        assert parsed == {"adherence": 0.4}

    def test_multiple_pairs(self) -> None:
        """Comma-separated pairs each become an entry."""
        parsed = _parse_weights_flag("adherence=0.4,attribution=0.3,ablation=0.1")
        expected = {"adherence": 0.4, "attribution": 0.3, "ablation": 0.1}
        assert parsed == expected

    def test_whitespace_tolerated(self) -> None:
        """Spaces around pairs and separators do not affect the result."""
        parsed = _parse_weights_flag(" adherence=0.4 , attribution=0.3 ")
        expected = {"adherence": 0.4, "attribution": 0.3}
        assert parsed == expected

    def test_missing_equals_raises(self) -> None:
        """A token without ``=`` raises ``typer.BadParameter``."""
        from typer import BadParameter

        with pytest.raises(BadParameter, match="key=value"):
            _parse_weights_flag("adherence0.4")

    def test_non_numeric_value_raises(self) -> None:
        """A value that is not a number raises ``typer.BadParameter``."""
        from typer import BadParameter

        with pytest.raises(BadParameter, match="not a number"):
            _parse_weights_flag("adherence=high")
| 76 | |
| 77 | |
| 78 | class TestComputeWithOverride: |
| 79 | def test_overridden_weights_change_composite(self) -> None: |
| 80 | """A re-weighted compose should change the overall score.""" |
| 81 | from datetime import UTC, datetime |
| 82 | |
| 83 | from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict |
| 84 | from dlm_sway.suite.score import compute as compute_score |
| 85 | |
| 86 | now = datetime.now(UTC) |
| 87 | # Two probes: one perfect adherence, one zero attribution. |
| 88 | probes = ( |
| 89 | ProbeResult( |
| 90 | name="dk", |
| 91 | kind="delta_kl", |
| 92 | verdict=Verdict.PASS, |
| 93 | score=1.0, |
| 94 | evidence={"weight": 1.0}, |
| 95 | ), |
| 96 | ProbeResult( |
| 97 | name="sis", |
| 98 | kind="section_internalization", |
| 99 | verdict=Verdict.FAIL, |
| 100 | score=0.0, |
| 101 | evidence={"weight": 1.0}, |
| 102 | ), |
| 103 | ) |
| 104 | suite = SuiteResult( |
| 105 | spec_path="<test>", |
| 106 | started_at=now, |
| 107 | finished_at=now, |
| 108 | base_model_id="b", |
| 109 | adapter_id="a", |
| 110 | sway_version="0.0.0", |
| 111 | probes=probes, |
| 112 | ) |
| 113 | |
| 114 | # Default weights skew toward attribution (0.35 vs 0.30). |
| 115 | default_score = compute_score(suite) |
| 116 | # Override to favor adherence. |
| 117 | skewed_score = compute_score( |
| 118 | suite, |
| 119 | weights={ |
| 120 | "adherence": 0.9, |
| 121 | "attribution": 0.05, |
| 122 | "calibration": 0.025, |
| 123 | "ablation": 0.025, |
| 124 | "baseline": 0.0, |
| 125 | }, |
| 126 | ) |
| 127 | assert skewed_score.overall > default_score.overall, ( |
| 128 | f"expected skewed_score > default; got skewed={skewed_score.overall:.3f}, " |
| 129 | f"default={default_score.overall:.3f}" |
| 130 | ) |