@@ -0,0 +1,130 @@ |
| 1 | +"""Tests for score_weights overrides via spec field + CLI flag.""" |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +import pytest |
| 6 | +from pydantic import ValidationError |
| 7 | + |
| 8 | +from dlm_sway.cli.commands import _parse_weights_flag |
| 9 | +from dlm_sway.core.result import DEFAULT_COMPONENT_WEIGHTS |
| 10 | +from dlm_sway.suite.spec import SuiteDefaults |
| 11 | + |
| 12 | + |
class TestSuiteDefaultsScoreWeights:
    """Checks on the ``score_weights`` field of ``SuiteDefaults``."""

    def test_none_means_use_defaults(self) -> None:
        """A default-constructed ``SuiteDefaults`` leaves the field as ``None``."""
        defaults = SuiteDefaults()
        assert defaults.score_weights is None

    def test_partial_override_inherits_defaults(self) -> None:
        """One supplied category is kept; the rest equal the default weights."""
        defaults = SuiteDefaults(score_weights={"adherence": 0.5})
        merged = defaults.score_weights
        assert merged is not None
        # The explicitly supplied weight survives validation unchanged.
        assert merged["adherence"] == 0.5
        # Every category that was not supplied matches the packaged default.
        for category in ("attribution", "calibration", "ablation", "baseline"):
            assert merged[category] == DEFAULT_COMPONENT_WEIGHTS[category]

    def test_unknown_key_rejected(self) -> None:
        """A category name outside the known set fails validation."""
        bad_weights = {"bogus": 0.5}
        with pytest.raises(ValidationError, match="unknown category keys"):
            SuiteDefaults(score_weights=bad_weights)

    def test_negative_weight_rejected(self) -> None:
        """A weight below zero fails validation."""
        bad_weights = {"adherence": -0.1}
        with pytest.raises(ValidationError, match="non-negative"):
            SuiteDefaults(score_weights=bad_weights)

    def test_all_zero_weights_rejected(self) -> None:
        """Setting every listed category to zero fails validation."""
        every_category = (
            "adherence",
            "attribution",
            "calibration",
            "ablation",
            "baseline",
        )
        zeroed = dict.fromkeys(every_category, 0.0)
        with pytest.raises(ValidationError, match="at least one positive weight"):
            SuiteDefaults(score_weights=zeroed)
| 47 | + |
| 48 | + |
class TestParseWeightsFlag:
    """Checks on ``_parse_weights_flag``, which reads ``key=value`` pairs."""

    def test_none_input_returns_none(self) -> None:
        """Both ``None`` and the empty string yield ``None``."""
        for absent in (None, ""):
            assert _parse_weights_flag(absent) is None

    def test_single_pair(self) -> None:
        """A lone ``key=value`` pair becomes a one-entry mapping."""
        parsed = _parse_weights_flag("adherence=0.4")
        assert parsed == {"adherence": 0.4}

    def test_multiple_pairs(self) -> None:
        """Comma-separated pairs each become an entry."""
        parsed = _parse_weights_flag("adherence=0.4,attribution=0.3,ablation=0.1")
        expected = {"adherence": 0.4, "attribution": 0.3, "ablation": 0.1}
        assert parsed == expected

    def test_whitespace_tolerated(self) -> None:
        """Spaces around pairs and separators do not change the result."""
        parsed = _parse_weights_flag(" adherence=0.4 , attribution=0.3 ")
        expected = {"adherence": 0.4, "attribution": 0.3}
        assert parsed == expected

    def test_missing_equals_raises(self) -> None:
        """A token without ``=`` raises ``typer.BadParameter``."""
        from typer import BadParameter

        with pytest.raises(BadParameter, match="key=value"):
            _parse_weights_flag("adherence0.4")

    def test_non_numeric_value_raises(self) -> None:
        """A non-numeric value raises ``typer.BadParameter``."""
        from typer import BadParameter

        with pytest.raises(BadParameter, match="not a number"):
            _parse_weights_flag("adherence=high")
| 76 | + |
| 77 | + |
| 78 | +class TestComputeWithOverride: |
| 79 | + def test_overridden_weights_change_composite(self) -> None: |
| 80 | + """A re-weighted compose should change the overall score.""" |
| 81 | + from datetime import UTC, datetime |
| 82 | + |
| 83 | + from dlm_sway.core.result import ProbeResult, SuiteResult, Verdict |
| 84 | + from dlm_sway.suite.score import compute as compute_score |
| 85 | + |
| 86 | + now = datetime.now(UTC) |
| 87 | + # Two probes: one perfect adherence, one zero attribution. |
| 88 | + probes = ( |
| 89 | + ProbeResult( |
| 90 | + name="dk", |
| 91 | + kind="delta_kl", |
| 92 | + verdict=Verdict.PASS, |
| 93 | + score=1.0, |
| 94 | + evidence={"weight": 1.0}, |
| 95 | + ), |
| 96 | + ProbeResult( |
| 97 | + name="sis", |
| 98 | + kind="section_internalization", |
| 99 | + verdict=Verdict.FAIL, |
| 100 | + score=0.0, |
| 101 | + evidence={"weight": 1.0}, |
| 102 | + ), |
| 103 | + ) |
| 104 | + suite = SuiteResult( |
| 105 | + spec_path="<test>", |
| 106 | + started_at=now, |
| 107 | + finished_at=now, |
| 108 | + base_model_id="b", |
| 109 | + adapter_id="a", |
| 110 | + sway_version="0.0.0", |
| 111 | + probes=probes, |
| 112 | + ) |
| 113 | + |
| 114 | + # Default weights skew toward attribution (0.35 vs 0.30). |
| 115 | + default_score = compute_score(suite) |
| 116 | + # Override to favor adherence. |
| 117 | + skewed_score = compute_score( |
| 118 | + suite, |
| 119 | + weights={ |
| 120 | + "adherence": 0.9, |
| 121 | + "attribution": 0.05, |
| 122 | + "calibration": 0.025, |
| 123 | + "ablation": 0.025, |
| 124 | + "baseline": 0.0, |
| 125 | + }, |
| 126 | + ) |
| 127 | + assert skewed_score.overall > default_score.overall, ( |
| 128 | + f"expected skewed_score > default; got skewed={skewed_score.overall:.3f}, " |
| 129 | + f"default={default_score.overall:.3f}" |
| 130 | + ) |