Python · 5250 bytes Raw Blame History
1 """CLI tests for ``sway compare`` (S11)."""
2
3 from __future__ import annotations
4
5 import json
6 from pathlib import Path
7
8 from typer.testing import CliRunner
9
10 from dlm_sway.cli.app import app
11
12
def _write_run(path: Path, *, timestamp: str, score: float, probe_score: float) -> None:
    """Serialise a minimal single-probe run result to ``path`` as JSON.

    Only the fields that ``report.from_json`` and ``compare`` actually read
    are populated: the overall score, one probe (name + score) and the
    timestamp, which is reused for both ``started_at`` and ``finished_at``.
    """
    # The single probe every generated run carries.
    probe_entry = {
        "name": "dk",
        "kind": "delta_kl",
        "verdict": "pass",
        "score": probe_score,
        "message": f"dk score={probe_score}",
    }
    # Composite score section; components and findings are left empty.
    score_block = {
        "overall": score,
        "band": "healthy",
        "components": {},
        "findings": [],
    }
    document = {
        "schema_version": 1,
        "sway_version": "0.1.0.dev0",
        "base_model_id": "base",
        "adapter_id": "adp",
        "started_at": timestamp,
        "finished_at": timestamp,
        "score": score_block,
        "probes": [probe_entry],
    }
    serialized = json.dumps(document)
    path.write_text(serialized, encoding="utf-8")
43
44
class TestCompareCli:
    """Exercise the ``sway compare`` command through Typer's ``CliRunner``.

    Each test writes minimal result files into ``tmp_path`` with
    ``_write_run`` and checks the exit code and, where relevant, the
    rendered output.

    Failure diagnostics and error-text checks read ``result.output`` (the
    combined stdout/stderr stream) rather than ``result.stderr``, because
    older Click releases raise ``ValueError`` on ``result.stderr`` when the
    runner was not asked to capture stderr separately.
    """

    @staticmethod
    def _write_pair(tmp_path: Path, *, first: float, second: float) -> tuple[Path, Path]:
        """Write ``a.json`` and ``b.json`` into ``tmp_path`` and return both paths.

        ``first`` / ``second`` are used as both the overall score and the
        ``dk`` probe score of the respective run; the runs are stamped one
        day apart, ``a`` being the earlier one.
        """
        a = tmp_path / "a.json"
        b = tmp_path / "b.json"
        _write_run(a, timestamp="2026-01-01T12:00:00+00:00", score=first, probe_score=first)
        _write_run(b, timestamp="2026-01-02T12:00:00+00:00", score=second, probe_score=second)
        return a, b

    def test_two_files_terminal_default(self, tmp_path: Path) -> None:
        """Default (terminal) output mentions the command, the probe and the composite."""
        a, b = self._write_pair(tmp_path, first=0.80, second=0.82)
        result = CliRunner().invoke(app, ["compare", str(a), str(b)])
        assert result.exit_code == 0, result.output
        assert "sway compare" in result.stdout
        assert "dk" in result.stdout
        assert "composite" in result.stdout

    def test_markdown_format(self, tmp_path: Path) -> None:
        """``--format md`` renders a Markdown heading and a probe table."""
        a, b = self._write_pair(tmp_path, first=0.80, second=0.82)
        result = CliRunner().invoke(app, ["compare", str(a), str(b), "--format", "md"])
        assert result.exit_code == 0, result.output
        # Markdown header + table pipes.
        assert "# sway compare" in result.stdout
        assert "| probe |" in result.stdout

    def test_json_format(self, tmp_path: Path) -> None:
        """``--format json`` prints a parseable document with labels and scores."""
        a, b = self._write_pair(tmp_path, first=0.80, second=0.82)
        result = CliRunner().invoke(app, ["compare", str(a), str(b), "--format", "json"])
        assert result.exit_code == 0, result.output
        # Parse stdout only, so anything written to stderr cannot corrupt the JSON.
        parsed = json.loads(result.stdout)
        # NOTE(review): labels presumably come from the file stems — confirm in ``compare``.
        assert parsed["labels"] == ["a", "b"]
        assert "dk" in parsed["scores"]

    def test_fewer_than_two_files_exits_2(self, tmp_path: Path) -> None:
        """A single input file is rejected with exit code 2 and an explanatory message."""
        a = tmp_path / "a.json"
        _write_run(a, timestamp="2026-01-01T12:00:00+00:00", score=0.80, probe_score=0.80)
        result = CliRunner().invoke(app, ["compare", str(a)])
        assert result.exit_code == 2, result.output
        # ``output`` holds the message whichever stream the CLI wrote it to.
        assert "at least two" in result.output

    def test_unreadable_file_exits_2(self, tmp_path: Path) -> None:
        """An input that is not valid JSON makes the command exit with code 2."""
        a = tmp_path / "a.json"
        b = tmp_path / "b.json"
        _write_run(a, timestamp="2026-01-01T12:00:00+00:00", score=0.80, probe_score=0.80)
        b.write_text("{ not valid json", encoding="utf-8")
        result = CliRunner().invoke(app, ["compare", str(a), str(b)])
        assert result.exit_code == 2, result.output

    def test_fail_on_regression_exits_0_when_improving(self, tmp_path: Path) -> None:
        """With the gate set to 0.10, an improving score still exits 0."""
        a, b = self._write_pair(tmp_path, first=0.80, second=0.85)
        result = CliRunner().invoke(
            app, ["compare", str(a), str(b), "--fail-on-regression", "0.10"]
        )
        assert result.exit_code == 0, result.output

    def test_fail_on_regression_exits_1_when_probe_drops(self, tmp_path: Path) -> None:
        """A probe drop larger than the threshold makes the command exit 1."""
        # dk dropped 0.20 — above the 0.10 threshold.
        a, b = self._write_pair(tmp_path, first=0.80, second=0.60)
        result = CliRunner().invoke(
            app, ["compare", str(a), str(b), "--fail-on-regression", "0.10"]
        )
        assert result.exit_code == 1, result.output

    def test_fail_on_regression_zero_disables_gate(self, tmp_path: Path) -> None:
        """A threshold of 0 turns the regression gate off."""
        # Drop ≥0.10 but threshold=0 → no gate.
        a, b = self._write_pair(tmp_path, first=0.80, second=0.50)
        result = CliRunner().invoke(app, ["compare", str(a), str(b), "--fail-on-regression", "0"])
        assert result.exit_code == 0, result.output