Python · 4724 bytes Raw Blame History
1 """Stronger-test #10 — ``run`` / ``gate`` / ``report --format junit`` agree.
2
3 B16 pinned the gate's PASS/FAIL accounting against ``sway run``'s
4 rendered table but didn't pin the ``report --format junit``
5 re-rendering of saved JSON. Schema drift between the writer and the
6 JUnit emitter would produce inconsistent PASS/FAIL tallies across the
7 three surfaces — a CI dashboard reading the JUnit file and a developer
8 reading the terminal output would see different numbers.
9
10 This test runs a controlled spec through all three CLI paths against
11 a stubbed dummy backend (the same pattern used by
12 ``test_sway_gate_exit_code.py``) and asserts the per-verdict tally
13 is identical on every surface.
14 """
15
16 from __future__ import annotations
17
18 import json
19 import xml.etree.ElementTree as ET
20 from collections import Counter
21 from pathlib import Path
22
23 import pytest
24 from typer.testing import CliRunner
25
26 from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
27 from dlm_sway.cli.app import app
28
29
@pytest.fixture
def stub_build_backend(monkeypatch: pytest.MonkeyPatch) -> None:
    """Patch ``backends.build`` so every CLI command gets a dummy backend.

    Keeps the test hermetic — no real HF model is ever loaded. Mirrors the
    equivalent fixture in ``tests/integration/test_sway_gate_exit_code.py``.
    """
    import dlm_sway.backends as backends_mod

    def _dummy_build(*_a: object, **_kw: object) -> DummyDifferentialBackend:
        # Ignore whatever spec/config the CLI passes; always hand back stubs.
        return DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())

    monkeypatch.setattr(backends_mod, "build", _dummy_build)
42
43
44 def _write_spec(path: Path) -> None:
45 """Two delta_kl probes with different thresholds — one passes on
46 the dummy backend's synthesized divergence, one fails."""
47 path.write_text(
48 """
49 version: 1
50 models:
51 base:
52 base: stub
53 kind: hf
54 adapter: /tmp/stub
55 ft:
56 base: stub
57 kind: hf
58 adapter: /tmp/stub
59 defaults:
60 seed: 0
61 coverage_threshold: 0.0
62 suite:
63 - name: dk_loose
64 kind: delta_kl
65 prompts: [p1, p2, p3, p4]
66 assert_mean_gte: 0.0
67 - name: dk_strict
68 kind: delta_kl
69 prompts: [p1, p2, p3, p4]
70 assert_mean_gte: 100.0
71 """.strip()
72 )
73
74
75 def _verdicts_from_json(payload: dict[str, object]) -> Counter[str]:
76 probes = payload["probes"]
77 assert isinstance(probes, list)
78 return Counter(str(p["verdict"]) for p in probes)
79
80
81 def _verdicts_from_junit(xml_text: str) -> Counter[str]:
82 """<testcase> without child → PASS; with failure/error/skipped → mapped accordingly."""
83 root = ET.fromstring(xml_text)
84 counts: Counter[str] = Counter()
85 for case in root.iter("testcase"):
86 if case.find("error") is not None:
87 counts["error"] += 1
88 elif case.find("failure") is not None:
89 counts["fail"] += 1
90 elif case.find("skipped") is not None:
91 counts["skip"] += 1
92 else:
93 counts["pass"] += 1
94 return counts
95
96
97 def _verdicts_from_stdout(stdout: str) -> Counter[str]:
98 """Lift per-row verdict tokens out of the markdown/terminal
99 table rendered to stdout. The verdict column carries a single
100 lowercased keyword in the fixed set."""
101 return Counter(
102 tok for tok in stdout.split() if tok in {"pass", "fail", "skip", "error", "warn"}
103 )
104
105
def test_gate_run_junit_agree_on_verdicts(
    stub_build_backend: None,  # noqa: ARG001
    tmp_path: Path,
) -> None:
    """Run one spec through ``run``, ``gate``, and ``report --format junit``
    and require an identical per-verdict tally on every surface."""
    spec_path = tmp_path / "sway.yaml"
    _write_spec(spec_path)
    cli = CliRunner()

    # Surface 1: ``sway run`` — its JSON artifact feeds the JUnit step below.
    json_path = tmp_path / "run.json"
    run_result = cli.invoke(app, ["run", str(spec_path), "--json", str(json_path)])
    assert run_result.exit_code == 0, run_result.stdout
    assert json_path.exists()
    run_payload = json.loads(json_path.read_text(encoding="utf-8"))
    run_tally = _verdicts_from_json(run_payload)

    # Surface 2: ``sway gate`` — exits 1 because one probe fails, but still
    # renders the verdict table to stdout.
    gate_result = cli.invoke(app, ["gate", str(spec_path)])
    assert gate_result.exit_code == 1, gate_result.stdout  # one failing probe
    gate_tally = _verdicts_from_stdout(gate_result.stdout)

    # Surface 3: ``sway report --format junit`` re-renders the saved JSON.
    junit_path = tmp_path / "run.junit.xml"
    report_result = cli.invoke(
        app, ["report", str(json_path), "--format", "junit", "--out", str(junit_path)]
    )
    assert report_result.exit_code == 0, report_result.stdout
    junit_tally = _verdicts_from_junit(junit_path.read_text(encoding="utf-8"))

    assert run_tally == gate_tally == junit_tally, (
        f"surfaces disagree:\n"
        f" run: {dict(run_tally)}\n"
        f" gate: {dict(gate_tally)}\n"
        f" junit: {dict(junit_tally)}"
    )
    assert run_tally["pass"] == 1
    assert run_tally["fail"] == 1