tenseleyflow/sway / 277fbdc

Browse files

tests: cross-verdict consistency — run/gate/junit tally agree (stronger-test #10)

Authored by espadonne
SHA
277fbdcf227eaf0b280ec789fd3571c93bc86865
Parents
cea35e6
Tree
0ad6b25

1 changed file

Status | File | + | −
A tests/unit/test_cross_verdict_consistency.py 142 0
tests/unit/test_cross_verdict_consistency.py — added
@@ -0,0 +1,142 @@
1
+"""Stronger-test #10 — ``run`` / ``gate`` / ``report --format junit`` agree.
2
+
3
+B16 pinned the gate's PASS/FAIL accounting against ``sway run``'s
4
+rendered table but didn't pin the ``report --format junit``
5
+re-rendering of saved JSON. Schema drift between the writer and the
6
+JUnit emitter would produce inconsistent PASS/FAIL tallies across the
7
+three surfaces — a CI dashboard reading the JUnit file and a developer
8
+reading the terminal output would see different numbers.
9
+
10
+This test runs a controlled spec through all three CLI paths against
11
+a stubbed dummy backend (the same pattern used by
12
+``test_sway_gate_exit_code.py``) and asserts the per-verdict tally
13
+is identical on every surface.
14
+"""
15
+
16
+from __future__ import annotations
17
+
18
+import json
19
+import xml.etree.ElementTree as ET
20
+from collections import Counter
21
+from pathlib import Path
22
+
23
+import pytest
24
+from typer.testing import CliRunner
25
+
26
+from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
27
+from dlm_sway.cli.app import app
28
+
29
+
30
@pytest.fixture
def stub_build_backend(monkeypatch: pytest.MonkeyPatch) -> None:
    """Patch ``backends.build`` so every CLI invocation receives a
    ``DummyDifferentialBackend`` instead of loading a real HF model.

    Mirrors the fixture used by
    ``tests/integration/test_sway_gate_exit_code.py``.
    """
    import dlm_sway.backends as backends_mod

    def _dummy_factory(*_a: object, **_kw: object) -> DummyDifferentialBackend:
        # Both sides of the differential pair are stubbed responses.
        return DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())

    monkeypatch.setattr(backends_mod, "build", _dummy_factory)
42
+
43
+
44
+def _write_spec(path: Path) -> None:
45
+    """Two delta_kl probes with different thresholds — one passes on
46
+    the dummy backend's synthesized divergence, one fails."""
47
+    path.write_text(
48
+        """
49
+version: 1
50
+models:
51
+  base:
52
+    base: stub
53
+    kind: hf
54
+    adapter: /tmp/stub
55
+  ft:
56
+    base: stub
57
+    kind: hf
58
+    adapter: /tmp/stub
59
+defaults:
60
+  seed: 0
61
+  coverage_threshold: 0.0
62
+suite:
63
+  - name: dk_loose
64
+    kind: delta_kl
65
+    prompts: [p1, p2, p3, p4]
66
+    assert_mean_gte: 0.0
67
+  - name: dk_strict
68
+    kind: delta_kl
69
+    prompts: [p1, p2, p3, p4]
70
+    assert_mean_gte: 100.0
71
+""".strip()
72
+    )
73
+
74
+
75
+def _verdicts_from_json(payload: dict[str, object]) -> Counter[str]:
76
+    probes = payload["probes"]
77
+    assert isinstance(probes, list)
78
+    return Counter(str(p["verdict"]) for p in probes)
79
+
80
+
81
+def _verdicts_from_junit(xml_text: str) -> Counter[str]:
82
+    """<testcase> without child → PASS; with failure/error/skipped → mapped accordingly."""
83
+    root = ET.fromstring(xml_text)
84
+    counts: Counter[str] = Counter()
85
+    for case in root.iter("testcase"):
86
+        if case.find("error") is not None:
87
+            counts["error"] += 1
88
+        elif case.find("failure") is not None:
89
+            counts["fail"] += 1
90
+        elif case.find("skipped") is not None:
91
+            counts["skip"] += 1
92
+        else:
93
+            counts["pass"] += 1
94
+    return counts
95
+
96
+
97
+def _verdicts_from_stdout(stdout: str) -> Counter[str]:
98
+    """Lift per-row verdict tokens out of the markdown/terminal
99
+    table rendered to stdout. The verdict column carries a single
100
+    lowercased keyword in the fixed set."""
101
+    return Counter(
102
+        tok for tok in stdout.split() if tok in {"pass", "fail", "skip", "error", "warn"}
103
+    )
104
+
105
+
106
def test_gate_run_junit_agree_on_verdicts(
    stub_build_backend: None,  # noqa: ARG001
    tmp_path: Path,
) -> None:
    """One spec, three CLI surfaces: the per-verdict tally must match."""
    spec_path = tmp_path / "sway.yaml"
    _write_spec(spec_path)
    cli = CliRunner()

    # Surface 1: ``sway run`` — also emits the JSON the report re-renders.
    json_out = tmp_path / "run.json"
    run_result = cli.invoke(app, ["run", str(spec_path), "--json", str(json_out)])
    assert run_result.exit_code == 0, run_result.stdout
    assert json_out.exists()
    run_verdicts = _verdicts_from_json(
        json.loads(json_out.read_text(encoding="utf-8"))
    )

    # Surface 2: ``sway gate`` — exits 1 (one failing probe) but still
    # renders the per-probe verdict table on stdout.
    gate_result = cli.invoke(app, ["gate", str(spec_path)])
    assert gate_result.exit_code == 1, gate_result.stdout
    gate_verdicts = _verdicts_from_stdout(gate_result.stdout)

    # Surface 3: ``sway report --format junit`` over the saved JSON.
    junit_out = tmp_path / "run.junit.xml"
    report_result = cli.invoke(
        app, ["report", str(json_out), "--format", "junit", "--out", str(junit_out)]
    )
    assert report_result.exit_code == 0, report_result.stdout
    junit_verdicts = _verdicts_from_junit(junit_out.read_text(encoding="utf-8"))

    assert run_verdicts == gate_verdicts == junit_verdicts, (
        f"surfaces disagree:\n"
        f"  run:   {dict(run_verdicts)}\n"
        f"  gate:  {dict(gate_verdicts)}\n"
        f"  junit: {dict(junit_verdicts)}"
    )
    # The controlled spec yields exactly one passing and one failing probe.
    assert run_verdicts["pass"] == 1
    assert run_verdicts["fail"] == 1