sway Public

Watch 0 Fork 0 Star 0

Python · 4724 bytes Raw Blame History

  
        1
        """Stronger-test #10 — ``run`` / ``gate`` / ``report --format junit`` agree.
      
        2
        
        3
        B16 pinned the gate's PASS/FAIL accounting against ``sway run``'s
      
        4
        rendered table but didn't pin the ``report --format junit``
      
        5
        re-rendering of saved JSON. Schema drift between the writer and the
      
        6
        JUnit emitter would produce inconsistent PASS/FAIL tallies across the
      
        7
        three surfaces — a CI dashboard reading the JUnit file and a developer
      
        8
        reading the terminal output would see different numbers.
      
        9
        
        10
        This test runs a controlled spec through all three CLI paths against
      
        11
        a stubbed dummy backend (the same pattern used by
      
        12
        ``test_sway_gate_exit_code.py``) and asserts the per-verdict tally
      
        13
        is identical on every surface.
      
        14
        """
      
        15
        
        16
        from __future__ import annotations
      
        17
        
        18
        import json
      
        19
        import xml.etree.ElementTree as ET
      
        20
        from collections import Counter
      
        21
        from pathlib import Path
      
        22
        
        23
        import pytest
      
        24
        from typer.testing import CliRunner
      
        25
        
        26
        from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
      
        27
        from dlm_sway.cli.app import app
      
        28
        
        29
        
        30
        @pytest.fixture
        def stub_build_backend(monkeypatch: pytest.MonkeyPatch) -> None:
            """Swap ``dlm_sway.backends.build`` for a factory that returns a dummy backend.

            With the patch in place the CLI commands under test receive a
            ``DummyDifferentialBackend`` instead of loading a real HF model. The
            fixture mirrors the one in
            ``tests/integration/test_sway_gate_exit_code.py``.

            Args:
                monkeypatch: pytest's patching fixture; the replacement is undone
                    automatically when the requesting test finishes.
            """
            import dlm_sway.backends as backends_mod

            def _dummy_backend(*_args: object, **_kwargs: object) -> DummyDifferentialBackend:
                # Whatever the CLI passes to ``build`` is accepted and ignored.
                base_side = DummyResponses()
                ft_side = DummyResponses()
                return DummyDifferentialBackend(base=base_side, ft=ft_side)

            monkeypatch.setattr(backends_mod, "build", _dummy_backend)
      
        42
        
        43
        
        44
        def _write_spec(path: Path) -> None:
            """Write the two-probe YAML spec used by this module to ``path``.

            Both probes are ``delta_kl`` over the same four prompts and differ only
            in ``assert_mean_gte`` (``0.0`` vs. ``100.0``): one is meant to pass on
            the dummy backend's synthesized divergence, the other to fail.

            Args:
                path: Destination file; created or overwritten.
            """
            import textwrap

            # ``dedent`` keeps the YAML nesting intact regardless of how deeply
            # this literal is indented in the source file.
            spec_text = textwrap.dedent(
                """
                version: 1
                models:
                  base:
                    base: stub
                    kind: hf
                    adapter: /tmp/stub
                  ft:
                    base: stub
                    kind: hf
                    adapter: /tmp/stub
                defaults:
                  seed: 0
                  coverage_threshold: 0.0
                suite:
                  - name: dk_loose
                    kind: delta_kl
                    prompts: [p1, p2, p3, p4]
                    assert_mean_gte: 0.0
                  - name: dk_strict
                    kind: delta_kl
                    prompts: [p1, p2, p3, p4]
                    assert_mean_gte: 100.0
                """
            ).strip()
            # Explicit encoding, matching the ``encoding="utf-8"`` reads used by
            # the test in this module, instead of the platform default.
            path.write_text(spec_text, encoding="utf-8")
      
        73
        
        74
        
        75
        def _verdicts_from_json(payload: dict[str, object]) -> Counter[str]:
            """Tally the ``verdict`` field of every entry under ``payload["probes"]``.

            Args:
                payload: Decoded JSON report; must carry a ``"probes"`` list.

            Returns:
                A counter keyed by each verdict converted with ``str``.
            """
            entries = payload["probes"]
            # Narrows ``object`` to ``list`` and fails loudly on a malformed payload.
            assert isinstance(entries, list)
            tally: Counter[str] = Counter()
            for entry in entries:
                tally[str(entry["verdict"])] += 1
            return tally
      
        79
        
        80
        
        81
        def _verdicts_from_junit(xml_text: str) -> Counter[str]:
            """Tally verdicts from the ``<testcase>`` elements of a JUnit XML document.

            Each testcase is classified by its child elements, checked in the
            order ``error``, ``failure``, ``skipped`` (counted as ``"error"``,
            ``"fail"`` and ``"skip"``); a testcase with none of them counts as
            ``"pass"``.

            Args:
                xml_text: The JUnit XML document as a string.

            Returns:
                A counter keyed by ``"error"``, ``"fail"``, ``"skip"`` or ``"pass"``.
            """
            # Order matters: the first matching child tag decides the verdict.
            child_to_verdict = (("error", "error"), ("failure", "fail"), ("skipped", "skip"))
            tally: Counter[str] = Counter()
            for testcase in ET.fromstring(xml_text).iter("testcase"):
                for child_tag, verdict in child_to_verdict:
                    if testcase.find(child_tag) is not None:
                        tally[verdict] += 1
                        break
                else:
                    tally["pass"] += 1
            return tally
      
        95
        
        96
        
        97
        def _verdicts_from_stdout(stdout: str) -> Counter[str]:
            """Count verdict keywords among the whitespace-separated tokens of ``stdout``.

            Only tokens exactly equal to ``pass``, ``fail``, ``skip``, ``error`` or
            ``warn`` are counted; the rendered table is expected to carry one such
            lowercase keyword per row in its verdict column.

            Args:
                stdout: Captured terminal/markdown output of a CLI command.

            Returns:
                A counter keyed by the verdict keywords that were found.
            """
            verdict_words = frozenset(("pass", "fail", "skip", "error", "warn"))
            tally: Counter[str] = Counter()
            for word in stdout.split():
                if word in verdict_words:
                    tally[word] += 1
            return tally
      
        104
        
        105
        
        106
        def test_gate_run_junit_agree_on_verdicts(
            stub_build_backend: None,  # noqa: ARG001
            tmp_path: Path,
        ) -> None:
            """``run``, ``gate`` and ``report --format junit`` report the same tallies.

            The spec is executed through all three CLI paths against the stubbed
            backend; the per-verdict counts taken from the saved JSON, the gate's
            stdout and the re-rendered JUnit XML must be equal, with exactly one
            ``pass`` and one ``fail``.
            """
            cli = CliRunner()
            spec_file = tmp_path / "sway.yaml"
            json_file = tmp_path / "run.json"
            junit_file = tmp_path / "run.junit.xml"
            _write_spec(spec_file)

            # Surface 1: ``sway run`` writes the JSON that is later re-rendered as JUnit.
            run_result = cli.invoke(app, ["run", str(spec_file), "--json", str(json_file)])
            assert run_result.exit_code == 0, run_result.stdout
            assert json_file.exists()
            from_run = _verdicts_from_json(json.loads(json_file.read_text(encoding="utf-8")))

            # Surface 2: ``sway gate`` still prints the verdicts when it exits 1
            # because of the one failing probe.
            gate_result = cli.invoke(app, ["gate", str(spec_file)])
            assert gate_result.exit_code == 1, gate_result.stdout
            from_gate = _verdicts_from_stdout(gate_result.stdout)

            # Surface 3: ``sway report --format junit`` re-renders the saved JSON.
            report_args = ["report", str(json_file), "--format", "junit", "--out", str(junit_file)]
            report_result = cli.invoke(app, report_args)
            assert report_result.exit_code == 0, report_result.stdout
            from_junit = _verdicts_from_junit(junit_file.read_text(encoding="utf-8"))

            assert from_run == from_gate == from_junit, (
                f"surfaces disagree:\n"
                f"  run:   {dict(from_run)}\n"
                f"  gate:  {dict(from_gate)}\n"
                f"  junit: {dict(from_junit)}"
            )
            assert from_run["pass"] == 1
            assert from_run["fail"] == 1

1	"""Stronger-test #10 — ``run`` / ``gate`` / ``report --format junit`` agree.
2
3	B16 pinned the gate's PASS/FAIL accounting against ``sway run``'s
4	rendered table but didn't pin the ``report --format junit``
5	re-rendering of saved JSON. Schema drift between the writer and the
6	JUnit emitter would produce inconsistent PASS/FAIL tallies across the
7	three surfaces — a CI dashboard reading the JUnit file and a developer
8	reading the terminal output would see different numbers.
9
10	This test runs a controlled spec through all three CLI paths against
11	a stubbed dummy backend (the same pattern used by
12	``test_sway_gate_exit_code.py``) and asserts the per-verdict tally
13	is identical on every surface.
14	"""
15
16	from __future__ import annotations
17
18	import json
19	import xml.etree.ElementTree as ET
20	from collections import Counter
21	from pathlib import Path
22
23	import pytest
24	from typer.testing import CliRunner
25
26	from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
27	from dlm_sway.cli.app import app
28
29
@pytest.fixture
def stub_build_backend(monkeypatch: pytest.MonkeyPatch) -> None:
    """Replace ``backends.build`` with a factory that returns a dummy backend.

    With the patch in place the CLI commands run without loading a real HF
    model. Same shape as the fixture in
    ``tests/integration/test_sway_gate_exit_code.py``.

    Args:
        monkeypatch: pytest's patching fixture; the replacement is undone
            automatically when the requesting test finishes.
    """

    def _factory(*_args: object, **_kwargs: object) -> DummyDifferentialBackend:
        # Accept and ignore any positional or keyword arguments so the stub
        # works no matter how the CLI calls ``build``.
        return DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())

    import dlm_sway.backends as backends_mod

    monkeypatch.setattr(backends_mod, "build", _factory)
42
43
def _write_spec(path: Path) -> None:
    """Write the two-probe YAML spec used by this module to ``path``.

    Both probes are ``delta_kl`` over the same four prompts and differ only
    in ``assert_mean_gte`` (``0.0`` vs. ``100.0``): one is meant to pass on
    the dummy backend's synthesized divergence, the other to fail.

    Args:
        path: Destination file; created or overwritten.
    """
    import textwrap

    # The nesting below is significant YAML structure (``models`` and
    # ``suite`` own their children); ``dedent`` strips only the common
    # source-level indentation.
    spec_text = textwrap.dedent(
        """
        version: 1
        models:
          base:
            base: stub
            kind: hf
            adapter: /tmp/stub
          ft:
            base: stub
            kind: hf
            adapter: /tmp/stub
        defaults:
          seed: 0
          coverage_threshold: 0.0
        suite:
          - name: dk_loose
            kind: delta_kl
            prompts: [p1, p2, p3, p4]
            assert_mean_gte: 0.0
          - name: dk_strict
            kind: delta_kl
            prompts: [p1, p2, p3, p4]
            assert_mean_gte: 100.0
        """
    ).strip()
    # Explicit encoding, matching the ``encoding="utf-8"`` reads used by the
    # test in this module, instead of the platform default.
    path.write_text(spec_text, encoding="utf-8")
73
74
def _verdicts_from_json(payload: dict[str, object]) -> Counter[str]:
    """Count how often each verdict occurs in ``payload["probes"]``.

    Args:
        payload: Decoded JSON report; must carry a ``"probes"`` list.

    Returns:
        A counter keyed by each probe's ``"verdict"`` converted with ``str``.
    """
    probe_rows = payload["probes"]
    # Narrows ``object`` to ``list`` and fails loudly on a malformed payload.
    assert isinstance(probe_rows, list)
    seen: Counter[str] = Counter()
    for row in probe_rows:
        seen[str(row["verdict"])] += 1
    return seen
79
80
def _verdicts_from_junit(xml_text: str) -> Counter[str]:
    """Tally verdicts from the ``<testcase>`` elements of a JUnit XML document.

    Each testcase is classified by its child elements, checked in the order
    ``error``, ``failure``, ``skipped`` (counted as ``"error"``, ``"fail"``
    and ``"skip"``); a testcase with none of them counts as ``"pass"``.

    Args:
        xml_text: The JUnit XML document as a string.

    Returns:
        A counter keyed by ``"error"``, ``"fail"``, ``"skip"`` or ``"pass"``.
    """
    # Order matters: the first matching child tag decides the verdict.
    precedence = (("error", "error"), ("failure", "fail"), ("skipped", "skip"))
    seen: Counter[str] = Counter()
    for testcase in ET.fromstring(xml_text).iter("testcase"):
        for child_tag, verdict in precedence:
            if testcase.find(child_tag) is not None:
                seen[verdict] += 1
                break
        else:
            seen["pass"] += 1
    return seen
95
96
def _verdicts_from_stdout(stdout: str) -> Counter[str]:
    """Count verdict keywords among the whitespace-separated tokens of ``stdout``.

    Only tokens exactly equal to ``pass``, ``fail``, ``skip``, ``error`` or
    ``warn`` are counted; the rendered table is expected to carry one such
    lowercase keyword per row in its verdict column.

    Args:
        stdout: Captured terminal/markdown output of a CLI command.

    Returns:
        A counter keyed by the verdict keywords that were found.
    """
    keywords = frozenset(("pass", "fail", "skip", "error", "warn"))
    seen: Counter[str] = Counter()
    for token in stdout.split():
        if token in keywords:
            seen[token] += 1
    return seen
104
105
def test_gate_run_junit_agree_on_verdicts(
    stub_build_backend: None,  # noqa: ARG001
    tmp_path: Path,
) -> None:
    """``run``, ``gate`` and ``report --format junit`` report the same tallies.

    The spec is executed through all three CLI paths against the stubbed
    backend; the per-verdict counts taken from the saved JSON, the gate's
    stdout and the re-rendered JUnit XML must be equal, with exactly one
    ``pass`` and one ``fail``.
    """
    cli = CliRunner()
    spec_file = tmp_path / "sway.yaml"
    json_file = tmp_path / "run.json"
    junit_file = tmp_path / "run.junit.xml"
    _write_spec(spec_file)

    # Surface 1: ``sway run`` writes the JSON that is later re-rendered as JUnit.
    run_result = cli.invoke(app, ["run", str(spec_file), "--json", str(json_file)])
    assert run_result.exit_code == 0, run_result.stdout
    assert json_file.exists()
    from_run = _verdicts_from_json(json.loads(json_file.read_text(encoding="utf-8")))

    # Surface 2: ``sway gate`` still prints the verdicts when it exits 1
    # because of the one failing probe.
    gate_result = cli.invoke(app, ["gate", str(spec_file)])
    assert gate_result.exit_code == 1, gate_result.stdout
    from_gate = _verdicts_from_stdout(gate_result.stdout)

    # Surface 3: ``sway report --format junit`` re-renders the saved JSON.
    report_args = ["report", str(json_file), "--format", "junit", "--out", str(junit_file)]
    report_result = cli.invoke(app, report_args)
    assert report_result.exit_code == 0, report_result.stdout
    from_junit = _verdicts_from_junit(junit_file.read_text(encoding="utf-8"))

    assert from_run == from_gate == from_junit, (
        f"surfaces disagree:\n"
        f" run: {dict(from_run)}\n"
        f" gate: {dict(from_gate)}\n"
        f" junit: {dict(from_junit)}"
    )
    assert from_run["pass"] == 1
    assert from_run["fail"] == 1