sway Public

Watch 0 Fork 0 Star 0

Python · 3827 bytes Raw Blame History

  
        1
        """Integration test: ``sway gate`` exits non-zero on FAIL (C6).
      
        2
        
        3
        Audit 01 flagged that no test asserts the gate's CI contract — the
      
        4
        whole point of ``sway gate`` is to fail a CI run when probes fail or
      
        5
        the composite score drops below the coverage threshold. This test
      
        6
        spawns the CLI via Typer's ``CliRunner`` (in-process, fast) and pins
      
        7
        the exit-code behavior on three scenarios:
      
        8
        
        9
        1. Spec that produces only PASS verdicts → exit 0.
      
        10
        2. Spec that produces a FAIL verdict → exit 1.
      
        11
        3. Spec where the composite score is below an explicit ``--threshold``
      
        12
           override → exit 1.
      
        13
        
        14
        We swap ``backends.build`` for a dummy-backend factory so the test
      
        15
        doesn't need a real HF model. That's a unit-style shortcut around the
      
        16
        ``backends/__init__.py:build`` rejection of ``kind="dummy"``; the
      
        17
        gate's own logic is what's under test.
      
        18
        """
      
        19
        
        20
        from __future__ import annotations
      
        21
        
        22
        from pathlib import Path
      
        23
        
        24
        import pytest
      
        25
        from typer.testing import CliRunner
      
        26
        
        27
        from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
      
        28
        from dlm_sway.cli.app import app
      
        29
        
        30
        
        31
        @pytest.fixture
        def stub_build_backend(monkeypatch: pytest.MonkeyPatch):
            """Swap ``dlm_sway.backends.build`` for a factory that returns a dummy backend.

            While the fixture is active, every call to ``build`` -- whatever
            arguments it receives -- yields a new ``DummyDifferentialBackend``
            assembled from two default-constructed ``DummyResponses``. The patch
            is installed through ``monkeypatch``, so it only lasts for the
            requesting test.
            """
            # The CLI resolves ``build`` lazily inside ``_execute_spec``, so the
            # attribute to replace is the module-level one on ``dlm_sway.backends``.
            import dlm_sway.backends as sway_backends

            def _dummy_backend(*_ignored_args, **_ignored_kwargs):
                # Arguments are accepted only for call compatibility; none are used.
                base_side = DummyResponses()
                ft_side = DummyResponses()
                return DummyDifferentialBackend(base=base_side, ft=ft_side)

            monkeypatch.setattr(sway_backends, "build", _dummy_backend)
      
        43
        
        44
        
        45
        def _write_spec(path: Path, *, prompts: list[str], threshold: float, assert_mean: float) -> None:
            """Write a minimal single-probe sway spec (YAML) to ``path``.

            The spec declares two ``hf`` models (``base`` and ``ft``) that both
            point at a stub adapter path, and a suite with one ``delta_kl`` probe
            named ``dk``.

            Args:
                path: File to create or overwrite with the spec text.
                prompts: Prompts handed to the ``delta_kl`` probe.
                threshold: Value written to ``defaults.coverage_threshold``.
                assert_mean: Value written to the probe's ``assert_mean_gte``.
            """
            import json

            # A JSON array is a valid YAML flow sequence, and JSON string escaping
            # (quotes, backslashes, newlines) is understood by YAML double-quoted
            # scalars. ``repr()`` only happened to work for trivial prompts.
            prompts_yaml = json.dumps(prompts, ensure_ascii=False)

            # Each line is spelled out so the YAML nesting is explicit and does
            # not depend on how this source file itself is indented.
            spec_lines = [
                "version: 1",
                "models:",
                "  base:",
                "    base: stub",
                "    kind: hf",
                "    adapter: /tmp/stub-adapter",
                "  ft:",
                "    base: stub",
                "    kind: hf",
                "    adapter: /tmp/stub-adapter",
                "defaults:",
                "  seed: 0",
                f"  coverage_threshold: {threshold}",
                "suite:",
                "  - name: dk",
                "    kind: delta_kl",
                f"    prompts: {prompts_yaml}",
                f"    assert_mean_gte: {assert_mean}",
            ]

            # Explicit encoding keeps the file independent of the platform locale.
            path.write_text("\n".join(spec_lines), encoding="utf-8")
      
        68
        
        69
        
        70
        def test_gate_exits_zero_on_pass(stub_build_backend, tmp_path: Path) -> None:
            """``sway gate`` exits 0 when the spec's only probe passes.

            With the dummy backend the probe is expected to see zero divergence,
            so ``assert_mean_gte=0.0`` is satisfied and the verdict is PASS.
            """
            spec_file = tmp_path / "pass.yaml"
            _write_spec(spec_file, prompts=["q1", "q2"], threshold=0.0, assert_mean=0.0)

            gate_argv = ["gate", str(spec_file)]
            outcome = CliRunner().invoke(app, gate_argv)

            assert outcome.exit_code == 0, (
                f"expected exit 0; got {outcome.exit_code}\nstdout: {outcome.stdout}"
            )
      
        78
        
        79
        
        80
        def test_gate_exits_one_on_fail(stub_build_backend, tmp_path: Path) -> None:
            """``sway gate`` exits 1 and reports FAILED when a probe fails.

            With the dummy backend the probe is expected to see zero divergence,
            which cannot satisfy ``assert_mean_gte=0.5``, so the verdict is FAIL.
            """
            spec_file = tmp_path / "fail.yaml"
            _write_spec(spec_file, prompts=["q1", "q2"], threshold=0.0, assert_mean=0.5)

            gate_argv = ["gate", str(spec_file)]
            outcome = CliRunner().invoke(app, gate_argv)

            assert outcome.exit_code == 1, (
                f"expected exit 1 on FAIL; got {outcome.exit_code}\nstdout: {outcome.stdout}"
            )
            assert "FAILED" in outcome.stdout
      
        89
        
        90
        
        91
        def test_gate_exits_one_when_below_threshold(stub_build_backend, tmp_path: Path) -> None:
            """A passing probe still fails the gate when the score is under ``--threshold``."""
            spec_file = tmp_path / "below.yaml"
            _write_spec(spec_file, prompts=["q1", "q2"], threshold=0.0, assert_mean=0.0)

            # ``assert_mean_gte=0.0`` always holds, so on verdict alone the gate
            # would exit 0. The ``--threshold`` override is set above any
            # composite score the dummy backend can reach (delta_kl's score is
            # bounded by the JS-ln(2)-normalized mean, well under 1.0 here), which
            # isolates the score-only failure path.
            gate_argv = ["gate", str(spec_file), "--threshold", "0.999"]
            outcome = CliRunner().invoke(app, gate_argv)

            assert outcome.exit_code == 1, (
                f"expected exit 1 below threshold; got {outcome.exit_code}\nstdout: {outcome.stdout}"
            )
            assert "FAILED" in outcome.stdout

1	"""Integration test: ``sway gate`` exits non-zero on FAIL (C6).
2
3	Audit 01 flagged that no test asserts the gate's CI contract — the
4	whole point of ``sway gate`` is to fail a CI run when probes fail or
5	the composite score drops below the coverage threshold. This test
6	spawns the CLI via Typer's ``CliRunner`` (in-process, fast) and pins
7	the exit-code behavior on three scenarios:
8
9	1. Spec that produces only PASS verdicts → exit 0.
10	2. Spec that produces a FAIL verdict → exit 1.
11	3. Spec where the composite score is below an explicit ``--threshold``
12	override → exit 1.
13
14	We swap ``backends.build`` for a dummy-backend factory so the test
15	doesn't need a real HF model. That's a unit-style shortcut around the
16	``backends/__init__.py:build`` rejection of ``kind="dummy"``; the
17	gate's own logic is what's under test.
18	"""
19
20	from __future__ import annotations
21
22	from pathlib import Path
23
24	import pytest
25	from typer.testing import CliRunner
26
27	from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
28	from dlm_sway.cli.app import app
29
30
@pytest.fixture
def stub_build_backend(monkeypatch: pytest.MonkeyPatch):
    """Replace ``backends.build`` with a dummy-returning factory.

    The replacement accepts and ignores any positional and keyword
    arguments, so it can stand in for ``build`` regardless of how the CLI
    calls it, and returns a ``DummyDifferentialBackend`` made from two
    default ``DummyResponses``.
    """

    # ``*_args, **_kwargs`` (not ``_args, *_kwargs``): the stub must tolerate
    # zero positional arguments as well as keyword arguments.
    def _factory(*_args, **_kwargs):
        return DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())

    # The CLI imports ``build`` lazily inside ``_execute_spec``; patch
    # the module-level attribute on ``dlm_sway.backends``.
    import dlm_sway.backends as backends_mod

    monkeypatch.setattr(backends_mod, "build", _factory)
43
44
def _write_spec(path: Path, *, prompts: list[str], threshold: float, assert_mean: float) -> None:
    """Write a minimal single-probe sway spec (YAML) to ``path``.

    The spec declares two ``hf`` models (``base`` and ``ft``) that both
    point at a stub adapter path, and a suite with one ``delta_kl`` probe
    named ``dk``.

    Args:
        path: File to create or overwrite with the spec text.
        prompts: Prompts handed to the ``delta_kl`` probe.
        threshold: Value written to ``defaults.coverage_threshold``.
        assert_mean: Value written to the probe's ``assert_mean_gte``.
    """
    import json

    # A JSON array is a valid YAML flow sequence, and JSON string escaping
    # (quotes, backslashes, newlines) is understood by YAML double-quoted
    # scalars. ``repr()`` only happened to work for trivial prompts.
    prompts_yaml = json.dumps(prompts, ensure_ascii=False)

    # Each line carries its own leading spaces so the mapping/sequence
    # nesting is explicit and cannot be lost to source reformatting.
    spec_lines = [
        "version: 1",
        "models:",
        "  base:",
        "    base: stub",
        "    kind: hf",
        "    adapter: /tmp/stub-adapter",
        "  ft:",
        "    base: stub",
        "    kind: hf",
        "    adapter: /tmp/stub-adapter",
        "defaults:",
        "  seed: 0",
        f"  coverage_threshold: {threshold}",
        "suite:",
        "  - name: dk",
        "    kind: delta_kl",
        f"    prompts: {prompts_yaml}",
        f"    assert_mean_gte: {assert_mean}",
    ]

    # Explicit encoding keeps the file independent of the platform locale.
    path.write_text("\n".join(spec_lines), encoding="utf-8")
68
69
def test_gate_exits_zero_on_pass(stub_build_backend, tmp_path: Path) -> None:
    """``sway gate`` exits 0 when the spec's only probe passes.

    With the dummy backend the probe is expected to see zero divergence,
    so ``assert_mean_gte=0.0`` is satisfied and the verdict is PASS.
    """
    spec_file = tmp_path / "pass.yaml"
    _write_spec(spec_file, prompts=["q1", "q2"], threshold=0.0, assert_mean=0.0)

    gate_argv = ["gate", str(spec_file)]
    outcome = CliRunner().invoke(app, gate_argv)

    assert outcome.exit_code == 0, (
        f"expected exit 0; got {outcome.exit_code}\nstdout: {outcome.stdout}"
    )
78
79
def test_gate_exits_one_on_fail(stub_build_backend, tmp_path: Path) -> None:
    """``sway gate`` exits 1 and reports FAILED when a probe fails.

    With the dummy backend the probe is expected to see zero divergence,
    which cannot satisfy ``assert_mean_gte=0.5``, so the verdict is FAIL.
    """
    spec_file = tmp_path / "fail.yaml"
    _write_spec(spec_file, prompts=["q1", "q2"], threshold=0.0, assert_mean=0.5)

    gate_argv = ["gate", str(spec_file)]
    outcome = CliRunner().invoke(app, gate_argv)

    assert outcome.exit_code == 1, (
        f"expected exit 1 on FAIL; got {outcome.exit_code}\nstdout: {outcome.stdout}"
    )
    assert "FAILED" in outcome.stdout
89
90
def test_gate_exits_one_when_below_threshold(stub_build_backend, tmp_path: Path) -> None:
    """A passing probe still fails the gate when the score is under ``--threshold``."""
    spec_file = tmp_path / "below.yaml"
    _write_spec(spec_file, prompts=["q1", "q2"], threshold=0.0, assert_mean=0.0)

    # ``assert_mean_gte=0.0`` always holds, so on verdict alone the gate
    # would exit 0. The ``--threshold`` override is set above any
    # composite score the dummy backend can reach (delta_kl's score is
    # bounded by the JS-ln(2)-normalized mean, well under 1.0 here), which
    # isolates the score-only failure path.
    gate_argv = ["gate", str(spec_file), "--threshold", "0.999"]
    outcome = CliRunner().invoke(app, gate_argv)

    assert outcome.exit_code == 1, (
        f"expected exit 1 below threshold; got {outcome.exit_code}\nstdout: {outcome.stdout}"
    )
    assert "FAILED" in outcome.stdout