tenseleyflow/sway / 4c0d941

Browse files

tests/sway_gate_exit_code: pass / fail-verdict / below-threshold exit codes (C6)

Authored by espadonne
SHA
4c0d941f9ea161ca344d64b187840ce06a19df28
Parents
217ef1d
Tree
f36c6e0

1 changed file

StatusFile+-
A tests/integration/test_sway_gate_exit_code.py 104 0
tests/integration/test_sway_gate_exit_code.pyadded
@@ -0,0 +1,104 @@
1
+"""Integration test: ``sway gate`` exits non-zero on FAIL (C6).
2
+
3
+Audit 01 flagged that no test asserts the gate's CI contract — the
4
+whole point of ``sway gate`` is to fail a CI run when probes fail or
5
+the composite score drops below the coverage threshold. This test
6
+invokes the CLI via Typer's ``CliRunner`` (in-process, fast) and pins
7
+the exit-code behavior on three scenarios:
8
+
9
+1. Spec that produces only PASS verdicts → exit 0.
10
+2. Spec that produces a FAIL verdict → exit 1.
11
+3. Spec where the composite score is below an explicit ``--threshold``
12
+   override → exit 1.
13
+
14
+We swap ``backends.build`` for a dummy-backend factory so the test
15
+doesn't need a real HF model. That's a unit-style shortcut around the
16
+``backends/__init__.py:build`` rejection of ``kind="dummy"``; the
17
+gate's own logic is what's under test.
18
+"""
19
+
20
+from __future__ import annotations
21
+
22
+from pathlib import Path
23
+
24
+import pytest
25
+from typer.testing import CliRunner
26
+
27
+from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
28
+from dlm_sway.cli.app import app
29
+
30
+
31
@pytest.fixture
def stub_build_backend(monkeypatch: pytest.MonkeyPatch):
    """Swap ``dlm_sway.backends.build`` for a dummy-backend factory.

    While the fixture is active, every call to ``backends.build`` —
    whatever arguments it receives — returns a fresh
    ``DummyDifferentialBackend`` whose base and fine-tuned sides are both
    default-constructed ``DummyResponses``. ``monkeypatch`` restores the
    real attribute at teardown.
    """
    # Patch the attribute on the package itself: the CLI is expected to
    # look ``build`` up lazily (inside ``_execute_spec``), so replacing
    # the module-level name is what takes effect.
    import dlm_sway.backends as sway_backends

    def _make_dummy_backend(*_ignored_args, **_ignored_kwargs):
        base_side = DummyResponses()
        ft_side = DummyResponses()
        return DummyDifferentialBackend(base=base_side, ft=ft_side)

    monkeypatch.setattr(sway_backends, "build", _make_dummy_backend)
43
+
44
+
45
def _write_spec(path: Path, *, prompts: list[str], threshold: float, assert_mean: float) -> None:
    """Write a minimal single-probe sway spec (YAML) to *path*.

    The spec declares two ``hf`` models (``base`` and ``ft``) pointing at a
    stub adapter path, a fixed seed, and one ``delta_kl`` probe named ``dk``.

    Args:
        path: Destination file; created or overwritten.
        prompts: Prompts for the ``delta_kl`` probe.
        threshold: Value written to ``defaults.coverage_threshold``.
        assert_mean: Value written to the probe's ``assert_mean_gte``.
    """
    # Local import keeps this helper self-contained; ``json`` is stdlib.
    import json

    # Emit the prompt list as a JSON array rather than a Python ``repr``.
    # ``repr`` yields single-quoted strings with Python escapes (``\n``,
    # ``\'``), which YAML single-quoted scalars do not interpret — prompts
    # containing newlines, backslashes, or both quote kinds would be
    # corrupted or unparseable. A JSON array of double-quoted strings is a
    # YAML flow sequence whose escapes YAML understands.
    # ``ensure_ascii=False`` keeps non-ASCII text literal instead of
    # emitting ``\u`` surrogate pairs.
    prompts_yaml = json.dumps(prompts, ensure_ascii=False)

    spec_text = f"""
version: 1
models:
  base:
    base: stub
    kind: hf
    adapter: /tmp/stub-adapter
  ft:
    base: stub
    kind: hf
    adapter: /tmp/stub-adapter
defaults:
  seed: 0
  coverage_threshold: {threshold}
suite:
  - name: dk
    kind: delta_kl
    prompts: {prompts_yaml}
    assert_mean_gte: {assert_mean}
""".strip()

    # Pin the encoding so the file does not depend on the platform default.
    path.write_text(spec_text, encoding="utf-8")
68
+
69
+
70
def test_gate_exits_zero_on_pass(stub_build_backend, tmp_path: Path) -> None:
    """All-PASS spec → ``sway gate`` exits 0.

    The dummy backend yields zero divergence, so ``assert_mean_gte=0.0``
    is satisfied and the probe passes.
    """
    spec_file = tmp_path / "pass.yaml"
    _write_spec(spec_file, prompts=["q1", "q2"], threshold=0.0, assert_mean=0.0)

    runner = CliRunner()
    outcome = runner.invoke(app, ["gate", str(spec_file)])

    exit_code = outcome.exit_code
    assert exit_code == 0, f"expected exit 0; got {exit_code}\nstdout: {outcome.stdout}"
78
+
79
+
80
def test_gate_exits_one_on_fail(stub_build_backend, tmp_path: Path) -> None:
    """FAIL verdict → ``sway gate`` exits 1 and reports ``FAILED``.

    The dummy backend yields zero divergence, which cannot meet
    ``assert_mean_gte=0.5``, so the probe fails.
    """
    spec_file = tmp_path / "fail.yaml"
    _write_spec(spec_file, prompts=["q1", "q2"], threshold=0.0, assert_mean=0.5)

    runner = CliRunner()
    outcome = runner.invoke(app, ["gate", str(spec_file)])

    exit_code = outcome.exit_code
    assert exit_code == 1, f"expected exit 1 on FAIL; got {exit_code}\nstdout: {outcome.stdout}"
    assert "FAILED" in outcome.stdout
89
+
90
+
91
def test_gate_exits_one_when_below_threshold(stub_build_backend, tmp_path: Path) -> None:
    """Passing verdict but composite score under ``--threshold`` → exit 1."""
    spec_file = tmp_path / "below.yaml"
    _write_spec(spec_file, prompts=["q1", "q2"], threshold=0.0, assert_mean=0.0)

    # With assert_mean_gte=0.0 the probe's own criterion always holds, so
    # on verdict alone the gate would exit 0. Overriding the threshold to
    # 0.999 puts it above any composite score this setup can reach
    # (delta_kl's score is bounded by the JS-ln(2)-normalized mean, well
    # under 1.0 for the dummy backend), so only the score-based failure
    # path can produce the non-zero exit.
    cli_args = ["gate", str(spec_file), "--threshold", "0.999"]
    outcome = CliRunner().invoke(app, cli_args)

    exit_code = outcome.exit_code
    assert exit_code == 1, (
        f"expected exit 1 below threshold; got {exit_code}\nstdout: {outcome.stdout}"
    )
    assert "FAILED" in outcome.stdout