Python · 3827 bytes Raw Blame History
1 """Integration test: ``sway gate`` exits non-zero on FAIL (C6).
2
3 Audit 01 flagged that no test asserts the gate's CI contract — the
4 whole point of ``sway gate`` is to fail a CI run when probes fail or
5 the composite score drops below the coverage threshold. This test
6 spawns the CLI via Typer's ``CliRunner`` (in-process, fast) and pins
7 the exit-code behavior on three scenarios:
8
9 1. Spec that produces only PASS verdicts → exit 0.
10 2. Spec that produces a FAIL verdict → exit 1.
11 3. Spec where the composite score is below an explicit ``--threshold``
12 override → exit 1.
13
14 We swap ``backends.build`` for a dummy-backend factory so the test
15 doesn't need a real HF model. That's a unit-style shortcut around the
16 ``backends/__init__.py:build`` rejection of ``kind="dummy"``; the
17 gate's own logic is what's under test.
18 """
19
20 from __future__ import annotations
21
22 from pathlib import Path
23
24 import pytest
25 from typer.testing import CliRunner
26
27 from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
28 from dlm_sway.cli.app import app
29
30
@pytest.fixture
def stub_build_backend(monkeypatch: pytest.MonkeyPatch):
    """Swap ``dlm_sway.backends.build`` for a factory returning a dummy backend.

    The replacement is installed through ``monkeypatch``, so pytest restores
    the real ``build`` when the requesting test finishes.
    """
    import dlm_sway.backends as backends_mod

    def _make_dummy_backend(*_ignored_args, **_ignored_kwargs):
        # Arguments are accepted and discarded: every call yields a
        # differential backend built from two default ``DummyResponses``.
        return DummyDifferentialBackend(base=DummyResponses(), ft=DummyResponses())

    # Per the original author's note, the CLI resolves ``build`` lazily
    # inside ``_execute_spec``, so the attribute on the ``dlm_sway.backends``
    # module itself is the one that has to be replaced.
    monkeypatch.setattr(backends_mod, "build", _make_dummy_backend)
43
44
def _write_spec(path: Path, *, prompts: list[str], threshold: float, assert_mean: float) -> None:
    """Write a minimal single-probe sway spec (YAML) to *path*.

    The spec declares two ``hf`` models (``base`` and ``ft``) that both point
    at a stub adapter, and a suite with one ``delta_kl`` probe named ``dk``.

    Args:
        path: Destination file; created or overwritten.
        prompts: Prompts for the ``delta_kl`` probe.
        threshold: Value written to ``defaults.coverage_threshold``.
        assert_mean: Value written to the probe's ``assert_mean_gte``.
    """
    # Function-scope import keeps this helper self-contained.
    import json

    # Serialize the prompts as JSON rather than ``repr``: a JSON array of
    # double-quoted strings is a valid YAML flow sequence, and YAML processes
    # backslash escapes only inside double quotes. ``repr`` usually emits
    # single-quoted strings, in which ``\n`` would be read back as a literal
    # backslash followed by ``n``.
    prompts_yaml = json.dumps(prompts, ensure_ascii=False)

    # The YAML body sits at column 0 because only the leading/trailing
    # newlines are stripped; the nesting below is significant.
    path.write_text(
        f"""
version: 1
models:
  base:
    base: stub
    kind: hf
    adapter: /tmp/stub-adapter
  ft:
    base: stub
    kind: hf
    adapter: /tmp/stub-adapter
defaults:
  seed: 0
  coverage_threshold: {threshold}
suite:
  - name: dk
    kind: delta_kl
    prompts: {prompts_yaml}
    assert_mean_gte: {assert_mean}
""".strip(),
        # Explicit encoding so non-ASCII prompts do not depend on the locale.
        encoding="utf-8",
    )
68
69
def test_gate_exits_zero_on_pass(stub_build_backend, tmp_path: Path) -> None:
    """All-PASS spec → exit 0.

    The dummy backend is expected to show zero divergence, which still
    satisfies an ``assert_mean_gte`` floor of 0.0.
    """
    spec_path = tmp_path / "pass.yaml"
    _write_spec(spec_path, prompts=["q1", "q2"], threshold=0.0, assert_mean=0.0)

    runner = CliRunner()
    argv = ["gate", str(spec_path)]
    outcome = runner.invoke(app, argv)

    assert outcome.exit_code == 0, (
        f"expected exit 0; got {outcome.exit_code}\nstdout: {outcome.stdout}"
    )
78
79
def test_gate_exits_one_on_fail(stub_build_backend, tmp_path: Path) -> None:
    """FAIL verdict → exit 1.

    The dummy backend is expected to show zero divergence, which cannot meet
    an ``assert_mean_gte`` floor of 0.5.
    """
    spec_path = tmp_path / "fail.yaml"
    _write_spec(spec_path, prompts=["q1", "q2"], threshold=0.0, assert_mean=0.5)

    runner = CliRunner()
    argv = ["gate", str(spec_path)]
    outcome = runner.invoke(app, argv)

    assert outcome.exit_code == 1, (
        f"expected exit 1 on FAIL; got {outcome.exit_code}\nstdout: {outcome.stdout}"
    )
    # The exit code alone is not enough: also require the failure banner.
    assert "FAILED" in outcome.stdout
89
90
def test_gate_exits_one_when_below_threshold(stub_build_backend, tmp_path: Path) -> None:
    """Verdict is PASS, yet the composite score is under ``--threshold`` → exit 1."""
    spec_path = tmp_path / "below.yaml"
    _write_spec(spec_path, prompts=["q1", "q2"], threshold=0.0, assert_mean=0.0)

    # With assert_mean_gte=0.0 the probe's pass criterion always holds, so
    # judged on the verdict alone the gate would exit 0. Per the original
    # author's note, delta_kl's score is bounded by the JS-ln(2)-normalized
    # mean and stays well under 1.0 for the dummy backend, so a 0.999
    # threshold override isolates the score-only failure path.
    runner = CliRunner()
    argv = ["gate", str(spec_path), "--threshold", "0.999"]
    outcome = runner.invoke(app, argv)

    assert outcome.exit_code == 1, (
        f"expected exit 1 below threshold; got {outcome.exit_code}\nstdout: {outcome.stdout}"
    )
    assert "FAILED" in outcome.stdout