tenseleyflow/sway / 82c59c3

Add slow integration test for warm-backend speedup

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA: 82c59c35ffb72012ea4c29f209d6f41a24e5996b
Parents: 6d55ae1
Tree: 4963800

1 changed file

Status  File                                          +    -
A       tests/integration/test_serve_warm_path.py    93    0

tests/integration/test_serve_warm_path.py (added)
@@ -0,0 +1,93 @@
+"""Integration test: warm-backend speedup is real.
+
+Sprint 36 prove-the-value: first ``POST /run`` cold-loads the HF
+backend (~tens of seconds even with the tiny model), and subsequent
+runs reuse the cached backend. The exact speedup depends on the
+host (cache state, disk speed, CPU vs GPU); we assert the *shape*
+of the curve — call N+1 should be substantially faster than call 1
+once the load is amortized.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+pytest.importorskip("fastapi")
+pytest.importorskip("httpx")
+pytest.importorskip("torch")
+
+from fastapi.testclient import TestClient  # noqa: E402
+
+from dlm_sway.serve.app import create_app  # noqa: E402
+from dlm_sway.serve.cache import BackendCache  # noqa: E402
+
+pytestmark = [pytest.mark.slow, pytest.mark.online]
+
+
+def _spec_payload(tiny_model_dir: Path) -> dict[str, object]:
+    """A backend-touching delta_kl spec against the tiny model."""
+    return {
+        "version": 1,
+        "models": {
+            "base": {
+                "kind": "hf",
+                "base": str(tiny_model_dir),
+                "dtype": "fp32",
+                "device": "cpu",
+            },
+            "ft": {
+                "kind": "hf",
+                "base": str(tiny_model_dir),
+                "dtype": "fp32",
+                "device": "cpu",
+            },
+        },
+        "defaults": {"seed": 0, "differential": True},
+        "suite": [
+            {"name": "dk", "kind": "delta_kl", "prompts": ["hello"]},
+        ],
+    }
+
+
+def test_warm_backend_speedup(tiny_model_dir: Path) -> None:
+    """First call pays the load; runs 2 and 3 reuse the cache.
+
+    Assertion is intentionally loose — we want to catch the case
+    where the cache silently misses (every call cold-loads), not
+    pin a specific ratio that varies with host hardware.
+    """
+    cache = BackendCache(max_size=1)
+    app = create_app(cache=cache)
+    payload = _spec_payload(tiny_model_dir)
+
+    with TestClient(app) as client:
+        first = client.post("/run", json={"spec": payload})
+        assert first.status_code == 200, first.text
+        first_body = first.json()
+        first_seconds = first_body["request_seconds"]
+
+        second = client.post("/run", json={"spec": payload})
+        assert second.status_code == 200, second.text
+        second_seconds = second.json()["request_seconds"]
+
+        third = client.post("/run", json={"spec": payload})
+        assert third.status_code == 200, third.text
+        third_seconds = third.json()["request_seconds"]
+
+        # The cache must report exactly one loaded backend after three
+        # calls against the same spec — confirms cache hit, not miss.
+        health = client.get("/health").json()
+        assert len(health["loaded_models"]) == 1
+
+    # Warm calls should be at least 30% faster than the first. On a
+    # tiny model + cold disk, first ≈ 5-15s and warm ≈ 1-3s; the
+    # margin handles flaky timing without false-passing a regression
+    # to "every call rebuilds the backend" (which would show
+    # warm ≈ first within the same order of magnitude).
+    assert second_seconds < first_seconds * 0.7, (
+        f"warm path not detected: first={first_seconds:.2f}s "
+        f"second={second_seconds:.2f}s third={third_seconds:.2f}s"
+    )
+    assert third_seconds < first_seconds * 0.7
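
The final /health assertion leans on a contract this diff doesn't show: BackendCache holds at most max_size loaded backends, keyed by something derived from the model spec, so three identical runs leave exactly one entry. A minimal sketch of that contract, with illustrative names only (get_or_load and the string key are assumptions, not the real dlm_sway.serve.cache API):

    from collections import OrderedDict
    from typing import Any, Callable

    class LRUBackendCacheSketch:
        """Illustrative only: the shape of behavior the test asserts."""

        def __init__(self, max_size: int = 1) -> None:
            self.max_size = max_size
            self._entries: OrderedDict[str, Any] = OrderedDict()

        def get_or_load(self, key: str, loader: Callable[[], Any]) -> Any:
            if key in self._entries:
                # Warm hit: reuse the cached backend, no model load.
                self._entries.move_to_end(key)
                return self._entries[key]
            # Cold path: pay the load once, then cache the result.
            backend = loader()
            self._entries[key] = backend
            while len(self._entries) > self.max_size:
                self._entries.popitem(last=False)  # evict least recently used
            return backend

With max_size=1, identical specs hash to the same key, so calls 2 and 3 take the warm branch; that is exactly the curve shape the timing assertions check.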
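
To exercise just this test locally, something like the following should work, assuming the slow and online markers are registered in the project's pytest configuration and the tiny_model_dir fixture is provided by the integration conftest (neither is part of this diff):

    import pytest

    # Programmatic equivalent of the pytest CLI; returns an exit code.
    exit_code = pytest.main([
        "tests/integration/test_serve_warm_path.py",
        "-m", "slow and online",
        "-v",
    ])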