1 """Integration test: warm-backend speedup is real.
2
3 Sprint 36 prove-the-value: first ``POST /run`` cold-loads the HF
4 backend (~tens of seconds even with the tiny model), and subsequent
5 runs reuse the cached backend. The exact speedup depends on the
6 host (cache state, disk speed, CPU vs GPU); we assert the *shape*
7 of the curve — call N+1 should be substantially faster than call 1
8 once the load is amortized.
9 """

from __future__ import annotations

from pathlib import Path

import pytest

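# Skip the whole module cleanly when the optional serving stack is absent.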
pytest.importorskip("fastapi")
pytest.importorskip("httpx")
pytest.importorskip("torch")

from fastapi.testclient import TestClient  # noqa: E402

from dlm_sway.serve.app import create_app  # noqa: E402
from dlm_sway.serve.cache import BackendCache  # noqa: E402

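# Module-wide markers: the test loads a real HF backend (slow) and may
# touch the network on a cold model cache (online).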
pytestmark = [pytest.mark.slow, pytest.mark.online]


def _spec_payload(tiny_model_dir: Path) -> dict[str, object]:
    """A backend-touching delta_kl spec against the tiny model."""
    return {
        "version": 1,
        "models": {
            "base": {
                "kind": "hf",
                "base": str(tiny_model_dir),
                "dtype": "fp32",
                "device": "cpu",
            },
            "ft": {
                "kind": "hf",
                "base": str(tiny_model_dir),
                "dtype": "fp32",
                "device": "cpu",
            },
        },
        "defaults": {"seed": 0, "differential": True},
        "suite": [
            {"name": "dk", "kind": "delta_kl", "prompts": ["hello"]},
        ],
    }

def test_warm_backend_speedup(tiny_model_dir: Path) -> None:
    """First call pays the load; runs 2 and 3 reuse the cache.

    The assertion is intentionally loose — we want to catch the case
    where the cache silently misses (every call cold-loads), not
    pin a specific ratio that varies with host hardware.
    """
    cache = BackendCache(max_size=1)
    app = create_app(cache=cache)
    payload = _spec_payload(tiny_model_dir)

    with TestClient(app) as client:
        first = client.post("/run", json={"spec": payload})
        assert first.status_code == 200, first.text
        first_body = first.json()
        first_seconds = first_body["request_seconds"]

        second = client.post("/run", json={"spec": payload})
        assert second.status_code == 200, second.text
        second_seconds = second.json()["request_seconds"]

        third = client.post("/run", json={"spec": payload})
        assert third.status_code == 200, third.text
        third_seconds = third.json()["request_seconds"]

        # The cache must report exactly one loaded backend after three
        # calls against the same spec — confirms cache hit, not miss.
        health = client.get("/health").json()
        assert len(health["loaded_models"]) == 1

        # Warm calls should be at least 30% faster than the first. On a
        # tiny model + cold disk, first ≈ 5-15s and warm ≈ 1-3s; the loose
        # margin tolerates timing jitter while still failing the regression
        # where every call rebuilds the backend (which would show
        # warm ≈ first within the same order of magnitude).
        assert second_seconds < first_seconds * 0.7, (
            f"warm path not detected: first={first_seconds:.2f}s "
            f"second={second_seconds:.2f}s third={third_seconds:.2f}s"
        )
        assert third_seconds < first_seconds * 0.7, (
            f"warm path not detected: first={first_seconds:.2f}s "
            f"third={third_seconds:.2f}s"
        )