@@ -0,0 +1,113 @@
+"""Integration test: warm-backend speedup is real.
+
+Sprint 36 prove-the-value: the first ``POST /run`` cold-loads the HF
+backend (tens of seconds even with the tiny model), and subsequent
+runs reuse the cached backend. The exact speedup depends on the
+host (cache state, disk speed, CPU vs GPU), so we assert the *shape*
+of the curve: call N+1 should be substantially faster than call 1
+once the load is amortized.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
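+# Optional serving deps: skip the whole module when any of them is missing.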
+pytest.importorskip("fastapi")
+pytest.importorskip("httpx")
+pytest.importorskip("torch")
+
+from fastapi.testclient import TestClient  # noqa: E402
+
+from dlm_sway.serve.app import create_app  # noqa: E402
+from dlm_sway.serve.cache import BackendCache  # noqa: E402
+
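+# Markers: slow because the cold call loads a real backend; online because
+# that load may reach the network.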
+pytestmark = [pytest.mark.slow, pytest.mark.online]
+
+
+def _spec_payload(tiny_model_dir: Path) -> dict[str, object]:
+    """A backend-touching delta_kl spec against the tiny model."""
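+    # base and ft both point at the same tiny model, so delta_kl still
+    # exercises the differential path while the cold load stays small.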
+    return {
+        "version": 1,
+        "models": {
+            "base": {
+                "kind": "hf",
+                "base": str(tiny_model_dir),
+                "dtype": "fp32",
+                "device": "cpu",
+            },
+            "ft": {
+                "kind": "hf",
+                "base": str(tiny_model_dir),
+                "dtype": "fp32",
+                "device": "cpu",
+            },
+        },
+        "defaults": {"seed": 0, "differential": True},
+        "suite": [
+            {"name": "dk", "kind": "delta_kl", "prompts": ["hello"]},
+        ],
+    }
+
+
+def test_warm_backend_speedup(tiny_model_dir: Path) -> None:
+    """First call pays the load; runs 2 and 3 reuse the cache.
+
+    The assertion is intentionally loose: we want to catch the case
+    where the cache silently misses (every call cold-loads), not to
+    pin a specific ratio that varies with host hardware.
+    """
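+    # max_size=1 is intentionally tight: both spec entries resolve to the
+    # same tiny model, so one cached backend should serve all three calls
+    # (checked via /health below).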
+    cache = BackendCache(max_size=1)
+    app = create_app(cache=cache)
+    payload = _spec_payload(tiny_model_dir)
+
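+    # Entering the TestClient context runs the app's startup/shutdown events.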
+    with TestClient(app) as client:
+        first = client.post("/run", json={"spec": payload})
+        assert first.status_code == 200, first.text
+        first_body = first.json()
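+        # request_seconds comes from the server response; on the first call
+        # it should include the backend load, not just inference.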
+        first_seconds = first_body["request_seconds"]
+
+        second = client.post("/run", json={"spec": payload})
+        assert second.status_code == 200, second.text
+        second_seconds = second.json()["request_seconds"]
+
+        third = client.post("/run", json={"spec": payload})
+        assert third.status_code == 200, third.text
+        third_seconds = third.json()["request_seconds"]
+
+        # After three calls against the same spec the cache must report
+        # exactly one loaded backend, confirming cache hits rather than
+        # silent misses.
+        health = client.get("/health").json()
+        assert len(health["loaded_models"]) == 1
+
+        # Warm calls should be at least 30% faster than the first. On a
+        # tiny model with a cold disk, first ≈ 5-15s and warm ≈ 1-3s; the
+        # margin absorbs flaky timing while still failing the regression
+        # where every call rebuilds the backend (warm would then land in
+        # the same ballpark as first).
+        assert second_seconds < first_seconds * 0.7, (
+            f"warm path not detected: first={first_seconds:.2f}s "
+            f"second={second_seconds:.2f}s third={third_seconds:.2f}s"
+        )
+        assert third_seconds < first_seconds * 0.7, (
+            f"warm path not detected: first={first_seconds:.2f}s "
+            f"third={third_seconds:.2f}s"
+        )
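+
+
+# Hypothetical invocation (the path is illustrative; the markers match
+# pytestmark above):
+#
+#   pytest -m "slow and online" path/to/this_test.py -q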