1 """Integration test: warm-backend speedup is real.
2
3 Sprint 36 prove-the-value: first ``POST /run`` cold-loads the HF
4 backend (~tens of seconds even with the tiny model), and subsequent
5 runs reuse the cached backend. The exact speedup depends on the
6 host (cache state, disk speed, CPU vs GPU); we assert the *shape*
7 of the curve — call N+1 should be substantially faster than call 1
8 once the load is amortized.
9 """

from __future__ import annotations

from pathlib import Path

import pytest

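# Skip the whole module cleanly when the optional serving stack is absent.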
pytest.importorskip("fastapi")
pytest.importorskip("httpx")
pytest.importorskip("torch")

from fastapi.testclient import TestClient  # noqa: E402

from dlm_sway.serve.app import create_app  # noqa: E402
from dlm_sway.serve.cache import BackendCache  # noqa: E402

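# Module-wide markers: the test loads a real HF backend (slow) and may
# touch the network on a cold model cache (online).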
pytestmark = [pytest.mark.slow, pytest.mark.online]


def _spec_payload(tiny_model_dir: Path) -> dict[str, object]:
    """A backend-touching delta_kl spec against the tiny model."""
    return {
        "version": 1,
        "models": {
            "base": {
                "kind": "hf",
                "base": str(tiny_model_dir),
                "dtype": "fp32",
                "device": "cpu",
            },
            "ft": {
                "kind": "hf",
                "base": str(tiny_model_dir),
                "dtype": "fp32",
                "device": "cpu",
            },
        },
        "defaults": {"seed": 0, "differential": True},
        "suite": [
            {"name": "dk", "kind": "delta_kl", "prompts": ["hello"]},
        ],
    }

def test_warm_backend_speedup(tiny_model_dir: Path) -> None:
    """First call pays the load; runs 2 and 3 reuse the cache.

    The assertion is intentionally loose — we want to catch the case
    where the cache silently misses (every call cold-loads), not
    pin a specific ratio that varies with host hardware.
    """
    cache = BackendCache(max_size=1)
    app = create_app(cache=cache)
    payload = _spec_payload(tiny_model_dir)

    with TestClient(app) as client:
        first = client.post("/run", json={"spec": payload})
        assert first.status_code == 200, first.text
        first_body = first.json()
        first_seconds = first_body["request_seconds"]

        second = client.post("/run", json={"spec": payload})
        assert second.status_code == 200, second.text
        second_seconds = second.json()["request_seconds"]

        third = client.post("/run", json={"spec": payload})
        assert third.status_code == 200, third.text
        third_seconds = third.json()["request_seconds"]

        # The cache must report exactly one loaded backend after three
        # calls against the same spec — confirms cache hit, not miss.
        health = client.get("/health").json()
        assert len(health["loaded_models"]) == 1

        # Warm calls should be at least 30% faster than the first. On a
        # tiny model + cold disk, first ≈ 5-15s and warm ≈ 1-3s; the loose
        # margin tolerates timing jitter while still failing the regression
        # where every call rebuilds the backend (which would show
        # warm ≈ first within the same order of magnitude).
        assert second_seconds < first_seconds * 0.7, (
            f"warm path not detected: first={first_seconds:.2f}s "
            f"second={second_seconds:.2f}s third={third_seconds:.2f}s"
        )
        assert third_seconds < first_seconds * 0.7, (
            f"warm path not detected: first={first_seconds:.2f}s "
            f"third={third_seconds:.2f}s"
        )