@@ -1387,3 +1387,95 @@ def unpack_cmd( |
| 1387 | 1387 | typer.echo(f" SWAY_NULL_CACHE_DIR={report.null_stats_dir} sway run {report.spec_path}") |
| 1388 | 1388 | else: |
| 1389 | 1389 | typer.echo(f" sway run {report.spec_path}") |
| 1390 | + |
| 1391 | + |
def serve_cmd(
    host: Annotated[
        str,
        typer.Option(
            "--host",
            help=(
                "Interface to bind. Default 127.0.0.1 (localhost only). "
                "Binding to 0.0.0.0 requires --api-key."
            ),
        ),
    ] = "127.0.0.1",
    port: Annotated[
        int,
        typer.Option("--port", help="TCP port to bind."),
    ] = 8787,
    max_loaded_models: Annotated[
        int,
        typer.Option(
            "--max-loaded-models",
            help=(
                "How many backends to keep warm in memory. Each loaded "
                "model holds its own VRAM/RAM; default 2 fits a 16 GB GPU "
                "with two ~1.5B fp16 adapters."
            ),
        ),
    ] = 2,
    api_key: Annotated[
        str | None,
        typer.Option(
            "--api-key",
            help=(
                "Bearer token required on every non-/health request. "
                "Required when --host is not loopback."
            ),
        ),
    ] = None,
    log_level: Annotated[
        str,
        typer.Option("--log-level", help="uvicorn log level."),
    ] = "info",
) -> None:
    """Run the warm-backend HTTP daemon (S36).

    First call loads the backend (~15s); subsequent calls reuse it
    (~2s). See ``sway run`` for the equivalent one-shot CLI.
    """
    # NOTE: the docstring above is kept short on purpose — Typer normally
    # surfaces a command's docstring as its --help text, so per-parameter
    # documentation lives in the ``help=`` strings of the options instead.
    #
    # Exit codes: every validation failure below exits with code 2
    # (missing [serve] extra, public bind without a usable API key,
    # --max-loaded-models < 1).

    # uvicorn ships only with the optional [serve] extra; fail with an
    # actionable message instead of a raw ImportError traceback.
    try:
        import uvicorn
    except ImportError as exc:
        typer.secho(
            "sway serve requires the [serve] extra: pip install 'dlm-sway[serve]'",
            fg=typer.colors.RED,
            err=True,
        )
        raise typer.Exit(code=2) from exc

    # Imported lazily so the rest of the CLI works without the [serve] extra.
    from dlm_sway.serve.app import create_app, parse_host_port
    from dlm_sway.serve.cache import BackendCache

    # Public-bind safety — refuse before any uvicorn startup work.
    # The key is tested for truthiness rather than ``is None`` so that an
    # empty ``--api-key ""`` cannot satisfy the guard; the banner below
    # already reports an empty key as ``auth=no``.
    loopback = host in ("127.0.0.1", "::1", "localhost")
    if not loopback and not api_key:
        typer.secho(
            f"refusing to bind {host}:{port} without --api-key. "
            "Either pass --api-key <key> or use --host 127.0.0.1.",
            fg=typer.colors.RED,
            err=True,
        )
        raise typer.Exit(code=2)

    # Return value intentionally unused; presumably this validates the
    # host/port pair and raises on bad input — confirm against
    # dlm_sway.serve.app.parse_host_port.
    parse_host_port(host, port)
    if max_loaded_models < 1:
        typer.secho("--max-loaded-models must be >= 1", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=2)

    cache = BackendCache(max_size=max_loaded_models)
    app = create_app(cache=cache, api_key=api_key)

    # Startup banner: where we listen, cache size, and whether auth is on.
    typer.echo(f"sway serve {__version__} listening on http://{host}:{port}")
    typer.echo(f" max_loaded_models={max_loaded_models} auth={'yes' if api_key else 'no'}")
    if not loopback:
        typer.secho(
            " WARNING: bound to a non-loopback interface — anyone on "
            "this network with the API key can drive your GPU.",
            fg=typer.colors.YELLOW,
        )

    # Hands control to uvicorn with the app and settings chosen above.
    uvicorn.run(app, host=host, port=port, log_level=log_level)