Add llama-server HTTP smoke
SHA: 82b797f4401c59ab54a21527c6cbc6229852205b
Parent: 135311c
Tree: 580c358

| Status | File | + | - |
|---|---|---|---|
| M | README.md | 1 | 2 |
| M | docs/cli/reference.md | 1 | 1 |
| M | src/dlm/cli/commands.py | 9 | 8 |
| M | src/dlm/export/errors.py | 4 | 0 |
| A | src/dlm/export/smoke.py | 228 | 0 |
| M | src/dlm/export/targets/llama_server.py | 51 | 23 |
| M | tests/unit/cli/test_export_target_flag.py | 7 | 2 |
| A | tests/unit/export/targets/test_llama_server_smoke.py | 90 | 0 |
| A | tests/unit/export/test_smoke.py | 128 | 0 |
README.md (modified)

@@ -20,8 +20,7 @@ Ollama and `llama-server`.
 **Status:** pre-v1.0, but far beyond the original MVP framing. The core
 author/train/prompt/export/pack/share loop is real, and newer runtime-target
 work is landing incrementally. Current export targets are `ollama` and
-`llama-server` (`llama-server` currently requires `--no-smoke` while the HTTP
-smoke harness lands).
+`llama-server`.
 
 ## What A `.dlm` Actually Is
 
docs/cli/reference.md (modified)

@@ -203,7 +203,7 @@ dlm export <path> [--target NAME] [--quant Q] [--merged [--dequantize]]
 
 | Option | Default | Notes |
 |---|---|---|
-| `--target NAME` | `ollama` | Export destination. Sprint 41 currently supports `ollama` and `llama-server`. The `llama-server` path writes launch artifacts against the existing GGUF export and currently requires `--no-smoke` while the HTTP smoke harness lands. |
+| `--target NAME` | `ollama` | Export destination. Sprint 41 currently supports `ollama` and `llama-server`. The `llama-server` path writes launch artifacts against the existing GGUF export and uses the shared OpenAI-compatible HTTP smoke harness when `--no-smoke` is not passed. |
 | `--quant Q` | frontmatter.export.default_quant | `Q4_K_M` / `Q5_K_M` / `Q6_K` / `Q8_0` / `F16`. |
 | `--merged` | false | Merge LoRA into base before quantizing. |
 | `--dequantize` | false | Required with `--merged` on a QLoRA adapter (pitfall #3). |
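With the gate removed, the smoke check is now opt-out rather than mandatory-off for `llama-server`. A hypothetical invocation pair (the `.dlm` path is made up; the flags are the documented ones):

    # Default: write launch artifacts, then run the HTTP smoke against them.
    dlm export notes/helper.dlm --target llama-server --quant Q4_K_M

    # Opt out, e.g. on a machine without the vendored llama-server binary.
    dlm export notes/helper.dlm --target llama-server --no-smoke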
src/dlm/cli/commands.py (modified)

@@ -1696,14 +1696,6 @@ def export_cmd(
     except UnknownExportTargetError as exc:
         console.print(f"[red]export:[/red] {exc}")
         raise typer.Exit(code=2) from exc
-    if resolved_target.name == "llama-server" and not no_smoke:
-        console.print(
-            "[red]export:[/red] --target llama-server currently requires "
-            "`--no-smoke`; the HTTP smoke harness lands in a follow-up "
-            "Sprint 41 slice."
-        )
-        raise typer.Exit(code=2)
-
     parsed = parse_file(path)
     adapters_declared = parsed.frontmatter.training.adapters
     if adapter is not None:
@@ -1970,6 +1962,13 @@ def export_cmd(
     except ExportError as exc:
         console.print(f"[red]export:[/red] {exc}")
         raise typer.Exit(code=1) from exc
+    llama_server_smoke = None if no_smoke else resolved_target.smoke_test(llama_server_result)
+    if llama_server_smoke is not None and not llama_server_smoke.ok:
+        console.print(
+            f"[red]smoke:[/red] {llama_server_smoke.detail}\n"
+            " re-run with `--no-smoke` to skip the smoke test."
+        )
+        raise typer.Exit(code=1)
 
     cached_tag = " [dim](cached base)[/dim]" if result.cached else ""
     console.print(f"[green]exported:[/green] {result.export_dir}{cached_tag}")
@@ -1981,6 +1980,8 @@ def export_cmd(
         console.print(f"target: {result.target}")
         console.print(f"launch: {llama_server_result.launch_script_path.name}")
         console.print(f"template: {llama_server_result.config_path.name}")
+        if llama_server_smoke is not None and llama_server_smoke.detail:
+            console.print(f"smoke: {llama_server_smoke.detail}")
         return
     if result.ollama_name:
         console.print(f"ollama: {result.ollama_name} (v{result.ollama_version})")
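On a passing run the CLI now surfaces the smoke detail (the first non-empty line of the model's reply) in the summary block. A hypothetical transcript with invented paths and reply text, following the `console.print` formats above:

    exported: ~/.dlm/exports/helper/Q4_K_M
    target: llama-server
    launch: llama-server_launch.sh
    template: chat-template.jinja
    smoke: Hello! How can I help you today?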
src/dlm/export/errors.py (modified)

@@ -81,6 +81,10 @@ class UnknownExportTargetError(ExportError):
         self.available = available
 
 
+class TargetSmokeError(ExportError):
+    """A runtime-target smoke check failed to start or answer correctly."""
+
+
 class ProcessorLoadError(ExportError):
     """HF-snapshot export couldn't load the processor for a VL/audio base.
 
src/dlm/export/smoke.py (added)

@@ -0,0 +1,228 @@
+"""Shared HTTP smoke helpers for OpenAI-compatible local runtimes."""
+
+from __future__ import annotations
+
+import json
+import socket
+import subprocess  # nosec B404
+import tempfile
+import time
+import urllib.error
+import urllib.request
+from collections.abc import Sequence
+from typing import TextIO
+
+from dlm.export.errors import TargetSmokeError
+
+_DEFAULT_HOST = "127.0.0.1"
+_DEFAULT_STARTUP_TIMEOUT_SECONDS = 30.0
+_DEFAULT_REQUEST_TIMEOUT_SECONDS = 5.0
+_DEFAULT_POLL_INTERVAL_SECONDS = 0.1
+_DEFAULT_PROMPT = "Hello."
+
+
+def reserve_local_port(host: str = _DEFAULT_HOST) -> int:
+    """Ask the OS for a free loopback TCP port."""
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+        sock.bind((host, 0))
+        return int(sock.getsockname()[1])
+
+
+def smoke_openai_compat_server(
+    command: Sequence[str],
+    *,
+    host: str = _DEFAULT_HOST,
+    port: int | None = None,
+    startup_timeout: float = _DEFAULT_STARTUP_TIMEOUT_SECONDS,
+    request_timeout: float = _DEFAULT_REQUEST_TIMEOUT_SECONDS,
+    poll_interval: float = _DEFAULT_POLL_INTERVAL_SECONDS,
+    prompt: str = _DEFAULT_PROMPT,
+) -> str:
+    """Start a local OpenAI-compatible server, wait for readiness, then chat."""
+
+    real_port = port if port is not None else reserve_local_port(host)
+    argv = _replace_or_append_flag(list(command), "--host", host)
+    argv = _replace_or_append_flag(argv, "--port", str(real_port))
+
+    with tempfile.TemporaryFile(mode="w+t", encoding="utf-8") as log:
+        proc = subprocess.Popen(  # nosec B603
+            argv,
+            stdout=log,
+            stderr=subprocess.STDOUT,
+            text=True,
+        )
+        try:
+            model_id = _wait_for_models(
+                proc,
+                log,
+                host=host,
+                port=real_port,
+                startup_timeout=startup_timeout,
+                request_timeout=request_timeout,
+                poll_interval=poll_interval,
+            )
+            content = _chat_completion(
+                host=host,
+                port=real_port,
+                model_id=model_id,
+                prompt=prompt,
+                request_timeout=request_timeout,
+            )
+            first = _first_non_empty_line(content)
+            if not first:
+                raise TargetSmokeError("openai-compatible smoke returned empty assistant content")
+            return first
+        finally:
+            _stop_process(proc)
+
+
+def _wait_for_models(
+    proc: subprocess.Popen[str],
+    log: TextIO,
+    *,
+    host: str,
+    port: int,
+    startup_timeout: float,
+    request_timeout: float,
+    poll_interval: float,
+) -> str | None:
+    deadline = time.monotonic() + startup_timeout
+    last_error: str | None = None
+    while time.monotonic() < deadline:
+        if proc.poll() is not None:
+            raise TargetSmokeError(
+                f"server exited before readiness (exit {proc.returncode}){_log_tail(log)}"
+            )
+        try:
+            return _fetch_model_id(host=host, port=port, request_timeout=request_timeout)
+        except (
+            OSError,
+            TimeoutError,
+            ValueError,
+            urllib.error.HTTPError,
+            urllib.error.URLError,
+        ) as exc:
+            last_error = f"{type(exc).__name__}: {exc}"
+        time.sleep(poll_interval)
+    suffix = f" last error: {last_error}." if last_error else "."
+    raise TargetSmokeError(
+        f"server did not become ready on http://{host}:{port}/v1/models within "
+        f"{startup_timeout:.1f}s.{suffix}{_log_tail(log)}"
+    )
+
+
+def _fetch_model_id(*, host: str, port: int, request_timeout: float) -> str | None:
+    req = urllib.request.Request(
+        f"http://{host}:{port}/v1/models",
+        headers={"Accept": "application/json"},
+        method="GET",
+    )
+    with urllib.request.urlopen(req, timeout=request_timeout) as resp:  # noqa: S310
+        payload = json.loads(resp.read())
+    data = payload.get("data")
+    if not isinstance(data, list) or not data:
+        return None
+    first = data[0]
+    if not isinstance(first, dict):
+        return None
+    model_id = first.get("id")
+    return model_id if isinstance(model_id, str) and model_id.strip() else None
+
+
+def _chat_completion(
+    *,
+    host: str,
+    port: int,
+    model_id: str | None,
+    prompt: str,
+    request_timeout: float,
+) -> str:
+    payload = {
+        "model": model_id or "dlm-smoke",
+        "messages": [{"role": "user", "content": prompt}],
+    }
+    req = urllib.request.Request(
+        f"http://{host}:{port}/v1/chat/completions",
+        data=json.dumps(payload).encode("utf-8"),
+        headers={
+            "Accept": "application/json",
+            "Content-Type": "application/json",
+        },
+        method="POST",
+    )
+    with urllib.request.urlopen(req, timeout=request_timeout) as resp:  # noqa: S310
+        body = json.loads(resp.read())
+    choices = body.get("choices")
+    if not isinstance(choices, list) or not choices:
+        raise TargetSmokeError("chat completion response missing choices")
+    first_choice = choices[0]
+    if not isinstance(first_choice, dict):
+        raise TargetSmokeError("chat completion response has non-object choices[0]")
+    message = first_choice.get("message")
+    if not isinstance(message, dict):
+        raise TargetSmokeError("chat completion response missing choices[0].message")
+    content = _normalize_message_content(message.get("content"))
+    if content is None:
+        raise TargetSmokeError(
+            "chat completion response missing non-empty choices[0].message.content"
+        )
+    return content
+
+
+def _normalize_message_content(content: object) -> str | None:
+    if isinstance(content, str):
+        stripped = content.strip()
+        return stripped if stripped else None
+    if isinstance(content, list):
+        parts: list[str] = []
+        for item in content:
+            if not isinstance(item, dict):
+                continue
+            text = item.get("text")
+            if isinstance(text, str) and text.strip():
+                parts.append(text.strip())
+        merged = "\n".join(parts).strip()
+        return merged if merged else None
+    return None
+
+
+def _replace_or_append_flag(argv: list[str], flag: str, value: str) -> list[str]:
+    updated = list(argv)
+    try:
+        idx = updated.index(flag)
+    except ValueError:
+        updated.extend([flag, value])
+        return updated
+    if idx + 1 >= len(updated):
+        updated.append(value)
+        return updated
+    updated[idx + 1] = value
+    return updated
+
+
+def _first_non_empty_line(text: str) -> str:
+    for line in text.splitlines():
+        stripped = line.strip()
+        if stripped:
+            return stripped
+    return ""
+
+
+def _stop_process(proc: subprocess.Popen[str]) -> None:
+    if proc.poll() is not None:
+        return
+    proc.terminate()
+    try:
+        proc.wait(timeout=5.0)
+    except subprocess.TimeoutExpired:
+        proc.kill()
+        proc.wait(timeout=5.0)
+
+
+def _log_tail(log: TextIO, *, lines: int = 20) -> str:
+    log.seek(0)
+    text = log.read().strip()
+    if not text:
+        return ""
+    tail = "\n".join(text.splitlines()[-lines:])
+    return f"\n--- server log tail ---\n{tail}"
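The harness is deliberately runtime-agnostic: it takes any argv that starts an OpenAI-compatible server, rewrites `--host`/`--port` to a freshly reserved loopback port, polls `GET /v1/models` until the startup deadline, then sends one `POST /v1/chat/completions` and returns the first non-empty line of the reply. A minimal direct-usage sketch (the binary and model paths are hypothetical; only the `smoke_openai_compat_server` signature and `TargetSmokeError` come from the module above):

```python
from dlm.export.errors import TargetSmokeError
from dlm.export.smoke import smoke_openai_compat_server

# Hypothetical argv; any server exposing the OpenAI-compatible /v1 routes
# works. The harness replaces --host/--port before launching the process.
argv = [
    "/opt/llama.cpp/build/bin/llama-server",       # assumed binary location
    "--model", "exports/Q4_K_M/base.Q4_K_M.gguf",  # assumed GGUF path
    "--ctx-size", "4096",
]

try:
    # Returns the first non-empty line of the assistant's reply.
    first_line = smoke_openai_compat_server(argv, startup_timeout=60.0)
    print(f"smoke ok: {first_line}")
except TargetSmokeError as exc:
    # The failure message embeds a tail of the captured server log.
    print(f"smoke failed: {exc}")
```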
src/dlm/export/targets/llama_server.py (modified)

@@ -5,20 +5,17 @@ from __future__ import annotations
 import json
 import shlex
 from pathlib import Path
-from typing import TYPE_CHECKING
 
+from dlm.base_models import BaseModelSpec
 from dlm.export.dispatch import DispatchResult
-from dlm.export.errors import ExportError
+from dlm.export.errors import ExportError, TargetSmokeError
 from dlm.export.manifest import build_artifact, load_export_manifest, save_export_manifest
 from dlm.export.ollama.modelfile_shared import resolve_num_ctx
+from dlm.export.smoke import smoke_openai_compat_server
 from dlm.export.targets.base import ExportTarget, SmokeResult, TargetResult
 from dlm.export.vendoring import llama_server_bin
 from dlm.io.atomic import write_text
 
-if TYPE_CHECKING:
-    from dlm.base_models import BaseModelSpec
-
-
 CHAT_TEMPLATE_FILENAME = "chat-template.jinja"
 LAUNCH_SCRIPT_FILENAME = "llama-server_launch.sh"
 
@@ -29,11 +26,13 @@ class LlamaServerTarget:
     name = "llama-server"
 
     def prepare(self, ctx: DispatchResult) -> TargetResult:
-        model_path = _require_path_extra(ctx, "model_path")
         adapter_dir = _require_path_extra(ctx, "adapter_dir")
-        context_length = _require_int_extra(ctx, "context_length")
-        adapter_gguf_path = _optional_path_extra(ctx, "adapter_gguf_path")
+        training_sequence_len = _optional_int_extra(ctx, "training_sequence_len")
+        spec = _require_spec_extra(ctx, "spec")
         vendor_override = _optional_path_extra(ctx, "vendor_override")
+        model_path = _find_artifact(ctx.artifacts, prefix="base.")
+        adapter_gguf_path = _find_optional_artifact(ctx.artifacts, exact_name="adapter.gguf")
+        context_length = resolve_num_ctx(training_sequence_len, spec.context_length)
 
         template_path = ctx.export_dir / CHAT_TEMPLATE_FILENAME
         write_text(template_path, _read_chat_template(adapter_dir))
@@ -84,12 +83,36 @@
         return command
 
     def smoke_test(self, prepared: TargetResult) -> SmokeResult:
-        _ = prepared
-        return SmokeResult(
-            attempted=False,
-            ok=True,
-            detail="llama-server HTTP smoke lands in a follow-up Sprint 41 slice",
-        )
+        try:
+            first_line = smoke_openai_compat_server(self._runtime_command(prepared))
+        except (OSError, TargetSmokeError, ExportError) as exc:
+            return SmokeResult(attempted=True, ok=False, detail=str(exc))
+        return SmokeResult(attempted=True, ok=True, detail=first_line)
+
+    def _runtime_command(self, prepared: TargetResult) -> list[str]:
+        model_path = _require_prepared_path(prepared, "model_path")
+        adapter_gguf_path = _optional_prepared_path(prepared, "adapter_gguf_path")
+        context_length = _require_prepared_int(prepared, "context_length")
+        vendor_override = _optional_prepared_path(prepared, "vendor_override")
+
+        command = [
+            str(llama_server_bin(vendor_override)),
+            "--model",
+            str(model_path),
+            "--api-key",
+            "disabled",
+            "--ctx-size",
+            str(context_length),
+            "--chat-template-file",
+            str(prepared.config_path),
+            "--host",
+            "127.0.0.1",
+            "--port",
+            "8000",
+        ]
+        if adapter_gguf_path is not None:
+            command.extend(["--lora", str(adapter_gguf_path)])
+        return command
 
 
 def prepare_llama_server_export(
@@ -104,19 +127,15 @@ def prepare_llama_server_export(
 ) -> TargetResult:
     """Build launch artifacts for a text GGUF export."""
 
-    model_path = _find_artifact(artifacts, prefix="base.")
-    adapter_gguf_path = _find_optional_artifact(artifacts, exact_name="adapter.gguf")
-    context_length = resolve_num_ctx(training_sequence_len, spec.context_length)
     ctx = DispatchResult(
         export_dir=export_dir,
        manifest_path=manifest_path,
        artifacts=list(artifacts),
        banner_lines=[],
        extras={
-            "model_path": model_path,
             "adapter_dir": adapter_dir,
-            "adapter_gguf_path": adapter_gguf_path,
-            "context_length": context_length,
+            "training_sequence_len": training_sequence_len,
+            "spec": spec,
             "vendor_override": vendor_override,
        },
    )
@@ -203,10 +222,19 @@ def _optional_path_extra(ctx: DispatchResult, key: str) -> Path | None:
     return value
 
 
-def _require_int_extra(ctx: DispatchResult, key: str) -> int:
+def _optional_int_extra(ctx: DispatchResult, key: str) -> int | None:
     value = ctx.extras.get(key)
+    if value is None:
+        return None
     if not isinstance(value, int):
-        raise ExportError(f"llama-server target missing int extra {key!r}")
+        raise ExportError(f"llama-server target extra {key!r} must be an int")
+    return value
+
+
+def _require_spec_extra(ctx: DispatchResult, key: str) -> BaseModelSpec:
+    value = ctx.extras.get(key)
+    if not isinstance(value, BaseModelSpec):
+        raise ExportError(f"llama-server target missing BaseModelSpec extra {key!r}")
     return value
 
 
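One subtlety worth noting: `_runtime_command` hard-codes `--host 127.0.0.1 --port 8000`, but those values never survive a smoke run, because `smoke_openai_compat_server` rewrites both flags in place via `_replace_or_append_flag` with an OS-reserved port, so concurrent exports don't collide on 8000. A small sketch of that replacement semantics using the module's own helpers (both are private, so subject to change; the argv is invented):

```python
from dlm.export.smoke import _replace_or_append_flag, reserve_local_port

argv = ["llama-server", "--model", "base.Q4_K_M.gguf", "--port", "8000"]

# An existing flag's value is overwritten in place...
port = reserve_local_port()  # OS-assigned, varies per run
argv = _replace_or_append_flag(argv, "--port", str(port))
assert argv[argv.index("--port") + 1] == str(port)

# ...while a missing flag is appended together with its value.
argv = _replace_or_append_flag(argv, "--host", "127.0.0.1")
assert argv[-2:] == ["--host", "127.0.0.1"]
```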
tests/unit/cli/test_export_target_flag.py (modified)

@@ -72,7 +72,7 @@ class TestExportTargetFlag:
         assert result.exit_code == 2
         assert "mutually exclusive" in _joined(result)
 
-    def test_llama_server_requires_no_smoke_for_now(self, tmp_path: Path) -> None:
+    def test_llama_server_target_reaches_existing_mutex_validation(self, tmp_path: Path) -> None:
         runner = CliRunner()
         result = runner.invoke(
             app,
@@ -83,7 +83,12 @@
                 str(tmp_path / "ghost.dlm"),
                 "--target",
                 "llama-server",
+                "--draft",
+                "qwen2.5:0.5b",
+                "--no-draft",
             ],
         )
         assert result.exit_code == 2
-        assert "--no-smoke" in _joined(result)
+        text = _joined(result)
+        assert "mutually exclusive" in text
+        assert "--no-smoke" not in text
tests/unit/export/targets/test_llama_server_smoke.py (added)

@@ -0,0 +1,90 @@
+"""llama-server smoke wiring."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from dlm.export.errors import TargetSmokeError
+from dlm.export.targets.base import TargetResult
+from dlm.export.targets.llama_server import LLAMA_SERVER_TARGET
+
+
+def _vendor_tree(tmp_path: Path) -> Path:
+    vendor = tmp_path / "vendor" / "llama.cpp"
+    (vendor / "build" / "bin").mkdir(parents=True)
+    server = vendor / "build" / "bin" / "llama-server"
+    server.write_text("#!/bin/sh\n", encoding="utf-8")
+    server.chmod(0o755)
+    return vendor
+
+
+def _prepared_target(tmp_path: Path) -> TargetResult:
+    export_dir = tmp_path / "exports" / "Q4_K_M"
+    export_dir.mkdir(parents=True)
+    manifest_path = export_dir / "export_manifest.json"
+    manifest_path.write_text("{}\n", encoding="utf-8")
+    model = export_dir / "base.Q4_K_M.gguf"
+    model.write_bytes(b"base")
+    adapter = export_dir / "adapter.gguf"
+    adapter.write_bytes(b"adapter")
+    template = export_dir / "chat-template.jinja"
+    template.write_text("{{ .Prompt }}\n", encoding="utf-8")
+    return TargetResult(
+        name="llama-server",
+        export_dir=export_dir,
+        manifest_path=manifest_path,
+        artifacts=(model, adapter, template),
+        config_path=template,
+        extras={
+            "model_path": model,
+            "adapter_gguf_path": adapter,
+            "context_length": 4096,
+            "vendor_override": _vendor_tree(tmp_path),
+        },
+    )
+
+
+class TestLlamaServerSmoke:
+    def test_smoke_uses_absolute_runtime_argv(self, tmp_path: Path, monkeypatch: object) -> None:
+        prepared = _prepared_target(tmp_path)
+        seen: list[list[str]] = []
+
+        def _fake_smoke(argv: list[str], **_: object) -> str:
+            seen.append(list(argv))
+            return "server replied"
+
+        monkeypatch.setattr(
+            "dlm.export.targets.llama_server.smoke_openai_compat_server", _fake_smoke
+        )
+
+        result = LLAMA_SERVER_TARGET.smoke_test(prepared)
+
+        assert result.attempted is True
+        assert result.ok is True
+        assert result.detail == "server replied"
+        assert len(seen) == 1
+        argv = seen[0]
+        assert argv[0].endswith("llama-server")
+        assert "$SCRIPT_DIR" not in " ".join(argv)
+        assert str(prepared.extras["model_path"]) in argv
+        assert str(prepared.config_path) in argv
+        assert str(prepared.extras["adapter_gguf_path"]) in argv
+        assert "--host" in argv
+        assert "--port" in argv
+
+    def test_smoke_failure_returns_failed_result(self, tmp_path: Path, monkeypatch: object) -> None:
+        prepared = _prepared_target(tmp_path)
+
+        def _fake_smoke(argv: list[str], **_: object) -> str:
+            _ = argv
+            raise TargetSmokeError("boom")
+
+        monkeypatch.setattr(
+            "dlm.export.targets.llama_server.smoke_openai_compat_server", _fake_smoke
+        )
+
+        result = LLAMA_SERVER_TARGET.smoke_test(prepared)
+
+        assert result.attempted is True
+        assert result.ok is False
+        assert result.detail == "boom"
tests/unit/export/test_smoke.py (added)

@@ -0,0 +1,128 @@
+"""Shared OpenAI-compatible smoke harness."""
+
+from __future__ import annotations
+
+import socket
+import sys
+from pathlib import Path
+
+import pytest
+
+from dlm.export.errors import TargetSmokeError
+from dlm.export.smoke import smoke_openai_compat_server
+
+
+def _require_loopback_bind() -> None:
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+            sock.bind(("127.0.0.1", 0))
+    except PermissionError as exc:
+        pytest.skip(f"loopback bind blocked on this host: {exc}")
+
+
+def _write_server_script(tmp_path: Path, *, mode: str) -> Path:
+    script = tmp_path / f"fake_server_{mode}.py"
+    script.write_text(
+        (
+            "from __future__ import annotations\n"
+            "import argparse\n"
+            "import json\n"
+            "from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer\n"
+            "\n"
+            "parser = argparse.ArgumentParser()\n"
+            "parser.add_argument('--host', required=True)\n"
+            "parser.add_argument('--port', required=True, type=int)\n"
+            "parser.add_argument('--mode', required=True)\n"
+            "args = parser.parse_args()\n"
+            "\n"
+            "if args.mode == 'exit':\n"
+            "    raise SystemExit(3)\n"
+            "\n"
+            "class Handler(BaseHTTPRequestHandler):\n"
+            "    def do_GET(self) -> None:\n"
+            "        if self.path != '/v1/models':\n"
+            "            self.send_response(404)\n"
+            "            self.end_headers()\n"
+            "            return\n"
+            "        body = json.dumps({'data': [{'id': 'fake-model'}]}).encode('utf-8')\n"
+            "        self.send_response(200)\n"
+            "        self.send_header('Content-Type', 'application/json')\n"
+            "        self.send_header('Content-Length', str(len(body)))\n"
+            "        self.end_headers()\n"
+            "        self.wfile.write(body)\n"
+            "\n"
+            "    def do_POST(self) -> None:\n"
+            "        if self.path != '/v1/chat/completions':\n"
+            "            self.send_response(404)\n"
+            "            self.end_headers()\n"
+            "            return\n"
+            "        _ = self.rfile.read(int(self.headers.get('Content-Length', '0')))\n"
+            "        if args.mode == 'empty':\n"
+            "            payload = {'choices': [{'message': {'content': ''}}]}\n"
+            "        else:\n"
+            "            payload = {'choices': [{'message': {'content': 'hello from fake server'}}]}\n"
+            "        body = json.dumps(payload).encode('utf-8')\n"
+            "        self.send_response(200)\n"
+            "        self.send_header('Content-Type', 'application/json')\n"
+            "        self.send_header('Content-Length', str(len(body)))\n"
+            "        self.end_headers()\n"
+            "        self.wfile.write(body)\n"
+            "\n"
+            "    def log_message(self, format: str, *args: object) -> None:\n"
+            "        return\n"
+            "\n"
+            "server = ThreadingHTTPServer((args.host, args.port), Handler)\n"
+            "server.serve_forever()\n"
+        ),
+        encoding="utf-8",
+    )
+    return script
+
+
+class TestSmokeOpenAiCompatServer:
+    def test_returns_first_response_line(self, tmp_path: Path) -> None:
+        _require_loopback_bind()
+        script = _write_server_script(tmp_path, mode="ok")
+
+        first_line = smoke_openai_compat_server(
+            [sys.executable, str(script), "--mode", "ok", "--host", "127.0.0.1", "--port", "8000"]
+        )
+
+        assert first_line == "hello from fake server"
+
+    def test_empty_content_raises(self, tmp_path: Path) -> None:
+        _require_loopback_bind()
+        script = _write_server_script(tmp_path, mode="empty")
+
+        with pytest.raises(TargetSmokeError, match="non-empty"):
+            smoke_openai_compat_server(
+                [
+                    sys.executable,
+                    str(script),
+                    "--mode",
+                    "empty",
+                    "--host",
+                    "127.0.0.1",
+                    "--port",
+                    "8000",
+                ]
+            )
+
+    def test_early_exit_raises_with_readiness_message(self, tmp_path: Path) -> None:
+        _require_loopback_bind()
+        script = _write_server_script(tmp_path, mode="exit")
+
+        with pytest.raises(TargetSmokeError, match="exited before readiness"):
+            smoke_openai_compat_server(
+                [
+                    sys.executable,
+                    str(script),
+                    "--mode",
+                    "exit",
+                    "--host",
+                    "127.0.0.1",
+                    "--port",
+                    "8000",
+                ],
+                startup_timeout=1.0,
+            )