Add mlx-serve export target

- SHA: fc1bd697666b2f6d760ea21062e9b46ad2bac3ae
- Parents: 787628d
- Tree: cb96c6e
| Status | File | + | - |
|---|---|---|---|
| M | README.md | 18 | 8 |
| M | docs/cli/reference.md | 1 | 1 |
| A | docs/cookbook/multi-target-export.md | 175 | 0 |
| A | docs/format/export-manifest.md | 95 | 0 |
| M | docs/getting-started/first-export.md | 17 | 0 |
| M | docs/index.md | 3 | 3 |
| M | mkdocs.yml | 2 | 0 |
| M | src/dlm/cli/commands.py | 79 | 1 |
| M | src/dlm/export/targets/__init__.py | 9 | 0 |
| A | src/dlm/export/targets/mlx_serve.py | 272 | 0 |
| M | tests/unit/cli/test_export_target_flag.py | 20 | 0 |
| A | tests/unit/export/targets/test_mlx_serve_argv.py | 173 | 0 |
| M | tests/unit/export/targets/test_registry.py | 3 | 2 |
README.md (modified)

@@ -15,12 +15,12 @@ A `.dlm` can be:
| 15 | 15 | |
| 16 | 16 | DLM trains LoRA / QLoRA / DoRA adapters on real pretrained bases, keeps a replay |
| 17 | 17 | history so retrains do not silently forget, and exports local runtimes such as |
| 18 | -Ollama and `llama-server`. | |
| 18 | +Ollama, `llama-server`, `vllm`, and `mlx-serve`. | |
| 19 | 19 | |
| 20 | 20 | **Status:** pre-v1.0, but far beyond the original MVP framing. The core |
| 21 | 21 | author/train/prompt/export/pack/share loop is real, and newer runtime-target |
| 22 | 22 | work is landing incrementally. Current export targets are `ollama`, |
| 23 | -`llama-server`, and `vllm`. | |
| 23 | +`llama-server`, `vllm`, and `mlx-serve`. | |
| 24 | 24 | |
| 25 | 25 | ## What A `.dlm` Actually Is |
| 26 | 26 | |
@@ -79,8 +79,8 @@ DLM sits in the gap:
| 79 | 79 | `dlm train --watch`, `dlm metrics`, and `dlm doctor` are all part of the |
| 80 | 80 | normal workflow now. |
| 81 | 81 | - **Export beyond the original Ollama-only story.** DLM still does explicit |
| 82 | - Ollama exports with pinned templates, and now also emits `llama-server` | |
| 83 | - launch artifacts against the same GGUF path. | |
| 82 | + Ollama exports with pinned templates, and now also emits `llama-server`, | |
| 83 | + `vllm`, and `mlx-serve` launch artifacts for local runtime targets. | |
| 84 | 84 | - **Close the eval loop.** `dlm harvest` can pull failing `sway`-style probe |
| 85 | 85 | reports back into the document as new training examples. |
| 86 | 86 | - **Pack and share reproducibly.** `.dlm.pack`, verification, push/pull, and |
@@ -90,10 +90,10 @@ DLM sits in the gap:
| 90 | 90 | |
| 91 | 91 | | Tier | Training | Inference / export | |
| 92 | 92 | |---|---|---| |
| 93 | -| NVIDIA CUDA (SM ≥ 8.0) | bf16 + QLoRA 4-bit + FlashAttention | Ollama, GGUF export, `llama-server` launch artifacts | | |
| 94 | -| NVIDIA CUDA (SM < 8.0) | fp16 LoRA | Ollama, GGUF export, `llama-server` launch artifacts | | |
| 95 | -| Apple Silicon (MPS) | fp16 or fp32 LoRA depending on doctor plan | Ollama, selected MLX inference paths, GGUF export | | |
| 96 | -| CPU | inference-first; training refused above small bases unless forced | GGUF export, Ollama, `llama-server` launch artifacts | | |
| 93 | +| NVIDIA CUDA (SM ≥ 8.0) | bf16 + QLoRA 4-bit + FlashAttention | Ollama, GGUF export, `llama-server`, `vllm` | | |
| 94 | +| NVIDIA CUDA (SM < 8.0) | fp16 LoRA | Ollama, GGUF export, `llama-server`, `vllm` | | |
| 95 | +| Apple Silicon (MPS) | fp16 or fp32 LoRA depending on doctor plan | Ollama, selected MLX inference paths, GGUF export, `vllm` (conservative Metal defaults), `mlx-serve` | | |
| 96 | +| CPU | inference-first; training refused above small bases unless forced | GGUF export, Ollama, `llama-server` | | |
| 97 | 97 | | AMD ROCm | experimental | ROCm-oriented llama.cpp flows | |
| 98 | 98 | |
| 99 | 99 | See [docs/hardware](./docs/hardware/memory-estimates.md) and |
@@ -133,6 +133,13 @@ scripts/bump-llama-cpp.sh build
| 133 | 133 | # If you want the llama.cpp HTTP target too: |
| 134 | 134 | scripts/bump-llama-cpp.sh build --with-server |
| 135 | 135 | |
| 136 | +# If you want the Apple Silicon MLX HTTP target: | |
| 137 | +uv sync --extra mlx | |
| 138 | + | |
| 139 | +# If you want the vLLM HTTP target: | |
| 140 | +# install a compatible vllm runtime separately; DLM writes launch artifacts | |
| 141 | +# but does not bundle the server runtime itself. | |
| 142 | + | |
| 136 | 143 | uv run dlm --help |
| 137 | 144 | ``` |
| 138 | 145 | |
@@ -276,6 +283,8 @@ uv run dlm metrics mydoc.dlm
| 276 | 283 | ```sh |
| 277 | 284 | uv run dlm export mydoc.dlm --target ollama --name mydoc |
| 278 | 285 | uv run dlm export mydoc.dlm --target llama-server --no-smoke |
| 286 | +uv run dlm export mydoc.dlm --target vllm --no-smoke | |
| 287 | +uv run dlm export mydoc.dlm --target mlx-serve --no-smoke | |
| 279 | 288 | uv run dlm pack mydoc.dlm --include-exports |
| 280 | 289 | uv run dlm verify mydoc.dlm.pack |
| 281 | 290 | ``` |
@@ -319,6 +328,7 @@ See the [CLI reference](./docs/cli/reference.md) for the full flag surface.
| 319 | 328 | - [Multimodal training](./docs/cookbook/multimodal-training.md) |
| 320 | 329 | - [Audio training](./docs/cookbook/audio-training.md) |
| 321 | 330 | - [Probe-driven training / sway harvest](./docs/cookbook/probe-driven-training.md) |
| 331 | +- [Multi-target export](./docs/cookbook/multi-target-export.md) | |
| 322 | 332 | - [CLI reference](./docs/cli/reference.md) |
| 323 | 333 | - [Architecture](./docs/architecture.md) |
| 324 | 334 | - [Determinism](./docs/determinism.md) |
docs/cli/reference.md (modified)

@@ -203,7 +203,7 @@ dlm export <path> [--target NAME] [--quant Q] [--merged [--dequantize]]
| 203 | 203 | |
| 204 | 204 | | Option | Default | Notes | |
| 205 | 205 | |---|---|---| |
| 206 | -| `--target NAME` | `ollama` | Export destination. Sprint 41 currently supports `ollama`, `llama-server`, and `vllm`. The `llama-server` path writes launch artifacts against the existing GGUF export and uses the shared OpenAI-compatible HTTP smoke harness; the `vllm` path writes `vllm_launch.sh` + `vllm_config.json` against the local adapter layout and ignores GGUF-only flags. On Apple Silicon, the generated `vllm` launch path forces the documented low-risk `vllm-metal` settings (`VLLM_METAL_USE_PAGED_ATTENTION=0`, `VLLM_METAL_MEMORY_FRACTION=auto`) and caps `--max-model-len` to the document's `training.sequence_len`. | | |
| 206 | +| `--target NAME` | `ollama` | Export destination. Sprint 41 currently supports `ollama`, `llama-server`, `vllm`, and `mlx-serve`. The `llama-server` path writes launch artifacts against the existing GGUF export and uses the shared OpenAI-compatible HTTP smoke harness. The `vllm` path writes `vllm_launch.sh` + `vllm_config.json` against the local adapter layout and ignores GGUF-only flags. On Apple Silicon, the generated `vllm` launch path forces the documented low-risk `vllm-metal` settings (`VLLM_METAL_USE_PAGED_ATTENTION=0`, `VLLM_METAL_MEMORY_FRACTION=auto`) and caps `--max-model-len` to the document's `training.sequence_len`. The `mlx-serve` path is Apple Silicon only, writes `mlx_serve_launch.sh` plus a staged MLX adapter directory, and currently supports text bases only. | | |
| 207 | 207 | | `--quant Q` | frontmatter.export.default_quant | `Q4_K_M` / `Q5_K_M` / `Q6_K` / `Q8_0` / `F16`. | |
| 208 | 208 | | `--merged` | false | Merge LoRA into base before quantizing. | |
| 209 | 209 | | `--dequantize` | false | Required with `--merged` on a QLoRA adapter (pitfall #3). | |
docs/cookbook/multi-target-export.md (added)

@@ -0,0 +1,175 @@
| 1 | +# Multi-target export | |
| 2 | + | |
| 3 | +`dlm export` is no longer just an Ollama registration path. The same | |
| 4 | +trained store can now emit local runtime artifacts for four targets: | |
| 5 | + | |
| 6 | +- `ollama` for managed local registration plus the existing Modelfile flow | |
| 7 | +- `llama-server` for GGUF-backed OpenAI-compatible HTTP serving via vendored | |
| 8 | + `llama.cpp` | |
| 9 | +- `vllm` for HF-snapshot plus LoRA-module serving on machines that can run | |
| 10 | + `vllm` | |
| 11 | +- `mlx-serve` for Apple Silicon text serving through `mlx_lm.server` | |
| 12 | + | |
| 13 | +Use this when you want one training loop but different local runtimes for | |
| 14 | +prompting, evaluation harnesses, agents, or deployment experiments. | |
| 15 | + | |
| 16 | +## Quick map | |
| 17 | + | |
| 18 | +| Target | Best for | Artifact shape | Smoke path | | |
| 19 | +|---|---|---|---| | |
| 20 | +| `ollama` | Easiest local chat loop | GGUF + `Modelfile` + local registration | existing Ollama smoke | | |
| 21 | +| `llama-server` | GGUF-backed OpenAI-compatible server | `base.<quant>.gguf` + `adapter.gguf` + `chat-template.jinja` + `llama-server_launch.sh` | shared HTTP smoke | | |
| 22 | +| `vllm` | HF-snapshot + LoRA serving on supported hosts | `vllm_launch.sh` + `vllm_config.json` + staged adapters | shared HTTP smoke | | |
| 23 | +| `mlx-serve` | Apple Silicon text serving without GGUF conversion | `mlx_serve_launch.sh` + staged MLX adapter dir | shared HTTP smoke | | |
| 24 | + | |
| 25 | +## Prerequisites | |
| 26 | + | |
| 27 | +### Ollama | |
| 28 | + | |
| 29 | +```sh | |
| 30 | +brew install ollama | |
| 31 | +``` | |
| 32 | + | |
| 33 | +### llama-server | |
| 34 | + | |
| 35 | +```sh | |
| 36 | +scripts/bump-llama-cpp.sh build --with-server | |
| 37 | +``` | |
| 38 | + | |
| 39 | +That compiles the vendored `llama-server` binary alongside the GGUF tooling. | |
| 40 | + | |
| 41 | +### vLLM | |
| 42 | + | |
| 43 | +Install a compatible `vllm` runtime in the environment you plan to launch | |
| 44 | +from. DLM writes the launch/config artifacts, but it does not bundle the | |
| 45 | +server runtime. | |
| 46 | + | |
| 47 | +On Apple Silicon, the generated `vllm` launch path is deliberately cautious: | |
| 48 | + | |
| 49 | +- `VLLM_METAL_USE_PAGED_ATTENTION=0` | |
| 50 | +- `VLLM_METAL_MEMORY_FRACTION=auto` | |
| 51 | +- `--max-model-len` capped to the document's `training.sequence_len` | |
| 52 | + | |
| 53 | +Those defaults exist to avoid the Metal OOM / hang pattern that shows up when | |
| 54 | +`vllm-metal` blindly asks for the base model's full context window. | |
| 55 | + | |
| 56 | +### MLX-serve | |
| 57 | + | |
| 58 | +```sh | |
| 59 | +uv sync --extra mlx | |
| 60 | +``` | |
| 61 | + | |
| 62 | +`mlx-serve` is Apple Silicon only. DLM refuses it on CUDA, ROCm, and CPU-only | |
| 63 | +hosts, and this Sprint 41 slice only supports text bases on that target. | |
| 64 | + | |
| 65 | +## Common exports | |
| 66 | + | |
| 67 | +### Ollama | |
| 68 | + | |
| 69 | +```sh | |
| 70 | +uv run dlm export tutor.dlm --target ollama --name my-tutor | |
| 71 | +``` | |
| 72 | + | |
| 73 | +This is the classic DLM path: GGUF conversion, explicit Go-template | |
| 74 | +`Modelfile`, optional registration, and an Ollama smoke prompt. | |
| 75 | + | |
| 76 | +### llama-server | |
| 77 | + | |
| 78 | +```sh | |
| 79 | +uv run dlm export tutor.dlm --target llama-server | |
| 80 | +bash ~/.dlm/store/<dlm_id>/exports/Q4_K_M/llama-server_launch.sh | |
| 81 | +``` | |
| 82 | + | |
| 83 | +This reuses the GGUF export artifacts and adds: | |
| 84 | + | |
| 85 | +- `chat-template.jinja` | |
| 86 | +- `llama-server_launch.sh` | |
| 87 | +- `target: "llama-server"` in `export_manifest.json` | |
| 88 | + | |
| 89 | +The launch script binds `127.0.0.1` and speaks `/v1/chat/completions`. | |
| 90 | + | |
| 91 | +### vLLM | |
| 92 | + | |
| 93 | +```sh | |
| 94 | +uv run dlm export tutor.dlm --target vllm | |
| 95 | +bash ~/.dlm/store/<dlm_id>/exports/vllm/vllm_launch.sh | |
| 96 | +``` | |
| 97 | + | |
| 98 | +This path stages local LoRA modules and writes: | |
| 99 | + | |
| 100 | +- `vllm_launch.sh` | |
| 101 | +- `vllm_config.json` | |
| 102 | +- `exports/vllm/adapters/...` | |
| 103 | + | |
| 104 | +Flags that only matter to GGUF or Ollama are ignored with a banner: | |
| 105 | +`--quant`, `--merged`, `--dequantize`, `--no-template`, `--skip-ollama`, | |
| 106 | +`--no-imatrix`, `--draft`, `--no-draft`. | |
| 107 | + | |
| 108 | +### MLX-serve | |
| 109 | + | |
| 110 | +```sh | |
| 111 | +uv run dlm export tutor.dlm --target mlx-serve | |
| 112 | +bash ~/.dlm/store/<dlm_id>/exports/mlx-serve/mlx_serve_launch.sh | |
| 113 | +``` | |
| 114 | + | |
| 115 | +This path stages an MLX-loadable adapter directory and writes: | |
| 116 | + | |
| 117 | +- `mlx_serve_launch.sh` | |
| 118 | +- `exports/mlx-serve/adapter/` or one named adapter directory | |
| 119 | +- `target: "mlx-serve"` in `export_manifest.json` | |
| 120 | + | |
| 121 | +`mlx-serve` also ignores the GGUF/Ollama-only flags above, plus `--name`. | |
| 122 | + | |
| 123 | +## Multi-adapter behavior | |
| 124 | + | |
| 125 | +The runtime targets split into two families: | |
| 126 | + | |
| 127 | +- `ollama` and `llama-server` can reuse the GGUF weighted-merge path for | |
| 128 | + `--adapter-mix` | |
| 129 | +- `vllm` and `mlx-serve` work from local adapter directories | |
| 130 | + | |
| 131 | +For `vllm`: | |
| 132 | + | |
| 133 | +- single-adapter docs export one staged module | |
| 134 | +- multi-adapter docs without `--adapter` export every named adapter as a | |
| 135 | + `--lora-modules` list | |
| 136 | +- `--adapter-mix` exports the staged composite adapter instead | |
| 137 | + | |
| 138 | +For `mlx-serve`: | |
| 139 | + | |
| 140 | +- single-adapter docs export the current flat adapter | |
| 141 | +- multi-adapter docs must choose one adapter with `--adapter`, or pass | |
| 142 | + `--adapter-mix` to export the staged composite adapter | |
| 143 | + | |
| 144 | +That "one adapter at a time" rule is intentional: this target is a simple | |
| 145 | +local-serving path, not a dynamic multi-LoRA router. | |
| 146 | + | |
| 147 | +## Smoke behavior | |
| 148 | + | |
| 149 | +All three HTTP targets use the shared OpenAI-compatible smoke harness: | |
| 150 | + | |
| 151 | +1. reserve a loopback port | |
| 152 | +2. launch the target-specific server command | |
| 153 | +3. poll `/v1/models` | |
| 154 | +4. POST `/v1/chat/completions` | |
| 155 | +5. record the first non-empty line in the store manifest | |
| 156 | + | |
| 157 | +Skip it with `--no-smoke` when the runtime is not installed or you want the | |
| 158 | +artifacts only. | |
| 159 | + | |
| 160 | +## Inspecting what got written | |
| 161 | + | |
| 162 | +Every export writes `export_manifest.json` under its target directory. The | |
| 163 | +important fields are: | |
| 164 | + | |
| 165 | +- `target` | |
| 166 | +- `quant` | |
| 167 | +- `artifacts` | |
| 168 | +- `adapter_version` | |
| 169 | +- `base_model_hf_id` | |
| 170 | +- `base_model_revision` | |
| 171 | + | |
| 172 | +The per-store `manifest.json` also gets an appended `exports[-1]` row with the | |
| 173 | +same `target` plus the smoke first line when a smoke test ran. | |
| 174 | + | |
| 175 | +See [Export manifest](../format/export-manifest.md) for the exact schema. | |
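
For orientation, below is a minimal Python sketch of the five-step smoke flow the new cookbook page describes. It is not the shipped `smoke_openai_compat_server` from `dlm/export/smoke.py`: the `--port` injection, request payload, and timeouts are illustrative assumptions layered on the documented steps.

```python
import json
import socket
import subprocess
import time
import urllib.request


def smoke_openai_compat(command: list[str], timeout: float = 60.0) -> str:
    """Launch an OpenAI-compatible server, probe it, return the first reply line."""
    # 1. Reserve a loopback port by binding port 0, then releasing it.
    with socket.socket() as sock:
        sock.bind(("127.0.0.1", 0))
        port = sock.getsockname()[1]
    # 2. Launch the target-specific server command. Appending --port here is
    #    an assumption; the real harness builds the argv per target.
    proc = subprocess.Popen([*command, "--port", str(port)])
    base = f"http://127.0.0.1:{port}/v1"
    try:
        # 3. Poll /v1/models until the server answers or the deadline passes.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            try:
                urllib.request.urlopen(f"{base}/models", timeout=2).close()
                break
            except OSError:
                time.sleep(0.5)
        else:
            raise RuntimeError("server never answered /v1/models")
        # 4. POST one chat completion.
        payload = json.dumps(
            {"model": "exported", "messages": [{"role": "user", "content": "ping"}]}
        ).encode()
        request = urllib.request.Request(
            f"{base}/chat/completions",
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(request, timeout=30) as response:
            reply = json.load(response)["choices"][0]["message"]["content"]
        # 5. Keep the first non-empty line for the store manifest.
        lines = [line for line in reply.splitlines() if line.strip()]
        return lines[0] if lines else ""
    finally:
        proc.terminate()
        proc.wait(timeout=10)
```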
docs/format/export-manifest.md (added)

@@ -0,0 +1,95 @@
| 1 | +# Export manifest | |
| 2 | + | |
| 3 | +Every `dlm export` writes an `export_manifest.json` inside the export directory. | |
| 4 | +It is the target-local record of what DLM emitted, separate from the broader | |
| 5 | +per-store `manifest.json`. | |
| 6 | + | |
| 7 | +Examples: | |
| 8 | + | |
| 9 | +- `~/.dlm/store/<dlm_id>/exports/Q4_K_M/export_manifest.json` | |
| 10 | +- `~/.dlm/store/<dlm_id>/exports/vllm/export_manifest.json` | |
| 11 | +- `~/.dlm/store/<dlm_id>/exports/mlx-serve/export_manifest.json` | |
| 12 | + | |
| 13 | +## What it records | |
| 14 | + | |
| 15 | +The manifest captures: | |
| 16 | + | |
| 17 | +- `target`: which runtime this export was prepared for | |
| 18 | +- `quant`: the export family (`Q4_K_M`, `Q8_0`, `hf`, ...) | |
| 19 | +- `merged` / `dequantized`: whether LoRA weights were merged into the base | |
| 20 | +- `created_at` and `created_by` | |
| 21 | +- `llama_cpp_tag` when the target depends on vendored `llama.cpp` | |
| 22 | +- `base_model_hf_id` and `base_model_revision` | |
| 23 | +- `adapter_version` | |
| 24 | +- `artifacts`: every emitted file with relative path, sha256, and size | |
| 25 | + | |
| 26 | +The schema is strict and round-trips through the Pydantic model in | |
| 27 | +`src/dlm/export/manifest.py`. | |
| 28 | + | |
| 29 | +## Example | |
| 30 | + | |
| 31 | +```json | |
| 32 | +{ | |
| 33 | + "target": "llama-server", | |
| 34 | + "quant": "Q4_K_M", | |
| 35 | + "merged": false, | |
| 36 | + "dequantized": false, | |
| 37 | + "ollama_name": null, | |
| 38 | + "created_at": "2026-04-23T18:42:00", | |
| 39 | + "created_by": "dlm-0.1.0", | |
| 40 | + "llama_cpp_tag": "b4281", | |
| 41 | + "base_model_hf_id": "HuggingFaceTB/SmolLM2-135M-Instruct", | |
| 42 | + "base_model_revision": "4c0d2...", | |
| 43 | + "adapter_version": 3, | |
| 44 | + "artifacts": [ | |
| 45 | + { | |
| 46 | + "path": "base.Q4_K_M.gguf", | |
| 47 | + "sha256": "…", | |
| 48 | + "size_bytes": 47211904 | |
| 49 | + }, | |
| 50 | + { | |
| 51 | + "path": "adapter.gguf", | |
| 52 | + "sha256": "…", | |
| 53 | + "size_bytes": 3145728 | |
| 54 | + }, | |
| 55 | + { | |
| 56 | + "path": "llama-server_launch.sh", | |
| 57 | + "sha256": "…", | |
| 58 | + "size_bytes": 312 | |
| 59 | + } | |
| 60 | + ] | |
| 61 | +} | |
| 62 | +``` | |
| 63 | + | |
| 64 | +## `target` | |
| 65 | + | |
| 66 | +`target` is now the load-bearing field for Sprint 41’s runtime split. | |
| 67 | + | |
| 68 | +Current values: | |
| 69 | + | |
| 70 | +- `ollama` | |
| 71 | +- `llama-server` | |
| 72 | +- `vllm` | |
| 73 | +- `mlx-serve` | |
| 74 | + | |
| 75 | +That lets downstream tooling distinguish: | |
| 76 | + | |
| 77 | +- a GGUF + Modelfile export meant for Ollama | |
| 78 | +- a GGUF-backed OpenAI-compatible launch artifact set | |
| 79 | +- an HF-snapshot + LoRA-module export for `vllm` | |
| 80 | +- an MLX adapter export for Apple Silicon serving | |
| 81 | + | |
| 82 | +## Relationship to the store manifest | |
| 83 | + | |
| 84 | +`export_manifest.json` is per-export and artifact-focused. | |
| 85 | + | |
| 86 | +The store-level `manifest.json` keeps the running narrative in `exports[]`: | |
| 87 | + | |
| 88 | +- when the export happened | |
| 89 | +- which `target` it used | |
| 90 | +- GGUF checksums when present | |
| 91 | +- `ollama_name` when relevant | |
| 92 | +- the first smoke output line when a smoke test ran | |
| 93 | + | |
| 94 | +Use `export_manifest.json` when you need exact artifact provenance for one | |
| 95 | +export directory. Use `manifest.json` when you want the store’s full history. | |
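
The "strict and round-trips" claim in this new page is easy to make concrete. Here is a sketch of a reader that mirrors the documented field set with Pydantic; the shipped model in `src/dlm/export/manifest.py` is the source of truth, and the types below are inferred from the example and field list rather than copied from it.

```python
from __future__ import annotations

import json
from pathlib import Path

from pydantic import BaseModel


class ArtifactSketch(BaseModel):
    # Every emitted file: relative path, sha256, and size, per the field list.
    path: str
    sha256: str
    size_bytes: int


class ExportManifestSketch(BaseModel):
    # Field set mirrors the documented schema; `created_at` is kept as a
    # string here rather than guessing the shipped datetime handling.
    target: str
    quant: str
    merged: bool
    dequantized: bool
    ollama_name: str | None
    created_at: str
    created_by: str
    llama_cpp_tag: str | None
    base_model_hf_id: str
    base_model_revision: str
    adapter_version: int
    artifacts: list[ArtifactSketch]


def load_export_manifest_sketch(export_dir: Path) -> ExportManifestSketch:
    raw = json.loads((export_dir / "export_manifest.json").read_text(encoding="utf-8"))
    return ExportManifestSketch.model_validate(raw)
```

Downstream tooling can then branch on `manifest.target` to tell a GGUF + Modelfile export from a `vllm` or `mlx-serve` layout.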
docs/getting-started/first-export.md (modified)

@@ -4,6 +4,11 @@
| 4 | 4 | Modelfile with an explicit Go `text/template` (no fuzzy matching), |
| 5 | 5 | registers the model with `ollama create`, and runs a smoke prompt. |
| 6 | 6 | |
| 7 | +That is still the default path, but it is no longer the only one. Sprint 41 | |
| 8 | +also adds local runtime targets such as `llama-server`, `vllm`, and | |
| 9 | +`mlx-serve`; see the [multi-target export cookbook](../cookbook/multi-target-export.md) | |
| 10 | +once you want an OpenAI-compatible local server instead of an Ollama model. | |
| 11 | + | |
| 7 | 12 | ## Prerequisites |
| 8 | 13 | |
| 9 | 14 | - `vendor/llama.cpp` submodule is built: |
@@ -80,6 +85,18 @@ $ uv run dlm export tutor.dlm --quant Q4_K_M --skip-ollama
| 80 | 85 | Useful on CI runners without the Ollama daemon installed. The GGUFs |
| 81 | 86 | land in `exports/Q4_K_M/`; wire them into your own runtime. |
| 82 | 87 | |
| 88 | +## Other runtime targets | |
| 89 | + | |
| 90 | +Once the basic GGUF/Ollama flow is familiar, the same store can export to: | |
| 91 | + | |
| 92 | +- `--target llama-server` for a vendored `llama.cpp` HTTP server | |
| 93 | +- `--target vllm` for HF-snapshot + LoRA-module serving | |
| 94 | +- `--target mlx-serve` for Apple Silicon text serving through `mlx_lm.server` | |
| 95 | + | |
| 96 | +Those targets have different prerequisites and artifact layouts, so they live | |
| 97 | +in the [multi-target export cookbook](../cookbook/multi-target-export.md) | |
| 98 | +instead of this first-run page. | |
| 99 | + | |
| 83 | 100 | ## Next |
| 84 | 101 | |
| 85 | 102 | Want to send the whole training history to a friend? The |
docs/index.md (modified)

@@ -10,7 +10,7 @@ A `.dlm` can be a hand-authored training doc, a directive-driven entrypoint
| 10 | 10 | into a codebase, a multi-adapter project with learned routing, or a selected |
| 11 | 11 | multimodal / audio-language document. DLM trains LoRA / QLoRA / DoRA adapters |
| 12 | 12 | on real pretrained bases, keeps replay history, and exports local runtimes such |
| 13 | -as Ollama and `llama-server`. | |
| 13 | +as Ollama, `llama-server`, `vllm`, and `mlx-serve`. | |
| 14 | 14 | |
| 15 | 15 | ## What DLM Ships Today |
| 16 | 16 | |
@@ -27,7 +27,7 @@ as Ollama and `llama-server`.
| 27 | 27 | persona lanes inside one project |
| 28 | 28 | - **Local iteration UX** with `prompt`, `repl`, `train --watch`, `metrics`, |
| 29 | 29 | and `doctor` |
| 30 | -- **Runtime export** to `ollama` and `llama-server` | |
| 30 | +- **Runtime export** to `ollama`, `llama-server`, `vllm`, and `mlx-serve` | |
| 31 | 31 | - **Probe-driven improvement** through `sway`-style harvest flows |
| 32 | 32 | |
| 33 | 33 | ## 30-Second Demo |
@@ -49,7 +49,7 @@ $ uv run dlm export tutor.dlm --target ollama --name my-tutor
| 49 | 49 | | Train across a real repo | [Training across codebases](cookbook/training-across-codebases.md) | |
| 50 | 50 | | Use named adapters and routing | [Multi-adapter](cookbook/multi-adapter.md) and [Learned adapter gate](cookbook/learned-adapter-gate.md) | |
| 51 | 51 | | Work with images or audio | [Multimodal training](cookbook/multimodal-training.md) and [Audio training](cookbook/audio-training.md) | |
| 52 | -| Export or ship a model | [CLI reference](cli/reference.md) and [Determinism](determinism.md) | | |
| 52 | +| Export or ship a model | [Multi-target export](cookbook/multi-target-export.md), [CLI reference](cli/reference.md), and [Determinism](determinism.md) | | |
| 53 | 53 | | Pull eval failures back into training | [Probe-driven training](cookbook/probe-driven-training.md) | |
| 54 | 54 | |
| 55 | 55 | ## Status |
mkdocs.yml (modified)

@@ -58,6 +58,7 @@ nav:
| 58 | 58 | - The .dlm format: |
| 59 | 59 | - Frontmatter: format/frontmatter.md |
| 60 | 60 | - Sections: format/sections.md |
| 61 | + - Export manifest: format/export-manifest.md | |
| 61 | 62 | - .dlm/training.yaml: format/dlm-training-yaml.md |
| 62 | 63 | - .dlm/ignore: format/dlm-ignore.md |
| 63 | 64 | - CLI reference: cli/reference.md |
@@ -77,6 +78,7 @@ nav:
| 77 | 78 | - Template gallery: cookbook/template-gallery.md |
| 78 | 79 | - Sharing adapters: cookbook/sharing.md |
| 79 | 80 | - Multi-source training: cookbook/multi-source-training.md |
| 81 | + - Multi-target export: cookbook/multi-target-export.md | |
| 80 | 82 | - Train from a folder: cookbook/train-from-folder.md |
| 81 | 83 | - Training across codebases: cookbook/training-across-codebases.md |
| 82 | 84 | - Tokenized-section cache: cookbook/directive-cache.md |
src/dlm/cli/commands.py (modified)

@@ -1551,7 +1551,7 @@ def export_cmd(
| 1551 | 1551 | str, |
| 1552 | 1552 | typer.Option( |
| 1553 | 1553 | "--target", |
| 1554 | - help="Export destination. Currently supported: ollama, llama-server, vllm.", | |
| 1554 | + help="Export destination. Currently supported: ollama, llama-server, vllm, mlx-serve.", | |
| 1555 | 1555 | ), |
| 1556 | 1556 | ] = "ollama", |
| 1557 | 1557 | quant: Annotated[ |
@@ -1679,8 +1679,10 @@ def export_cmd(
| 1679 | 1679 | ) |
| 1680 | 1680 | from dlm.export.quantize import run_checked |
| 1681 | 1681 | from dlm.export.targets import ( |
| 1682 | + finalize_mlx_serve_export, | |
| 1682 | 1683 | finalize_vllm_export, |
| 1683 | 1684 | prepare_llama_server_export, |
| 1685 | + prepare_mlx_serve_export, | |
| 1684 | 1686 | prepare_vllm_export, |
| 1685 | 1687 | resolve_target, |
| 1686 | 1688 | ) |
@@ -1785,6 +1787,12 @@ def export_cmd(
| 1785 | 1787 | "documents yet; this Sprint 41 slice only supports text bases." |
| 1786 | 1788 | ) |
| 1787 | 1789 | raise typer.Exit(code=2) |
| 1790 | + if resolved_target.name == "mlx-serve" and export_dispatch.accepts_audio: | |
| 1791 | + console.print( | |
| 1792 | + "[red]export:[/red] --target mlx-serve is not wired for audio-language " | |
| 1793 | + "documents yet; this Sprint 41 slice only supports text bases." | |
| 1794 | + ) | |
| 1795 | + raise typer.Exit(code=2) | |
| 1788 | 1796 | if export_dispatch.accepts_audio: |
| 1789 | 1797 | try: |
| 1790 | 1798 | dispatch_result = export_dispatch.dispatch_export( |
@@ -1830,6 +1838,12 @@ def export_cmd(
| 1830 | 1838 | "documents yet; this Sprint 41 slice only supports text bases." |
| 1831 | 1839 | ) |
| 1832 | 1840 | raise typer.Exit(code=2) |
| 1841 | + if resolved_target.name == "mlx-serve" and export_dispatch.accepts_images: | |
| 1842 | + console.print( | |
| 1843 | + "[red]export:[/red] --target mlx-serve is not wired for vision-language " | |
| 1844 | + "documents yet; this Sprint 41 slice only supports text bases." | |
| 1845 | + ) | |
| 1846 | + raise typer.Exit(code=2) | |
| 1833 | 1847 | if export_dispatch.accepts_images: |
| 1834 | 1848 | gguf_emission_context = None |
| 1835 | 1849 | try: |
@@ -1957,6 +1971,70 @@ def export_cmd(
| 1957 | 1971 | console.print(f"smoke: {vllm_smoke.detail}") |
| 1958 | 1972 | return |
| 1959 | 1973 | |
| 1974 | + if resolved_target.name == "mlx-serve": | |
| 1975 | + mlx_ignored_flags: list[str] = [] | |
| 1976 | + if quant is not None: | |
| 1977 | + mlx_ignored_flags.append("--quant") | |
| 1978 | + if merged: | |
| 1979 | + mlx_ignored_flags.append("--merged") | |
| 1980 | + if dequantize: | |
| 1981 | + mlx_ignored_flags.append("--dequantize") | |
| 1982 | + if name is not None: | |
| 1983 | + mlx_ignored_flags.append("--name") | |
| 1984 | + if no_template: | |
| 1985 | + mlx_ignored_flags.append("--no-template") | |
| 1986 | + if skip_ollama: | |
| 1987 | + mlx_ignored_flags.append("--skip-ollama") | |
| 1988 | + if no_imatrix: | |
| 1989 | + mlx_ignored_flags.append("--no-imatrix") | |
| 1990 | + if draft is not None: | |
| 1991 | + mlx_ignored_flags.append("--draft") | |
| 1992 | + if no_draft: | |
| 1993 | + mlx_ignored_flags.append("--no-draft") | |
| 1994 | + if mlx_ignored_flags: | |
| 1995 | + console.print( | |
| 1996 | + "[yellow]export:[/yellow] ignoring flags not applicable to " | |
| 1997 | + f"`--target mlx-serve`: {', '.join(mlx_ignored_flags)}" | |
| 1998 | + ) | |
| 1999 | + | |
| 2000 | + declared_adapter_names = tuple(adapters_declared.keys()) if adapters_declared else None | |
| 2001 | + try: | |
| 2002 | + mlx_serve_result = prepare_mlx_serve_export( | |
| 2003 | + store=store, | |
| 2004 | + spec=spec, | |
| 2005 | + adapter_name=adapter, | |
| 2006 | + adapter_path_override=adapter_path_override, | |
| 2007 | + declared_adapter_names=declared_adapter_names, | |
| 2008 | + ) | |
| 2009 | + except ExportError as exc: | |
| 2010 | + console.print(f"[red]export:[/red] {exc}") | |
| 2011 | + raise typer.Exit(code=1) from exc | |
| 2012 | + | |
| 2013 | + mlx_serve_smoke = None if no_smoke else resolved_target.smoke_test(mlx_serve_result) | |
| 2014 | + if mlx_serve_smoke is not None and not mlx_serve_smoke.ok: | |
| 2015 | + console.print( | |
| 2016 | + f"[red]smoke:[/red] {mlx_serve_smoke.detail}\n" | |
| 2017 | + " re-run with `--no-smoke` to skip the smoke test." | |
| 2018 | + ) | |
| 2019 | + raise typer.Exit(code=1) | |
| 2020 | + | |
| 2021 | + manifest_path = finalize_mlx_serve_export( | |
| 2022 | + store=store, | |
| 2023 | + spec=spec, | |
| 2024 | + prepared=mlx_serve_result, | |
| 2025 | + smoke_output_first_line=None if mlx_serve_smoke is None else mlx_serve_smoke.detail, | |
| 2026 | + adapter_name=adapter, | |
| 2027 | + adapter_mix=mix_entries, | |
| 2028 | + ) | |
| 2029 | + console.print(f"[green]exported:[/green] {mlx_serve_result.export_dir}") | |
| 2030 | + console.print("target: mlx-serve") | |
| 2031 | + assert mlx_serve_result.launch_script_path is not None | |
| 2032 | + console.print(f"launch: {mlx_serve_result.launch_script_path.name}") | |
| 2033 | + console.print(f"manifest: {manifest_path.name}") | |
| 2034 | + if mlx_serve_smoke is not None and mlx_serve_smoke.detail: | |
| 2035 | + console.print(f"smoke: {mlx_serve_smoke.detail}") | |
| 2036 | + return | |
| 2037 | + | |
| 1960 | 2038 | try: |
| 1961 | 2039 | result = run_export( |
| 1962 | 2040 | store, |
src/dlm/export/targets/__init__.py (modified)

@@ -5,6 +5,11 @@ from __future__ import annotations
| 5 | 5 | from dlm.export.errors import UnknownExportTargetError |
| 6 | 6 | from dlm.export.targets.base import ExportTarget, SmokeResult, TargetResult |
| 7 | 7 | from dlm.export.targets.llama_server import LLAMA_SERVER_TARGET, prepare_llama_server_export |
| 8 | +from dlm.export.targets.mlx_serve import ( | |
| 9 | + MLX_SERVE_TARGET, | |
| 10 | + finalize_mlx_serve_export, | |
| 11 | + prepare_mlx_serve_export, | |
| 12 | +) | |
| 8 | 13 | from dlm.export.targets.ollama import OLLAMA_TARGET |
| 9 | 14 | from dlm.export.targets.vllm import VLLM_TARGET, finalize_vllm_export, prepare_vllm_export |
| 10 | 15 | |
@@ -12,6 +17,7 @@ TARGETS: dict[str, ExportTarget] = {
| 12 | 17 | OLLAMA_TARGET.name: OLLAMA_TARGET, |
| 13 | 18 | LLAMA_SERVER_TARGET.name: LLAMA_SERVER_TARGET, |
| 14 | 19 | VLLM_TARGET.name: VLLM_TARGET, |
| 20 | + MLX_SERVE_TARGET.name: MLX_SERVE_TARGET, | |
| 15 | 21 | } |
| 16 | 22 | |
| 17 | 23 | |
@@ -31,12 +37,15 @@ def resolve_target(name: str) -> ExportTarget:
| 31 | 37 | __all__ = [ |
| 32 | 38 | "ExportTarget", |
| 33 | 39 | "LLAMA_SERVER_TARGET", |
| 40 | + "MLX_SERVE_TARGET", | |
| 34 | 41 | "SmokeResult", |
| 35 | 42 | "TARGETS", |
| 36 | 43 | "TargetResult", |
| 37 | 44 | "VLLM_TARGET", |
| 38 | 45 | "available_targets", |
| 46 | + "finalize_mlx_serve_export", | |
| 39 | 47 | "finalize_vllm_export", |
| 48 | + "prepare_mlx_serve_export", | |
| 40 | 49 | "prepare_llama_server_export", |
| 41 | 50 | "prepare_vllm_export", |
| 42 | 51 | "resolve_target", |
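
The registry change keeps dict insertion order as the public ordering, which is exactly what the updated registry tests pin down. A short usage sketch, assuming the package is importable:

```python
from dlm.export.errors import UnknownExportTargetError
from dlm.export.targets import available_targets, resolve_target

# Insertion order of TARGETS is the public ordering, so CLI help text,
# error messages, and the registry tests all agree on the same tuple.
assert available_targets() == ("ollama", "llama-server", "vllm", "mlx-serve")

target = resolve_target("mlx-serve")
print(target.name)  # -> "mlx-serve"

try:
    resolve_target("sglang")
except UnknownExportTargetError as exc:
    print(exc)  # names the available targets: ollama, llama-server, vllm, mlx-serve
```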
src/dlm/export/targets/mlx_serve.py (added)

@@ -0,0 +1,272 @@
| 1 | +"""MLX HTTP server target helpers.""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +import shlex | |
| 6 | +import shutil | |
| 7 | +from pathlib import Path | |
| 8 | + | |
| 9 | +from dlm.base_models import BaseModelSpec | |
| 10 | +from dlm.export.errors import ExportError, TargetSmokeError | |
| 11 | +from dlm.export.manifest import ExportManifest, build_artifact, save_export_manifest, utc_now | |
| 12 | +from dlm.export.record import append_export_summary | |
| 13 | +from dlm.export.smoke import smoke_openai_compat_server | |
| 14 | +from dlm.export.targets.base import ExportTarget, SmokeResult, TargetResult | |
| 15 | +from dlm.inference.backends.mlx_backend import stage_mlx_adapter_dir | |
| 16 | +from dlm.inference.backends.select import is_apple_silicon, mlx_available | |
| 17 | +from dlm.io.atomic import write_text | |
| 18 | +from dlm.store.paths import StorePath | |
| 19 | + | |
| 20 | +MLX_SERVE_EXPORT_SUBDIR = "mlx-serve" | |
| 21 | +LAUNCH_SCRIPT_FILENAME = "mlx_serve_launch.sh" | |
| 22 | +_HF_QUANT = "hf" | |
| 23 | +_DEFAULT_ADAPTER_DIRNAME = "adapter" | |
| 24 | +_MIXED_ADAPTER_DIRNAME = "mixed" | |
| 25 | + | |
| 26 | + | |
| 27 | +class MlxServeTarget: | |
| 28 | + """Registered export target for MLX HTTP server launch artifacts.""" | |
| 29 | + | |
| 30 | + name = "mlx-serve" | |
| 31 | + | |
| 32 | + def prepare(self, ctx: object) -> TargetResult: | |
| 33 | + raise NotImplementedError("mlx-serve exports are prepared via prepare_mlx_serve_export()") | |
| 34 | + | |
| 35 | + def launch_command(self, prepared: TargetResult) -> list[str]: | |
| 36 | + return _build_command(prepared, use_script_dir=True) | |
| 37 | + | |
| 38 | + def smoke_test(self, prepared: TargetResult) -> SmokeResult: | |
| 39 | + try: | |
| 40 | + first_line = smoke_openai_compat_server(_build_command(prepared, use_script_dir=False)) | |
| 41 | + except (OSError, TargetSmokeError, ExportError) as exc: | |
| 42 | + return SmokeResult(attempted=True, ok=False, detail=str(exc)) | |
| 43 | + return SmokeResult(attempted=True, ok=True, detail=first_line) | |
| 44 | + | |
| 45 | + | |
| 46 | +def prepare_mlx_serve_export( | |
| 47 | + *, | |
| 48 | + store: StorePath, | |
| 49 | + spec: BaseModelSpec, | |
| 50 | + adapter_name: str | None, | |
| 51 | + adapter_path_override: Path | None, | |
| 52 | + declared_adapter_names: tuple[str, ...] | None, | |
| 53 | +) -> TargetResult: | |
| 54 | + """Stage an MLX-loadable adapter dir plus launch script.""" | |
| 55 | + | |
| 56 | + _require_mlx_runtime() | |
| 57 | + source_adapter_dir, staged_dirname, adapter_version = _resolve_source_adapter( | |
| 58 | + store=store, | |
| 59 | + adapter_name=adapter_name, | |
| 60 | + adapter_path_override=adapter_path_override, | |
| 61 | + declared_adapter_names=declared_adapter_names, | |
| 62 | + ) | |
| 63 | + | |
| 64 | + export_dir = store.exports / MLX_SERVE_EXPORT_SUBDIR | |
| 65 | + export_dir.mkdir(parents=True, exist_ok=True) | |
| 66 | + | |
| 67 | + staged_adapter_dir = export_dir / staged_dirname | |
| 68 | + if staged_adapter_dir.exists(): | |
| 69 | + shutil.rmtree(staged_adapter_dir) | |
| 70 | + stage_mlx_adapter_dir(source_adapter_dir, staged_adapter_dir, base_hf_id=spec.hf_id) | |
| 71 | + | |
| 72 | + launch_script_path = export_dir / LAUNCH_SCRIPT_FILENAME | |
| 73 | + draft = TargetResult( | |
| 74 | + name=MLX_SERVE_TARGET.name, | |
| 75 | + export_dir=export_dir, | |
| 76 | + manifest_path=export_dir / "export_manifest.json", | |
| 77 | + artifacts=(), | |
| 78 | + launch_script_path=launch_script_path, | |
| 79 | + extras={ | |
| 80 | + "model": spec.hf_id, | |
| 81 | + "adapter_dir": staged_adapter_dir, | |
| 82 | + "adapter_version": adapter_version, | |
| 83 | + }, | |
| 84 | + ) | |
| 85 | + write_text(launch_script_path, _render_launch_script(MLX_SERVE_TARGET.launch_command(draft))) | |
| 86 | + launch_script_path.chmod(0o755) | |
| 87 | + return TargetResult( | |
| 88 | + name=draft.name, | |
| 89 | + export_dir=draft.export_dir, | |
| 90 | + manifest_path=draft.manifest_path, | |
| 91 | + artifacts=tuple(_artifact_paths(export_dir)), | |
| 92 | + launch_script_path=draft.launch_script_path, | |
| 93 | + config_path=None, | |
| 94 | + extras=draft.extras, | |
| 95 | + ) | |
| 96 | + | |
| 97 | + | |
| 98 | +def finalize_mlx_serve_export( | |
| 99 | + *, | |
| 100 | + store: StorePath, | |
| 101 | + spec: BaseModelSpec, | |
| 102 | + prepared: TargetResult, | |
| 103 | + smoke_output_first_line: str | None, | |
| 104 | + adapter_name: str | None, | |
| 105 | + adapter_mix: list[tuple[str, float]] | None, | |
| 106 | +) -> Path: | |
| 107 | + """Write export_manifest.json and append the store export summary.""" | |
| 108 | + | |
| 109 | + from dlm import __version__ as dlm_version | |
| 110 | + | |
| 111 | + artifacts = [ | |
| 112 | + build_artifact(prepared.export_dir, path) for path in _artifact_paths(prepared.export_dir) | |
| 113 | + ] | |
| 114 | + adapter_version = _require_prepared_int(prepared, "adapter_version") | |
| 115 | + manifest = ExportManifest( | |
| 116 | + target=MLX_SERVE_TARGET.name, | |
| 117 | + quant=_HF_QUANT, | |
| 118 | + merged=False, | |
| 119 | + dequantized=False, | |
| 120 | + ollama_name=None, | |
| 121 | + created_at=utc_now(), | |
| 122 | + created_by=f"dlm-{dlm_version}", | |
| 123 | + llama_cpp_tag=None, | |
| 124 | + base_model_hf_id=spec.hf_id, | |
| 125 | + base_model_revision=spec.revision, | |
| 126 | + adapter_version=adapter_version, | |
| 127 | + artifacts=artifacts, | |
| 128 | + ) | |
| 129 | + manifest_path = save_export_manifest(prepared.export_dir, manifest) | |
| 130 | + append_export_summary( | |
| 131 | + store=store, | |
| 132 | + quant=_HF_QUANT, | |
| 133 | + merged=False, | |
| 134 | + target=MLX_SERVE_TARGET.name, | |
| 135 | + llama_cpp_tag=None, | |
| 136 | + artifacts=artifacts, | |
| 137 | + ollama_name=None, | |
| 138 | + ollama_version_str=None, | |
| 139 | + smoke_first_line=smoke_output_first_line, | |
| 140 | + adapter_name=adapter_name, | |
| 141 | + adapter_mix=adapter_mix, | |
| 142 | + ) | |
| 143 | + return manifest_path | |
| 144 | + | |
| 145 | + | |
| 146 | +def _resolve_source_adapter( | |
| 147 | + *, | |
| 148 | + store: StorePath, | |
| 149 | + adapter_name: str | None, | |
| 150 | + adapter_path_override: Path | None, | |
| 151 | + declared_adapter_names: tuple[str, ...] | None, | |
| 152 | +) -> tuple[Path, str, int]: | |
| 153 | + if adapter_path_override is not None: | |
| 154 | + if not adapter_path_override.exists(): | |
| 155 | + raise ExportError(f"adapter_path_override {adapter_path_override} does not exist") | |
| 156 | + return ( | |
| 157 | + adapter_path_override, | |
| 158 | + _MIXED_ADAPTER_DIRNAME, | |
| 159 | + _version_from_dir_name(adapter_path_override), | |
| 160 | + ) | |
| 161 | + | |
| 162 | + if declared_adapter_names and adapter_name is None: | |
| 163 | + raise ExportError( | |
| 164 | + "mlx-serve exports one adapter at a time; pass `--adapter <name>` " | |
| 165 | + "or `--adapter-mix` for multi-adapter documents." | |
| 166 | + ) | |
| 167 | + | |
| 168 | + if adapter_name is not None: | |
| 169 | + path = store.resolve_current_adapter_for(adapter_name) | |
| 170 | + pointer = store.adapter_current_pointer_for(adapter_name) | |
| 171 | + if path is None or not path.exists(): | |
| 172 | + raise ExportError( | |
| 173 | + f"no current adapter under {pointer}; run `dlm train` before exporting." | |
| 174 | + ) | |
| 175 | + return path, adapter_name, _version_from_dir_name(path) | |
| 176 | + | |
| 177 | + path = store.resolve_current_adapter() | |
| 178 | + pointer = store.adapter_current_pointer | |
| 179 | + if path is None or not path.exists(): | |
| 180 | + raise ExportError(f"no current adapter under {pointer}; run `dlm train` before exporting.") | |
| 181 | + return path, _DEFAULT_ADAPTER_DIRNAME, _version_from_dir_name(path) | |
| 182 | + | |
| 183 | + | |
| 184 | +def _require_mlx_runtime() -> None: | |
| 185 | + if not is_apple_silicon(): | |
| 186 | + raise ExportError( | |
| 187 | + "mlx-serve export requires Apple Silicon (darwin-arm64); " | |
| 188 | + "this target is not available on CUDA, ROCm, or CPU-only hosts." | |
| 189 | + ) | |
| 190 | + if not mlx_available(): | |
| 191 | + raise ExportError( | |
| 192 | + "mlx-serve export requires the mlx extra to be installed; " | |
| 193 | + "run `uv sync --extra mlx` and re-try." | |
| 194 | + ) | |
| 195 | + | |
| 196 | + | |
| 197 | +def _artifact_paths(export_dir: Path) -> list[Path]: | |
| 198 | + artifacts: list[Path] = [] | |
| 199 | + for path in sorted(export_dir.rglob("*")): | |
| 200 | + if path.is_file() and path.name != "export_manifest.json": | |
| 201 | + artifacts.append(path) | |
| 202 | + return artifacts | |
| 203 | + | |
| 204 | + | |
| 205 | +def _build_command(prepared: TargetResult, *, use_script_dir: bool) -> list[str]: | |
| 206 | + model = _require_prepared_str(prepared, "model") | |
| 207 | + adapter_dir = _require_prepared_path(prepared, "adapter_dir") | |
| 208 | + return [ | |
| 209 | + "python", | |
| 210 | + "-m", | |
| 211 | + "mlx_lm.server", | |
| 212 | + "--model", | |
| 213 | + model, | |
| 214 | + "--adapter-path", | |
| 215 | + _script_dir_arg(adapter_dir) if use_script_dir else str(adapter_dir), | |
| 216 | + "--host", | |
| 217 | + "127.0.0.1", | |
| 218 | + "--port", | |
| 219 | + "8000", | |
| 220 | + ] | |
| 221 | + | |
| 222 | + | |
| 223 | +def _script_dir_arg(path: Path) -> str: | |
| 224 | + return f"$SCRIPT_DIR/{path.name}" | |
| 225 | + | |
| 226 | + | |
| 227 | +def _render_launch_script(command: list[str]) -> str: | |
| 228 | + rendered = " ".join(_quote_script_arg(arg) for arg in command) | |
| 229 | + return ( | |
| 230 | + "#!/usr/bin/env bash\n" | |
| 231 | + "set -euo pipefail\n" | |
| 232 | + 'SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"\n' | |
| 233 | + f'exec {rendered} "$@"\n' | |
| 234 | + ) | |
| 235 | + | |
| 236 | + | |
| 237 | +def _quote_script_arg(arg: str) -> str: | |
| 238 | + if arg.startswith("$SCRIPT_DIR/"): | |
| 239 | + return f'"{arg}"' | |
| 240 | + return shlex.quote(arg) | |
| 241 | + | |
| 242 | + | |
| 243 | +def _version_from_dir_name(path: Path) -> int: | |
| 244 | + stem = path.name | |
| 245 | + if not stem.startswith("v") or not stem[1:].isdigit(): | |
| 246 | + return 1 | |
| 247 | + return int(stem[1:]) | |
| 248 | + | |
| 249 | + | |
| 250 | +def _require_prepared_str(prepared: TargetResult, key: str) -> str: | |
| 251 | + value = prepared.extras.get(key) | |
| 252 | + if not isinstance(value, str) or not value: | |
| 253 | + raise ExportError(f"mlx-serve prepared target missing string extra {key!r}") | |
| 254 | + return value | |
| 255 | + | |
| 256 | + | |
| 257 | +def _require_prepared_path(prepared: TargetResult, key: str) -> Path: | |
| 258 | + value = prepared.extras.get(key) | |
| 259 | + if not isinstance(value, Path): | |
| 260 | + raise ExportError(f"mlx-serve prepared target missing Path extra {key!r}") | |
| 261 | + return value | |
| 262 | + | |
| 263 | + | |
| 264 | +def _require_prepared_int(prepared: TargetResult, key: str) -> int: | |
| 265 | + value = prepared.extras.get(key) | |
| 266 | + if not isinstance(value, int): | |
| 267 | + raise ExportError(f"mlx-serve prepared target missing int extra {key!r}") | |
| 268 | + return value | |
| 269 | + | |
| 270 | + | |
| 271 | +MLX_SERVE_TARGET = MlxServeTarget() | |
| 272 | +assert isinstance(MLX_SERVE_TARGET, ExportTarget) | |
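
Tracing `_build_command` and `_render_launch_script` through the flat single-adapter case gives a concrete picture of the emitted artifact. The expected text below is derived by hand from the functions above for the smollm2 test base; treat it as a worked example, not a captured file.

```python
# What mlx_serve_launch.sh comes out to for a flat store on the smollm2 base:
# every argv element passes shlex.quote unchanged, except the staged adapter
# path, which _quote_script_arg wraps in double quotes so $SCRIPT_DIR expands.
EXPECTED_LAUNCH_SCRIPT = (
    "#!/usr/bin/env bash\n"
    "set -euo pipefail\n"
    'SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"\n'
    "exec python -m mlx_lm.server "
    "--model HuggingFaceTB/SmolLM2-135M-Instruct "
    '--adapter-path "$SCRIPT_DIR/adapter" '
    '--host 127.0.0.1 --port 8000 "$@"\n'
)
```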
tests/unit/cli/test_export_target_flag.py (modified)

@@ -52,6 +52,7 @@ class TestExportTargetFlag:
| 52 | 52 | assert "ollama" in text |
| 53 | 53 | assert "llama-server" in text |
| 54 | 54 | assert "vllm" in text |
| 55 | + assert "mlx-serve" in text | |
| 55 | 56 | |
| 56 | 57 | def test_ollama_target_reaches_existing_mutex_validation(self, tmp_path: Path) -> None: |
| 57 | 58 | doc = _scaffold_doc(tmp_path) |
@@ -112,3 +113,22 @@ class TestExportTargetFlag:
| 112 | 113 | ) |
| 113 | 114 | assert result.exit_code == 2 |
| 114 | 115 | assert "mutually exclusive" in _joined(result) |
| 116 | + | |
| 117 | + def test_mlx_serve_target_reaches_existing_mutex_validation(self, tmp_path: Path) -> None: | |
| 118 | + runner = CliRunner() | |
| 119 | + result = runner.invoke( | |
| 120 | + app, | |
| 121 | + [ | |
| 122 | + "--home", | |
| 123 | + str(tmp_path / "home"), | |
| 124 | + "export", | |
| 125 | + str(tmp_path / "ghost.dlm"), | |
| 126 | + "--target", | |
| 127 | + "mlx-serve", | |
| 128 | + "--draft", | |
| 129 | + "qwen2.5:0.5b", | |
| 130 | + "--no-draft", | |
| 131 | + ], | |
| 132 | + ) | |
| 133 | + assert result.exit_code == 2 | |
| 134 | + assert "mutually exclusive" in _joined(result) | |
tests/unit/export/targets/test_mlx_serve_argv.py (added)

@@ -0,0 +1,173 @@
| 1 | +"""MLX serve launch artifact generation.""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +from pathlib import Path | |
| 6 | + | |
| 7 | +import pytest | |
| 8 | + | |
| 9 | +from dlm.base_models import BASE_MODELS | |
| 10 | +from dlm.export.errors import ExportError | |
| 11 | +from dlm.export.manifest import load_export_manifest | |
| 12 | +from dlm.export.targets.mlx_serve import ( | |
| 13 | + LAUNCH_SCRIPT_FILENAME, | |
| 14 | + MLX_SERVE_TARGET, | |
| 15 | + finalize_mlx_serve_export, | |
| 16 | + prepare_mlx_serve_export, | |
| 17 | +) | |
| 18 | +from dlm.store.manifest import Manifest, load_manifest, save_manifest | |
| 19 | +from dlm.store.paths import for_dlm | |
| 20 | + | |
| 21 | +_SPEC = BASE_MODELS["smollm2-135m"] | |
| 22 | + | |
| 23 | + | |
| 24 | +def _write_adapter(path: Path) -> None: | |
| 25 | + path.mkdir(parents=True) | |
| 26 | + (path / "adapter_config.json").write_text("{}", encoding="utf-8") | |
| 27 | + (path / "adapter_model.safetensors").write_bytes(b"adapter") | |
| 28 | + | |
| 29 | + | |
| 30 | +def _fake_stage_mlx(src: Path, dst: Path, *, base_hf_id: str) -> Path: | |
| 31 | + assert src.exists() | |
| 32 | + assert base_hf_id == _SPEC.hf_id | |
| 33 | + dst.mkdir(parents=True, exist_ok=True) | |
| 34 | + (dst / "adapter_config.json").write_text("{}", encoding="utf-8") | |
| 35 | + (dst / "adapters.safetensors").write_bytes(b"mlx-adapter") | |
| 36 | + return dst | |
| 37 | + | |
| 38 | + | |
| 39 | +def _setup_flat_store(tmp_path: Path) -> object: | |
| 40 | + store = for_dlm("01MLXTEST", home=tmp_path) | |
| 41 | + store.ensure_layout() | |
| 42 | + save_manifest(store.manifest, Manifest(dlm_id="01MLXTEST", base_model=_SPEC.key)) | |
| 43 | + adapter = store.adapter_version(3) | |
| 44 | + _write_adapter(adapter) | |
| 45 | + store.set_current_adapter(adapter) | |
| 46 | + return store | |
| 47 | + | |
| 48 | + | |
| 49 | +def _setup_named_store(tmp_path: Path) -> object: | |
| 50 | + store = for_dlm("01MLXMULTI", home=tmp_path) | |
| 51 | + store.ensure_layout() | |
| 52 | + save_manifest(store.manifest, Manifest(dlm_id="01MLXMULTI", base_model=_SPEC.key)) | |
| 53 | + knowledge = store.adapter_version_for("knowledge", 2) | |
| 54 | + tone = store.adapter_version_for("tone", 4) | |
| 55 | + _write_adapter(knowledge) | |
| 56 | + _write_adapter(tone) | |
| 57 | + store.set_current_adapter_for("knowledge", knowledge) | |
| 58 | + store.set_current_adapter_for("tone", tone) | |
| 59 | + return store | |
| 60 | + | |
| 61 | + | |
| 62 | +class TestPrepareMlxServeExport: | |
| 63 | + def test_prepare_writes_launch_script_and_manifest( | |
| 64 | + self, tmp_path: Path, monkeypatch: object | |
| 65 | + ) -> None: | |
| 66 | + store = _setup_flat_store(tmp_path) | |
| 67 | + monkeypatch.setattr("dlm.export.targets.mlx_serve.is_apple_silicon", lambda: True) | |
| 68 | + monkeypatch.setattr("dlm.export.targets.mlx_serve.mlx_available", lambda: True) | |
| 69 | + monkeypatch.setattr("dlm.export.targets.mlx_serve.stage_mlx_adapter_dir", _fake_stage_mlx) | |
| 70 | + | |
| 71 | + prepared = prepare_mlx_serve_export( | |
| 72 | + store=store, | |
| 73 | + spec=_SPEC, | |
| 74 | + adapter_name=None, | |
| 75 | + adapter_path_override=None, | |
| 76 | + declared_adapter_names=None, | |
| 77 | + ) | |
| 78 | + manifest_path = finalize_mlx_serve_export( | |
| 79 | + store=store, | |
| 80 | + spec=_SPEC, | |
| 81 | + prepared=prepared, | |
| 82 | + smoke_output_first_line="hello from mlx", | |
| 83 | + adapter_name=None, | |
| 84 | + adapter_mix=None, | |
| 85 | + ) | |
| 86 | + | |
| 87 | + assert prepared.launch_script_path is not None | |
| 88 | + assert prepared.launch_script_path.name == LAUNCH_SCRIPT_FILENAME | |
| 89 | + script = prepared.launch_script_path.read_text(encoding="utf-8") | |
| 90 | + assert script.startswith("#!/usr/bin/env bash\nset -euo pipefail\n") | |
| 91 | + assert "python -m mlx_lm.server" in script | |
| 92 | + assert f"--model {_SPEC.hf_id}" in script | |
| 93 | + assert '--adapter-path "$SCRIPT_DIR/adapter"' in script | |
| 94 | + | |
| 95 | + export_manifest = load_export_manifest(prepared.export_dir) | |
| 96 | + assert manifest_path == prepared.manifest_path | |
| 97 | + assert export_manifest.target == "mlx-serve" | |
| 98 | + assert export_manifest.quant == "hf" | |
| 99 | + assert export_manifest.adapter_version == 3 | |
| 100 | + assert any(artifact.path == "mlx_serve_launch.sh" for artifact in export_manifest.artifacts) | |
| 101 | + assert any( | |
| 102 | + artifact.path == "adapter/adapters.safetensors" | |
| 103 | + for artifact in export_manifest.artifacts | |
| 104 | + ) | |
| 105 | + | |
| 106 | + store_manifest = load_manifest(store.manifest) | |
| 107 | + assert store_manifest.exports[-1].target == "mlx-serve" | |
| 108 | + assert store_manifest.exports[-1].quant == "hf" | |
| 109 | + assert store_manifest.exports[-1].smoke_output_first_line == "hello from mlx" | |
| 110 | + | |
| 111 | + def test_multi_adapter_export_requires_explicit_selection( | |
| 112 | + self, tmp_path: Path, monkeypatch: object | |
| 113 | + ) -> None: | |
| 114 | + store = _setup_named_store(tmp_path) | |
| 115 | + monkeypatch.setattr("dlm.export.targets.mlx_serve.is_apple_silicon", lambda: True) | |
| 116 | + monkeypatch.setattr("dlm.export.targets.mlx_serve.mlx_available", lambda: True) | |
| 117 | + | |
| 118 | + with pytest.raises(ExportError, match="one adapter at a time"): | |
| 119 | + prepare_mlx_serve_export( | |
| 120 | + store=store, | |
| 121 | + spec=_SPEC, | |
| 122 | + adapter_name=None, | |
| 123 | + adapter_path_override=None, | |
| 124 | + declared_adapter_names=("knowledge", "tone"), | |
| 125 | + ) | |
| 126 | + | |
| 127 | + def test_refuses_without_apple_silicon_runtime( | |
| 128 | + self, tmp_path: Path, monkeypatch: object | |
| 129 | + ) -> None: | |
| 130 | + store = _setup_flat_store(tmp_path) | |
| 131 | + monkeypatch.setattr("dlm.export.targets.mlx_serve.is_apple_silicon", lambda: False) | |
| 132 | + | |
| 133 | + with pytest.raises(ExportError, match="Apple Silicon"): | |
| 134 | + prepare_mlx_serve_export( | |
| 135 | + store=store, | |
| 136 | + spec=_SPEC, | |
| 137 | + adapter_name=None, | |
| 138 | + adapter_path_override=None, | |
| 139 | + declared_adapter_names=None, | |
| 140 | + ) | |
| 141 | + | |
| 142 | + | |
| 143 | +class TestMlxServeSmoke: | |
| 144 | + def test_smoke_uses_absolute_runtime_paths(self, tmp_path: Path, monkeypatch: object) -> None: | |
| 145 | + store = _setup_flat_store(tmp_path) | |
| 146 | + monkeypatch.setattr("dlm.export.targets.mlx_serve.is_apple_silicon", lambda: True) | |
| 147 | + monkeypatch.setattr("dlm.export.targets.mlx_serve.mlx_available", lambda: True) | |
| 148 | + monkeypatch.setattr("dlm.export.targets.mlx_serve.stage_mlx_adapter_dir", _fake_stage_mlx) | |
| 149 | + prepared = prepare_mlx_serve_export( | |
| 150 | + store=store, | |
| 151 | + spec=_SPEC, | |
| 152 | + adapter_name=None, | |
| 153 | + adapter_path_override=None, | |
| 154 | + declared_adapter_names=None, | |
| 155 | + ) | |
| 156 | + seen: list[list[str]] = [] | |
| 157 | + | |
| 158 | + def _fake_smoke(argv: list[str], **_: object) -> str: | |
| 159 | + seen.append(list(argv)) | |
| 160 | + return "mlx replied" | |
| 161 | + | |
| 162 | + monkeypatch.setattr("dlm.export.targets.mlx_serve.smoke_openai_compat_server", _fake_smoke) | |
| 163 | + | |
| 164 | + result = MLX_SERVE_TARGET.smoke_test(prepared) | |
| 165 | + | |
| 166 | + assert result.attempted is True | |
| 167 | + assert result.ok is True | |
| 168 | + assert result.detail == "mlx replied" | |
| 169 | + argv = seen[0] | |
| 170 | + assert argv[:3] == ["python", "-m", "mlx_lm.server"] | |
| 171 | + assert "$SCRIPT_DIR" not in " ".join(argv) | |
| 172 | + assert _SPEC.hf_id in argv | |
| 173 | + assert str(prepared.export_dir / "adapter") in argv | |
tests/unit/export/targets/test_registry.py (modified)

@@ -19,12 +19,13 @@ class TestRegistry:
| 19 | 19 | assert TARGETS["ollama"] is target |
| 20 | 20 | assert "llama-server" in TARGETS |
| 21 | 21 | assert "vllm" in TARGETS |
| 22 | - assert available_targets() == ("ollama", "llama-server", "vllm") | |
| 22 | + assert "mlx-serve" in TARGETS | |
| 23 | + assert available_targets() == ("ollama", "llama-server", "vllm", "mlx-serve") | |
| 23 | 24 | |
| 24 | 25 | def test_unknown_target_lists_available_targets(self) -> None: |
| 25 | 26 | with pytest.raises( |
| 26 | 27 | UnknownExportTargetError, |
| 27 | - match="available targets: ollama, llama-server, vllm", | |
| 28 | + match="available targets: ollama, llama-server, vllm, mlx-serve", | |
| 28 | 29 | ): |
| 29 | 30 | resolve_target("sglang") |
| 30 | 31 | |