sway Public

Watch 0 Fork 0 Star 0
Python · 52765 bytes Raw Blame History
  
        1
        """Command implementations for the ``sway`` CLI.
      
        2
        
        3
        Each function here is wired to a subcommand in :mod:`dlm_sway.cli.app`.
      
        4
        Commands deliberately do as little as possible themselves — the real
      
        5
        work lives in :mod:`dlm_sway.suite`, :mod:`dlm_sway.backends`, and the
      
        6
        probes package.
      
        7
        """
      
        8
        
        9
        from __future__ import annotations
      
        10
        
        11
        import json
      
        12
        import sys
      
        13
        from enum import StrEnum
      
        14
        from pathlib import Path
      
        15
        from typing import Annotated, Any
      
        16
        
        17
        import typer
      
        18
        from rich.console import Console
      
        19
        
        20
        from dlm_sway import __version__
      
        21
        from dlm_sway.core.errors import SwayError
      
        22
        from dlm_sway.core.result import SuiteResult, SwayScore, Verdict
      
        23
        
        24
        
        25
        def run_cmd(
            spec: Annotated[Path, typer.Argument(help="Path to a sway.yaml spec.")],
            json_out: Annotated[
                Path | None,
                typer.Option(
                    "--json",
                    "-j",
                    help="Write the JSON report to this path in addition to the terminal render.",
                ),
            ] = None,
            markdown_out: Annotated[
                Path | None,
                typer.Option("--markdown", "-m", help="Write a markdown report to this path."),
            ] = None,
            weights: Annotated[
                str | None,
                typer.Option(
                    "--weights",
                    help=(
                        "Override composite-score category weights. Format: "
                        "'adherence=0.4,attribution=0.3,calibration=0.2,ablation=0.1'. "
                        "Unspecified categories keep their defaults."
                    ),
                ),
            ] = None,
            dry_run: Annotated[
                bool,
                typer.Option(
                    "--dry-run",
                    help=(
                        "Validate the spec, list the probes that would run with their "
                        "category, and exit 0 — no backend is built (D6)."
                    ),
                ),
            ] = False,
            trace: Annotated[
                Path | None,
                typer.Option(
                    "--trace",
                    help=(
                        "Write a forward-pass trace (JSONL) to this path — one event "
                        "per backend scoring call with probe / view / cache-hit info. "
                        "Useful for perf investigation; zero overhead when unset."
                    ),
                ),
            ] = None,
        ) -> None:
            """Execute a suite and render a terminal report."""
            # NOTE: the docstring above doubles as the CLI help text, so further
            # documentation lives in comments.
            #
            # --dry-run only prints the probe listing; the weights flag is not
            # even parsed on that path.
            if dry_run:
                _print_dry_run(spec)
                return

            # Any SwayError from flag parsing or suite execution becomes a red
            # one-line message on stderr plus exit code 2.
            try:
                parsed_weights = _parse_weights_flag(weights)
                suite_result, composite = _execute_spec(
                    spec,
                    weights_override=parsed_weights,
                    trace_path=trace,
                )
            except SwayError as err:
                typer.secho(f"error: {err}", err=True, fg=typer.colors.RED)
                raise typer.Exit(code=2) from err

            # Imported only once a result exists.
            from dlm_sway.suite import report

            out = Console()
            report.to_terminal(suite_result, composite, console=out)

            # Optional on-disk copies of the same report, each acknowledged on
            # the terminal after it is written.
            if json_out is not None:
                json_text = report.to_json(suite_result, composite)
                json_out.write_text(json_text, encoding="utf-8")
                out.print(f"\n[dim]wrote JSON → {json_out}[/dim]")

            if markdown_out is not None:
                markdown_text = report.to_markdown(suite_result, composite)
                markdown_out.write_text(markdown_text, encoding="utf-8")
                out.print(f"[dim]wrote markdown → {markdown_out}[/dim]")
      
        94
        
        95
        
        96
        def _print_dry_run(spec_path: Path) -> None:
            """Validate the spec at ``spec_path`` and print its probe table (D6).

            The spec is loaded and every probe entry validated; a ``SwayError``
            from either step is reported on stderr and turned into exit code 2.
            No backend is constructed, so this gives fast feedback on spec
            edits before paying for a model load.
            """
            from rich.table import Table

            from dlm_sway.probes.base import build_probe, registry, validate_all_probes
            from dlm_sway.suite.loader import load_spec

            try:
                loaded = load_spec(spec_path)
                validate_all_probes(loaded.suite)
            except SwayError as err:
                typer.secho(f"error: {err}", err=True, fg=typer.colors.RED)
                raise typer.Exit(code=2) from err

            out = Console()
            out.print(f"[bold]dry-run for {spec_path}[/bold] — {len(loaded.suite)} probe(s)")
            out.print()

            listing = Table(show_header=True, header_style="bold", box=None, padding=(0, 1))
            # (header, extra add_column kwargs) — "kind" is the only unstyled column.
            column_layout = (
                ("#", {"style": "dim"}),
                ("name", {"style": "cyan"}),
                ("kind", {}),
                ("category", {"style": "dim"}),
                ("enabled", {"style": "dim"}),
            )
            for header, options in column_layout:
                listing.add_column(header, **options)

            known_probes = registry()
            for position, entry in enumerate(loaded.suite, start=1):
                probe, probe_spec = build_probe(entry)
                probe_cls = known_probes.get(probe.kind)
                # A kind missing from the registry is shown as "?".
                if probe_cls is None:
                    category = "?"
                else:
                    category = probe_cls.category
                enabled_label = "yes" if probe_spec.enabled else "no"
                listing.add_row(
                    str(position),
                    probe_spec.name,
                    probe.kind,
                    category,
                    enabled_label,
                )
            out.print(listing)
      
        137
        
        138
        
        139
        def list_probes_cmd() -> None:
            """List every shipped probe kind with its category + one-line summary (D6)."""
            # NOTE: the docstring above doubles as the CLI help text.
            from rich.table import Table

            # Make sure every probe module has been imported and registered.
            import dlm_sway.probes  # noqa: F401
            from dlm_sway.probes.base import registry

            table = Table(show_header=True, header_style="bold", box=None, padding=(0, 1))
            table.add_column("kind", style="cyan")
            table.add_column("category", style="dim")
            table.add_column("summary")

            # Look the registry up once instead of once per row.
            probes = registry()
            for kind in sorted(probes):
                cls = probes[kind]
                # Prefer the class-level docstring, then fall back to the
                # defining module's module-level docstring. Most probe modules
                # lead with a solid one-liner at the top; the class body often
                # skips a docstring to avoid repeating it.
                summary = _first_doc_line(cls.__doc__)
                if not summary:
                    # ``sys`` is the module-level import; the module may be
                    # absent from sys.modules, in which case the summary stays "".
                    module = sys.modules.get(cls.__module__)
                    summary = _first_doc_line(getattr(module, "__doc__", None))
                table.add_row(kind, cls.category, summary)
            Console().print(table)
      
        165
        
        166
        
        167
        def _first_doc_line(doc: str | None) -> str:
      
        168
            """Return the first non-empty line of ``doc``, stripped."""
      
        169
            if not doc:
      
        170
                return ""
      
        171
            for line in doc.splitlines():
      
        172
                stripped = line.strip()
      
        173
                if stripped:
      
        174
                    return stripped
      
        175
            return ""
      
        176
        
        177
        
        178
        def gate_cmd(
            spec: Annotated[Path, typer.Argument(help="Path to a sway.yaml spec.")],
            junit_out: Annotated[
                Path | None, typer.Option("--junit", help="Write JUnit XML for CI ingestion.")
            ] = None,
            coverage_threshold: Annotated[
                float | None,
                typer.Option(
                    "--threshold",
                    help="Override the spec's coverage_threshold. Exit non-zero below it.",
                ),
            ] = None,
            weights: Annotated[
                str | None,
                typer.Option(
                    "--weights",
                    help=(
                        "Override composite-score category weights. Format: "
                        "'adherence=0.4,attribution=0.3,calibration=0.2,ablation=0.1'. "
                        "Unspecified categories keep their defaults."
                    ),
                ),
            ] = None,
        ) -> None:
            """Execute a suite and exit non-zero on failure (CI gate)."""
            # NOTE: the docstring above doubles as the CLI help text.
            #
            # Exit codes: 2 on a SwayError (bad flags / spec / execution),
            # 1 when the gate fails, 0 (normal return) when it passes.
            try:
                weights_override = _parse_weights_flag(weights)
                result, score_obj = _execute_spec(spec, weights_override=weights_override)
            except SwayError as exc:
                typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED)
                raise typer.Exit(code=2) from exc

            from dlm_sway.suite import report
            from dlm_sway.suite.loader import load_spec as _load_spec

            console = Console()
            report.to_terminal(result, score_obj, console=console)

            if junit_out is not None:
                junit_out.write_text(report.to_junit(result, score_obj), encoding="utf-8")
                console.print(f"[dim]wrote JUnit → {junit_out}[/dim]")

            # The --threshold flag wins; otherwise fall back to the spec's own
            # default. The spec is re-read here, so give that load the same
            # SwayError handling as the run above.
            if coverage_threshold is not None:
                threshold = coverage_threshold
            else:
                try:
                    threshold = _load_spec(spec).defaults.coverage_threshold
                except SwayError as exc:
                    typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED)
                    raise typer.Exit(code=2) from exc

            has_failures = any(p.verdict == Verdict.FAIL for p in result.probes)
            below_threshold = score_obj.overall < threshold
            if has_failures or below_threshold:
                # Report every reason the gate tripped; with a single reason the
                # message is the same text as before.
                reasons: list[str] = []
                if below_threshold:
                    reasons.append(f"overall={score_obj.overall:.2f} < {threshold:.2f}")
                if has_failures:
                    reasons.append("at least one probe reported FAIL")
                console.print("\n[red]gate FAILED[/red] — " + "; ".join(reasons))
                raise typer.Exit(code=1)
            console.print(f"\n[green]gate passed[/green] — overall={score_obj.overall:.2f}")
      
        235
        
        236
        
        237
        def _infer_base_from_adapter_config(adapter_dir: Path) -> str | None:
      
        238
            """Read ``base_model_name_or_path`` from ``adapter_config.json``.
      
        239
        
        240
            Returns ``None`` when the file is missing, malformed, or doesn't
      
        241
            expose the field. Used by ``sway check`` to make ``--base`` optional
      
        242
            in the common case where PEFT already wrote the base id on training
      
        243
            (D4).
      
        244
            """
      
        245
            cfg_path = adapter_dir / "adapter_config.json"
      
        246
            if not cfg_path.exists():
      
        247
                return None
      
        248
            try:
      
        249
                data = json.loads(cfg_path.read_text(encoding="utf-8"))
      
        250
            except (OSError, json.JSONDecodeError):
      
        251
                return None
      
        252
            base = data.get("base_model_name_or_path")
      
        253
            if isinstance(base, str) and base:
      
        254
                return base
      
        255
            return None
      
        256
        
        257
        
        258
        def _check_banner(score_obj: SwayScore, result: SuiteResult) -> tuple[str, str]:
      
        259
            """Compute the (text, rich-style) check verdict banner (D12).
      
        260
        
        261
            Calibrated on the delta_kl z-score: ≥3σ is green ("above noise"),
      
        262
            ≥1σ is yellow ("marginal"), and below that is red. When no z-score
      
        263
            is available (no null calibration ran), falls back to the raw
      
        264
            score band.
      
        265
            """
      
        266
            z = next(
      
        267
                (p.z_score for p in result.probes if p.kind == "delta_kl" and p.z_score is not None),
      
        268
                None,
      
        269
            )
      
        270
            if z is not None:
      
        271
                if z >= 3.0:
      
        272
                    return f"✅ adapter is {z:+.2f}σ above noise", "bold green"
      
        273
                if z >= 1.0:
      
        274
                    return f"⚠️ adapter is {z:+.2f}σ above noise — marginal", "bold yellow"
      
        275
                return f"❌ adapter is {z:+.2f}σ — indistinguishable from noise", "bold red"
      
        276
        
        277
            # Fallback: composite score band.
      
        278
            if score_obj.overall >= 0.6:
      
        279
                return f"✅ adapter scored {score_obj.overall:.2f} — looks healthy", "bold green"
      
        280
            if score_obj.overall >= 0.3:
      
        281
                return f"⚠️ adapter scored {score_obj.overall:.2f} — partial fit", "bold yellow"
      
        282
            return f"❌ adapter scored {score_obj.overall:.2f} — noise band", "bold red"
      
        283
        
        284
        
        285
        def check_cmd(
            adapter: Annotated[Path, typer.Argument(help="Path to a PEFT adapter directory.")],
            base: Annotated[
                str | None,
                typer.Option(
                    "--base",
                    help=(
                        "HuggingFace base model id or local path. Inferred from "
                        "the adapter's ``adapter_config.json`` when omitted (D4)."
                    ),
                ),
            ] = None,
            prompts: Annotated[
                Path | None,
                typer.Option(
                    "--prompts",
                    help="File with one prompt per line. Defaults to sway's built-in quick set.",
                ),
            ] = None,
        ) -> None:
            """<60s smoke test: "is this adapter doing anything at all?".

            Runs A1 DeltaKL + C2 CalibrationDrift on a small prompt set. No
            spec file required.

            **Banner semantics (F20 clarification).** The ``+N.NNσ above noise``
            header appears only when ``null_adapter`` actually calibrated this
            run — i.e., when the backend implements ``NullCalibratedBackend``.
            Without null calibration (non-HF backends like the HTTP API or MLX
            inference), the banner falls back to the composite score band
            ("healthy", "partial fit", "noise band") and the σ wording is
            suppressed to avoid a false-precision claim.
            """
            from dlm_sway.backends import build as build_backend
            from dlm_sway.core.model import ModelSpec
            from dlm_sway.suite import report
            from dlm_sway.suite.runner import run as run_suite
            from dlm_sway.suite.score import compute as compute_score
            from dlm_sway.suite.spec import SuiteDefaults, SuiteModels, SwaySpec

            # D4: try to infer base model from adapter_config.json before
            # erroring out on a missing --base.
            if base is None:
                inferred = _infer_base_from_adapter_config(adapter)
                if inferred is None:
                    typer.secho(
                        f"error: --base not given and adapter at {adapter} doesn't carry a "
                        f"base_model_name_or_path in adapter_config.json. Pass --base "
                        f"explicitly.",
                        err=True,
                        fg=typer.colors.RED,
                    )
                    raise typer.Exit(code=2)
                base = inferred
                typer.secho(f"(inferred base model: {base})", err=True, fg=typer.colors.CYAN)

            quick_prompts = _load_prompts(prompts) if prompts else _BUILTIN_QUICK_PROMPTS

            # Synthesize an in-memory spec: same base for both sides, with the
            # adapter applied only on the fine-tuned side.
            base_spec = ModelSpec(base=base, kind="hf")
            ft_spec = ModelSpec(base=base, kind="hf", adapter=adapter)
            spec = SwaySpec(
                version=1,
                models=SuiteModels(base=base_spec, ft=ft_spec),
                defaults=SuiteDefaults(seed=0),
                suite=[
                    # S25: pre-run training-health check first. SKIPs cleanly
                    # when the adapter wasn't produced by dlm (no
                    # training_state.pt); FAILs loudly on severely-undertrained
                    # adapters with a banner before the rest of the output.
                    {
                        "name": "quick_gradient_ghost",
                        "kind": "gradient_ghost",
                        "adapter_path": str(adapter),
                    },
                    # Calibrate first so delta_kl can publish a z-score the
                    # banner reads off.
                    {"name": "quick_null", "kind": "null_adapter", "runs": 3},
                    {
                        "name": "quick_delta_kl",
                        "kind": "delta_kl",
                        "prompts": list(quick_prompts),
                        "assert_mean_gte": 0.01,
                    },
                    {
                        "name": "quick_calibration",
                        "kind": "calibration_drift",
                        "items_limit": 10,
                    },
                ],
            )

            # Build and run under one SwayError handler (as diff_cmd does) so a
            # failure while running the suite is reported as a clean exit 2
            # rather than a traceback. The backend is always closed once built.
            try:
                backend = build_backend(ft_spec)
                try:
                    result = run_suite(spec, backend, spec_path="<check>")
                finally:
                    _close_if_possible(backend)
            except SwayError as exc:
                typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED)
                raise typer.Exit(code=2) from exc
            score_obj = compute_score(result)

            # D12: top-line banner before the full report so a user looking
            # only at the first line still gets the verdict.
            console = Console()

            # S25 — pre-flight gradient_ghost banner. Fires BEFORE the verdict
            # banner so the user sees "this adapter is undertrained" first;
            # the rest of the check output stays for context (the user might
            # still want to see how badly the other probes scored).
            _emit_gradient_ghost_banner(result, console)

            banner_text, banner_style = _check_banner(score_obj, result)
            console.print()
            console.print(banner_text, style=banner_style)
            console.print()
            report.to_terminal(result, score_obj, console=console)
      
        402
        
        403
        
        404
        def _emit_gradient_ghost_banner(result: object, console: Console) -> None:
      
        405
            """Print a yellow/red ⚠️ banner if gradient_ghost FAILed (S25 P6).
      
        406
        
        407
            Reaches into ``result.probes`` for any probe with
      
        408
            ``kind=gradient_ghost`` and verdict FAIL. Informational — no
      
        409
            effect on exit code; the user might still want to inspect the
      
        410
            other probes' verdicts.
      
        411
            """
      
        412
            probes = getattr(result, "probes", ()) or ()
      
        413
            for p in probes:
      
        414
                if getattr(p, "kind", "") != "gradient_ghost":
      
        415
                    continue
      
        416
                verdict_str = str(getattr(p, "verdict", "")).lower()
      
        417
                if verdict_str == "fail":
      
        418
                    console.print()
      
        419
                    console.print(
      
        420
                        "⚠️  PRE-RUN ALERT — gradient_ghost flagged severe undertraining",
      
        421
                        style="bold red",
      
        422
                    )
      
        423
                    msg = getattr(p, "message", "")
      
        424
                    if msg:
      
        425
                        console.print(f"   {msg}", style="red")
      
        426
                    console.print(
      
        427
                        "   The probe scores below may be unreliable. Consider retraining.",
      
        428
                        style="dim red",
      
        429
                    )
      
        430
                    return
      
        431
                if verdict_str == "warn":
      
        432
                    console.print()
      
        433
                    console.print(
      
        434
                        "⚠️  gradient_ghost: training may not have fully converged",
      
        435
                        style="bold yellow",
      
        436
                    )
      
        437
                    msg = getattr(p, "message", "")
      
        438
                    if msg:
      
        439
                        console.print(f"   {msg}", style="yellow")
      
        440
                    return
      
        441
        
        442
        
        443
        def diff_cmd(
            spec: Annotated[Path, typer.Argument(help="Path to a sway.yaml spec.")],
            adapter_a: Annotated[Path, typer.Option("--a", help="First adapter path.")],
            adapter_b: Annotated[Path, typer.Option("--b", help="Second adapter path.")],
        ) -> None:
            """Run the same suite against two adapters and show per-probe deltas."""
            # NOTE: the docstring above doubles as the CLI help text.
            from dlm_sway.backends import build as build_backend
            from dlm_sway.suite.loader import load_spec
            from dlm_sway.suite.runner import run as run_suite
            from dlm_sway.suite.score import compute as compute_score

            console = Console()

            def _score_for(adapter_path: Path) -> tuple[float, dict[str, float]]:
                """Run the suite with ``adapter_path`` swapped in; return (overall, per-probe scores)."""
                # ``sway_spec`` is bound inside the try block below before the
                # first call to this helper.
                ft_spec = sway_spec.models.ft.model_copy(update={"adapter": adapter_path})
                backend = build_backend(ft_spec)
                try:
                    result = run_suite(sway_spec, backend, spec_path=str(spec))
                finally:
                    _close_if_possible(backend)
                scored = compute_score(result)
                # A missing (None) probe score is treated as 0.0.
                per_probe = {p.name: (p.score or 0.0) for p in result.probes}
                return scored.overall, per_probe

            # Spec loading sits under the same handler as the runs so a bad
            # spec yields a clean "error: ..." + exit 2 instead of a traceback.
            try:
                sway_spec = load_spec(spec)
                overall_a, per_a = _score_for(adapter_a)
                overall_b, per_b = _score_for(adapter_b)
            except SwayError as exc:
                typer.secho(f"error: {exc}", err=True, fg=typer.colors.RED)
                raise typer.Exit(code=2) from exc

            console.print(f"[bold]overall[/bold]  A: {overall_a:.2f}   B: {overall_b:.2f}")
            console.print()
            console.print("[bold]per-probe[/bold] (A → B, Δ):")
            regressed_small = 0  # Δ < -0.10 (also counts the large regressions)
            regressed_large = 0  # Δ < -0.20
            for name in sorted(per_a.keys() | per_b.keys()):
                # A probe present on only one side scores 0.0 on the other.
                a = per_a.get(name, 0.0)
                b = per_b.get(name, 0.0)
                delta = b - a
                # The "+" format flag already emits the sign; no manual prefix,
                # which previously rendered improvements as "++0.05".
                console.print(f"  {name:<30}  {a:.2f}  →  {b:.2f}   ({delta:+.2f})")
                if delta < -0.10:
                    regressed_small += 1
                if delta < -0.20:
                    regressed_large += 1

            # D13: regression summary line. The audit's example phrasing was
            # "A→B: 3 probes regressed >0.10, 1 regressed >0.20, composite Δ=+0.02".
            # Color cue tracks the composite delta: green for any improvement,
            # red on regression, yellow on flat-with-regressions.
            composite_delta = overall_b - overall_a
            if composite_delta > 0.0:
                summary_style = "bold green"
            elif regressed_small or regressed_large:
                summary_style = "bold red" if composite_delta < 0.0 else "bold yellow"
            else:
                summary_style = "dim"

            console.print()
            console.print(
                f"A→B: {regressed_small} probe(s) regressed >0.10, "
                f"{regressed_large} regressed >0.20, "
                f"composite Δ={composite_delta:+.2f}",
                style=summary_style,
            )
      
        510
        
        511
        
        512
        def autogen_cmd(
            dlm_path: Annotated[Path, typer.Argument(help="Path to a .dlm file.")],
            out: Annotated[
                Path,
                typer.Option("--out", "-o", help="Where to write the generated sway.yaml."),
            ] = Path("sway.yaml"),
        ) -> None:
            """Generate a sway.yaml from a .dlm file (requires the ``dlm-sway[dlm]`` extra)."""
            from importlib import import_module

            # The generator lives behind the optional ``dlm`` extra, so it is
            # resolved lazily and a failed import becomes an install hint.
            try:
                generator = import_module("dlm_sway.integrations.dlm.autogen")
            except ImportError as missing:
                hint = "dlm integration not installed — run: pip install 'dlm-sway[dlm]'"
                typer.secho(hint, err=True, fg=typer.colors.RED)
                raise typer.Exit(code=2) from missing

            # Failures raised as SwayError are reported on stderr with exit code 2.
            try:
                generator.write_sway_yaml(dlm_path, out)
            except SwayError as failure:
                typer.secho(f"error: {failure}", err=True, fg=typer.colors.RED)
                raise typer.Exit(code=2) from failure
            else:
                typer.echo(f"wrote {out}")
      
        539
        
        540
        
        541
        # Extra name → importable modules whose presence/version is probed
        # for the doctor payload.
        _DOCTOR_BACKENDS: dict[str, tuple[str, ...]] = {
            "hf": ("torch", "transformers", "peft"),
            "mlx": ("mlx", "mlx_lm"),
            # ``sklearn`` backs S16's cluster_kl; it lives under [semsim] so it
            # shares the 80 MB MiniLM load that adapter_revert already pulls.
            "semsim": ("sentence_transformers", "sklearn"),
            "style": ("spacy", "textstat", "nlpaug"),
            "dlm": ("dlm",),
            # ``sway report --format html`` depends on ``plotly``; the S12 docs
            # listed it, but doctor did not probe it until F04.
            "viz": ("matplotlib", "plotly"),
            # Dependencies of the S13 API backend.
            "api": ("httpx", "tenacity"),
            "pytest": ("pytest",),
        }
      
        556
        
        557
        
        558
        def _doctor_payload() -> dict[str, Any]:
            """Assemble the doctor report as plain, JSON-serializable data.

            The result carries the sway version, the Python version, the
            platform identifier, and — per extra in ``_DOCTOR_BACKENDS`` — a
            mapping of module name to the value ``_module_version`` reports
            for it. Both doctor render paths consume this one structure.
            """
            probed: dict[str, dict[str, str | None]] = {
                extra_name: {module_name: _module_version(module_name) for module_name in module_names}
                for extra_name, module_names in _DOCTOR_BACKENDS.items()
            }
            # ``sys.version`` starts with the bare version number; keep only that.
            interpreter_version = sys.version.split()[0]
            return {
                "sway_version": __version__,
                "python": interpreter_version,
                "platform": sys.platform,
                "extras": probed,
            }
      
        569
        
        570
        
        571
        def _module_version(name: str) -> str | None:
            """Return the installed module's ``__version__`` string, or ``None``.

            ``None`` means the module could not be imported. A module that
            imports cleanly but defines no ``__version__`` is reported as the
            literal ``"installed"``.

            Every exception raised during the import counts as "not
            importable", not only :class:`ImportError`: an optional dependency
            can fail at import time with other exception types, and one such
            module must not abort the whole doctor report.
            """
            import importlib

            try:
                mod = importlib.import_module(name)
            except Exception:  # noqa: BLE001 — diagnostic boundary; see docstring
                return None
            return str(getattr(mod, "__version__", "installed"))
      
        580
        
        581
        
        582
        def doctor_cmd(
            json_out: Annotated[
                bool,
                typer.Option(
                    "--json",
                    help=(
                        "Emit a machine-readable JSON payload instead of the rich "
                        "terminal layout (D7). CI-grep-friendly."
                    ),
                ),
            ] = False,
        ) -> None:
            """Print backend availability and version info."""
            report_data = _doctor_payload()

            # Machine-readable path: dump the payload and stop before any rich output.
            if json_out:
                typer.echo(json.dumps(report_data, indent=2, sort_keys=True))
                return

            term = Console()
            term.print(f"[bold]sway[/bold] {report_data['sway_version']}")
            term.print(f"  python:    {report_data['python']}")
            term.print(f"  platform:  {report_data['platform']}")
            term.print()
            term.print("[bold]backends[/bold]")
            # One row per extra: a module is green with its version, or red and
            # labelled "missing" when its probed version is None.
            for extra_name, probed in report_data["extras"].items():
                cells = [
                    f"[red]{module}: missing[/red]"
                    if version is None
                    else f"[green]{module}: {version}[/green]"
                    for module, version in probed.items()
                ]
                row = " ".join(cells)
                term.print(f"  {extra_name:<8}  {row}")
      
        614
        
        615
        
        616
        class ReportFormat(StrEnum):
      
        617
            """Allowed values for ``sway report --format`` (D11).
      
        618
        
        619
            Typer enforces the enum at parse time, so unknown formats produce
      
        620
            a clear ``Invalid value`` error instead of silently falling back
      
        621
            to the terminal renderer.
      
        622
            """
      
        623
        
        624
            TERMINAL = "terminal"
      
        625
            MARKDOWN = "md"
      
        626
            MARKDOWN_LONG = "markdown"  # alias kept for muscle memory
      
        627
            JUNIT = "junit"
      
        628
            JSON = "json"
      
        629
            HTML = "html"
      
        630
        
        631
        
        632
        def report_cmd(
            result_json: Annotated[Path, typer.Argument(help="Path to a saved result JSON.")],
            format: Annotated[
                ReportFormat,
                typer.Option(
                    "--format",
                    help="Output format: terminal, md (alias: markdown), junit, json, or html.",
                ),
            ] = ReportFormat.TERMINAL,
            out: Annotated[
                Path | None,
                typer.Option(
                    "--out",
                    "-o",
                    help=(
                        "Write the rendered output to this path instead of stdout. "
                        "Required for --format html (Plotly's inlined JS is ~3 MB)."
                    ),
                ),
            ] = None,
        ) -> None:
            """Re-render a previously saved run (for history tracking / dashboards).

            The CLI deserializes the JSON back into the canonical
            ``(SuiteResult, SwayScore)`` pair via :func:`report.from_json`,
            then routes through the same renderers as a fresh ``sway run``.
            Single source for every format keeps terminal / md / junit /
            json / html output identical regardless of where they came from (B16).
            """
            from dlm_sway.suite import report

            # An unreadable or malformed result file is a usage error: report it
            # on stderr and exit 2 (as ``sway compare`` does) instead of letting
            # a traceback escape. ``ValueError`` covers both
            # ``json.JSONDecodeError`` and ``UnicodeDecodeError``.
            try:
                raw: dict[str, Any] = json.loads(result_json.read_text(encoding="utf-8"))
            except (OSError, ValueError) as exc:
                typer.echo(f"sway report: cannot read {result_json}: {exc}", err=True)
                raise typer.Exit(code=2) from exc

            # Every format renders from the same round-tripped pair.
            suite, score = report.from_json(raw)

            if format is ReportFormat.JSON:
                # Pass-through: the saved file *is* the canonical JSON. Re-emit
                # via to_json against the round-tripped pair so any schema
                # additions land consistently.
                _emit(report.to_json(suite, score), out)
                return
            if format in (ReportFormat.MARKDOWN, ReportFormat.MARKDOWN_LONG):
                _emit(report.to_markdown(suite, score), out)
                return
            if format is ReportFormat.JUNIT:
                _emit(report.to_junit(suite, score), out)
                return
            if format is ReportFormat.HTML:
                if out is None:
                    # Refuse to dump 3 MB of HTML to stdout by default — the
                    # user almost always wants a file. Checked before rendering
                    # so a doomed invocation does not build the page first.
                    typer.echo(
                        "sway report --format html requires --out PATH "
                        "(Plotly JS bundle is ~3 MB; stdout is not an HTML viewer)",
                        err=True,
                    )
                    raise typer.Exit(code=2)
                try:
                    from dlm_sway.suite import report_html
                except ImportError as exc:  # pragma: no cover — graceful install hint
                    typer.echo(f"sway report --format html: {exc}", err=True)
                    raise typer.Exit(code=2) from exc
                try:
                    html_text = report_html.to_html(suite, score)
                except RuntimeError as exc:
                    typer.echo(f"sway report --format html: {exc}", err=True)
                    raise typer.Exit(code=2) from exc
                out.write_text(html_text, encoding="utf-8")
                typer.echo(f"wrote HTML → {out}", err=True)
                return
            # ReportFormat.TERMINAL.
            if out is not None:
                typer.echo(
                    "sway report --format terminal does not support --out; "
                    "use --format md or --format html for file output.",
                    err=True,
                )
                raise typer.Exit(code=2)
            report.to_terminal(suite, score, console=Console())
      
        712
        
        713
        
        714
        def _emit(text: str, out: Path | None) -> None:
            """Deliver rendered ``text`` to ``out`` when given, otherwise to stdout.

            A file write (UTF-8) is followed by a ``wrote <path>`` notice on
            stderr.
            """
            if out is not None:
                out.write_text(text, encoding="utf-8")
                typer.echo(f"wrote {out}", err=True)
                return
            typer.echo(text)
      
        721
        
        722
        
        723
        class CompareFormat(StrEnum):
      
        724
            """Allowed values for ``sway compare --format``."""
      
        725
        
        726
            TERMINAL = "terminal"
      
        727
            MARKDOWN = "md"
      
        728
            MARKDOWN_LONG = "markdown"  # alias kept for muscle memory
      
        729
            JSON = "json"
      
        730
        
        731
        
        732
        def compare_cmd(
            result_jsons: Annotated[
                list[Path],
                typer.Argument(help="Two or more saved result JSONs, in chronological order."),
            ],
            format: Annotated[
                CompareFormat,
                typer.Option(
                    "--format",
                    help="Output format: terminal, md (alias: markdown), or json.",
                ),
            ] = CompareFormat.TERMINAL,
            fail_on_regression: Annotated[
                float,
                typer.Option(
                    "--fail-on-regression",
                    help=(
                        "Exit non-zero when any probe's score in the newest run dropped "
                        "by ≥ this threshold vs the previous run. 0 disables the gate."
                    ),
                ),
            ] = 0.0,
        ) -> None:
            """Compare N saved runs side-by-side (regression dashboard).

            Rehydrates each JSON via :func:`report.from_json`, folds the runs
            into a :class:`CompareMatrix`, and renders the score table + delta
            columns + composite timeline. Intended for CI: point at a history
            directory (``sway-history/*.json``) and pipe the output into the
            build's log, or set ``--fail-on-regression`` to make the build red
            on a real drop.
            """
            from dlm_sway.suite import compare, report

            # A comparison needs a baseline and at least one later run.
            if len(result_jsons) < 2:
                typer.echo("sway compare: need at least two result JSONs", err=True)
                raise typer.Exit(code=2)

            runs: list[tuple[SuiteResult, SwayScore]] = []
            run_labels: list[str] = []
            for json_path in result_jsons:
                # Unreadable or malformed input is a usage error (exit 2).
                try:
                    file_text = json_path.read_text(encoding="utf-8")
                    decoded: dict[str, Any] = json.loads(file_text)
                except (OSError, json.JSONDecodeError) as exc:
                    typer.echo(f"sway compare: cannot read {json_path}: {exc}", err=True)
                    raise typer.Exit(code=2) from exc
                runs.append(report.from_json(decoded))
                # Each run is labelled with its file name minus the ``.json`` suffix.
                run_labels.append(json_path.stem)

            matrix = compare.build_matrix(runs, labels=run_labels)

            wants_markdown = format in (CompareFormat.MARKDOWN, CompareFormat.MARKDOWN_LONG)
            if format is CompareFormat.JSON:
                rendered = compare.render_json(matrix, regression_threshold=fail_on_regression)
                typer.echo(rendered)
            elif wants_markdown:
                rendered = compare.render_markdown(matrix, regression_threshold=fail_on_regression)
                typer.echo(rendered)
            else:
                compare.render_terminal(
                    matrix,
                    console=Console(),
                    regression_threshold=fail_on_regression,
                )

            # Exit-code gate. A threshold that is not strictly positive turns
            # the gate off; otherwise any regression the matrix reports for
            # the newest run fails the command with exit code 1.
            if not fail_on_regression > 0.0:
                return
            if matrix.latest_regressions(fail_on_regression):
                raise typer.Exit(code=1)
      
        801
        
        802
        
        803
        class TraceFormat(StrEnum):
      
        804
            """Allowed values for ``sway trace --format``."""
      
        805
        
        806
            TERMINAL = "terminal"
      
        807
            MARKDOWN = "md"
      
        808
            MARKDOWN_LONG = "markdown"  # alias kept for muscle memory
      
        809
            JSON = "json"
      
        810
        
        811
        
        812
        def trace_cmd(
            trace_file: Annotated[
                Path,
                typer.Argument(
                    help=("Path to a forward-pass trace JSONL produced by `sway run --trace <path>`."),
                ),
            ],
            format: Annotated[
                TraceFormat,
                typer.Option(
                    "--format",
                    help="Output format: terminal, md (alias: markdown), or json.",
                ),
            ] = TraceFormat.TERMINAL,
            slowest: Annotated[
                int,
                typer.Option(
                    "--slowest",
                    help="How many slowest-events rows to show. 0 hides that table.",
                ),
            ] = 10,
        ) -> None:
            """Analyze a forward-pass trace JSONL.

            Reads the per-event JSONL `sway run --trace` writes, aggregates
            into per-probe + per-view summaries, and surfaces the top-N
            slowest events. Intended for suite-performance investigation:
            point at a captured trace and see which probe × view pair
            dominated wall time, whether the S07 cache helped, and which
            individual prompts took the longest.
            """
            from dlm_sway.suite import trace_analysis

            # An unreadable trace exits 2; a readable trace with no events exits 1.
            try:
                events = trace_analysis.load(trace_file)
            except OSError as exc:
                typer.echo(f"sway trace: cannot read {trace_file}: {exc}", err=True)
                raise typer.Exit(code=2) from exc
            if not events:
                typer.echo(f"sway trace: no events in {trace_file}", err=True)
                raise typer.Exit(code=1)

            # A negative --slowest is clamped to 0 before it reaches build_report.
            row_limit = slowest if slowest > 0 else 0
            summary = trace_analysis.build_report(events, slowest_k=row_limit)

            if format in (TraceFormat.MARKDOWN, TraceFormat.MARKDOWN_LONG):
                typer.echo(trace_analysis.render_markdown(summary))
            elif format is TraceFormat.JSON:
                typer.echo(trace_analysis.render_json(summary))
            else:
                trace_analysis.render_terminal(summary, console=Console())
      
        862
        
        863
        
        864
        class MineMode(StrEnum):
      
        865
            """``sway mine`` operates in one of two modes."""
      
        866
        
        867
            PARAPHRASE = "paraphrase"
      
        868
            OUTLIERS = "outliers"
      
        869
        
        870
        
        871
        def mine_cmd(
            spec: Annotated[Path, typer.Argument(help="Path to a sway.yaml spec.")],
            mode: Annotated[
                MineMode,
                typer.Option(
                    "--mode",
                    help=(
                        "``paraphrase``: sharpen every paraphrase_invariance case with mined "
                        "adversarial paraphrases. ``outliers``: rank the spec's delta_kl prompts "
                        "(or a corpus-derived pool) by per-prompt raw."
                    ),
                ),
            ] = MineMode.PARAPHRASE,
            out: Annotated[
                Path | None,
                typer.Option(
                    "--out",
                    "-o",
                    help=(
                        "Where to write the mined YAML fragment. Defaults to "
                        "``sway-mined-<mode>.yaml`` in the current directory."
                    ),
                ),
            ] = None,
            top_k: Annotated[
                int,
                typer.Option(
                    "--top-k", help="Keep the top-K candidates per case (paraphrase) or pool (outliers)."
                ),
            ] = 10,
            n_candidates: Annotated[
                int,
                typer.Option(
                    "--n-candidates",
                    help=(
                        "Paraphrase mode only — generate this many raw candidates before the "
                        "diversity filter. Higher = more coverage at more wall-time cost."
                    ),
                ),
            ] = 50,
            from_corpus: Annotated[
                str | None,
                typer.Option(
                    "--from-corpus",
                    help=(
                        "Outliers mode — draw the candidate pool from a packaged corpus "
                        "(``public_domain_en``) instead of the spec's own prompts."
                    ),
                ),
            ] = None,
            seed: Annotated[
                int,
                typer.Option(
                    "--seed",
                    help=(
                        "Seed for the generator + probe RNGs. Keep fixed to reproduce a "
                        "previous mining run — nlpaug's synonym and back-translation picks "
                        "are deterministic under this seed."
                    ),
                ),
            ] = 0,
        ) -> None:
            """Mine adversarial paraphrases or outlier prompts from a spec.

            **Paraphrase mode** (``--mode paraphrase``). For each
            ``paraphrase_invariance`` case in the spec, generate candidate
            paraphrases, diversity-filter them, and rank by the gap between
            verbatim and paraphrased lift. The emitted YAML fragment contains
            updated ``cases:`` that you can paste over the originals in your
            spec — a memorizing adapter that passed the hand-written list will
            typically fail the mined list.

            **Outliers mode** (``--mode outliers``). Rank the spec's
            ``delta_kl`` prompt pool (or a corpus-derived pool via
            ``--from-corpus``) by per-prompt raw divergence. Emitted fragment
            lists top-K and bottom-K prompts, split into two blocks.

            The mined output is paste-compatible with the spec loader — no
            schema bumps. Re-run ``sway gate`` after merging the mined list
            to confirm the gate's behavior changed as expected.
            """
            import yaml

            from dlm_sway.mining.outlier_miner import corpus_prompts, mine_outliers
            from dlm_sway.mining.paraphrase_miner import mine_paraphrases
            from dlm_sway.suite.loader import load_spec

            loaded_spec = load_spec(spec)
            out_path = out or Path(f"sway-mined-{mode.value}.yaml")

            # Outliers mode: resolve and validate the candidate pool *before*
            # the backend is built, so an empty pool is rejected without first
            # paying for backend construction.
            candidate_pool: Any = None
            if mode is not MineMode.PARAPHRASE:
                candidate_pool = (
                    corpus_prompts(from_corpus) if from_corpus else _collect_delta_kl_prompts(loaded_spec)
                )
                if not candidate_pool:
                    # The message names the real flag spelling (``--mode outliers``).
                    typer.secho(
                        "sway mine --mode outliers: no candidate prompts found. Either add delta_kl "
                        "prompts to the spec or pass --from-corpus.",
                        err=True,
                        fg=typer.colors.RED,
                    )
                    raise typer.Exit(code=2)

            # Materialize the backend. Reuses ``_execute_spec``'s factory so the
            # HF / API / MLX selection matches what ``sway run`` would do.
            from dlm_sway.backends import build as build_backend

            backend = build_backend(loaded_spec.models.ft)

            if mode is MineMode.PARAPHRASE:
                payload = _mine_paraphrase_payload(
                    loaded_spec,
                    backend,
                    mine_paraphrases,
                    top_k=top_k,
                    n_candidates=n_candidates,
                    seed=seed,
                )
            else:
                result = mine_outliers(
                    probe_kind="delta_kl",
                    candidate_prompts=candidate_pool,
                    backend=backend,
                    top_k=top_k,
                    seed=seed,
                )
                payload = _outlier_result_to_yaml(result)

            # ``sort_keys=False`` keeps the fragment's keys in the order the
            # payload builders produced them.
            out_path.write_text(yaml.safe_dump(payload, sort_keys=False), encoding="utf-8")
            typer.echo(f"wrote {out_path}")
      
        999
        
        1000
        
        1001
        def _mine_paraphrase_payload(
            spec: Any,
            backend: Any,
            miner: Any,
            *,
            top_k: int,
            n_candidates: int,
            seed: int,
        ) -> dict[str, Any]:
            """Run paraphrase mining on every paraphrase_invariance entry; shape into YAML.

            Walks ``spec.suite``, and for each ``paraphrase_invariance`` entry
            calls ``miner`` once per case that has both a non-empty ``prompt``
            and a non-empty ``gold``; other cases are skipped. Entries whose
            ``cases`` key is absent or holds ``None`` contribute nothing.

            Returns ``{"mined_cases": [...]}`` where each item carries the seed
            prompt, the gold answer, the mined paraphrase prompts, and a
            ``_mining_meta`` block with the per-candidate gaps and the first
            candidate's verbatim lift (``None`` when nothing was mined), all
            rounded to six decimals.
            """
            out_cases: list[dict[str, Any]] = []
            for entry in spec.suite:
                if entry.get("kind") != "paraphrase_invariance":
                    continue
                # ``or ()`` also covers a ``cases`` key that is present but None,
                # which a ``.get`` default alone would not.
                for case in entry.get("cases") or ():
                    prompt = case.get("prompt")
                    gold = case.get("gold")
                    if not prompt or not gold:
                        continue
                    mined = miner(
                        prompt=prompt,
                        gold=gold,
                        backend=backend,
                        n_candidates=n_candidates,
                        top_k=top_k,
                        seed=seed,
                    )
                    candidates = mined.candidates
                    verbatim_lift = round(candidates[0].verbatim_lift, 6) if candidates else None
                    out_cases.append(
                        {
                            "prompt": mined.seed_prompt,
                            "gold": mined.gold,
                            "paraphrases": [c.prompt for c in candidates],
                            "_mining_meta": {
                                "top_gaps": [round(c.gap, 6) for c in candidates],
                                "verbatim_lift": verbatim_lift,
                            },
                        }
                    )
            return {"mined_cases": out_cases}
      
        1044
        
        1045
        
        1046
        def _collect_delta_kl_prompts(spec: Any) -> list[str]:
      
        1047
            """Pull every ``delta_kl`` entry's prompt pool into one flat list."""
      
        1048
            seen: set[str] = set()
      
        1049
            out: list[str] = []
      
        1050
            for entry in spec.suite:
      
        1051
                if entry.get("kind") != "delta_kl":
      
        1052
                    continue
      
        1053
                for p in entry.get("prompts", []):
      
        1054
                    if p not in seen:
      
        1055
                        seen.add(p)
      
        1056
                        out.append(p)
      
        1057
            return out
      
        1058
        
        1059
        
        1060
        def _outlier_result_to_yaml(result: Any) -> dict[str, Any]:
      
        1061
            """Format an :class:`OutlierResult` as a YAML-friendly dict."""
      
        1062
            return {
      
        1063
                "mined_outliers": {
      
        1064
                    "probe_kind": result.probe_kind,
      
        1065
                    "top": [
      
        1066
                        {"prompt": c.prompt, "raw": round(c.raw, 6), "index": c.index} for c in result.top
      
        1067
                    ],
      
        1068
                    "bottom": [
      
        1069
                        {"prompt": c.prompt, "raw": round(c.raw, 6), "index": c.index}
      
        1070
                        for c in result.bottom
      
        1071
                    ],
      
        1072
                }
      
        1073
            }
      
        1074
        
        1075
        
        1076
        # -- helpers -----------------------------------------------------------
      
        1077
        
        1078
        
        1079
        _BUILTIN_QUICK_PROMPTS: tuple[str, ...] = (
      
        1080
            "The quick brown fox",
      
        1081
            "Once upon a time",
      
        1082
            "The answer to the question is",
      
        1083
            "One important lesson is",
      
        1084
            "In my opinion,",
      
        1085
            "The first step is to",
      
        1086
            "Remember that",
      
        1087
            "A common mistake is",
      
        1088
        )
      
        1089
        
        1090
        
        1091
        def _load_prompts(path: Path) -> tuple[str, ...]:
      
        1092
            return tuple(
      
        1093
                line.strip() for line in path.read_text(encoding="utf-8").splitlines() if line.strip()
      
        1094
            )
      
        1095
        
        1096
        
        1097
        def _execute_spec(
            path: Path,
            *,
            weights_override: dict[str, float] | None = None,
            trace_path: Path | None = None,
        ) -> tuple[SuiteResult, SwayScore]:
            """Run the spec at ``path`` end to end and return ``(result, score)``.

            Steps, in order: load the spec, validate every probe entry, resolve
            the optional ``dlm_source`` bridge, build the backend, run the suite
            (the backend is always closed afterwards), then fold the result into
            a score. Shared by ``run`` and ``gate``.

            A truthy ``weights_override`` is used for scoring in place of
            ``spec.defaults.score_weights``; the CLI passes ``--weights k=v,k=v``
            through this parameter. ``trace_path`` is forwarded to the suite
            runner unchanged.
            """
            from dlm_sway.backends import build as build_backend
            from dlm_sway.backends import build_two_separate
            from dlm_sway.probes.base import validate_all_probes
            from dlm_sway.suite.loader import load_spec
            from dlm_sway.suite.runner import run as run_suite
            from dlm_sway.suite.score import compute as compute_score

            def _warn(message: str) -> None:
                # Non-fatal diagnostics go to stderr in yellow.
                typer.secho(message, err=True, fg=typer.colors.YELLOW)

            spec = load_spec(path)
            # B7: validate every probe entry before paying the cost of loading
            # a backend. A user with a typo in `kind:` shouldn't wait minutes
            # for the model to download just to learn they spelled the probe
            # name wrong.
            validate_all_probes(spec.suite)

            sections = doc_text = None
            if spec.dlm_source is not None:
                import importlib

                try:
                    resolver = importlib.import_module("dlm_sway.integrations.dlm.resolver")
                    handle = resolver.resolve_dlm(Path(spec.dlm_source))
                    sections = handle.sections
                    doc_text = handle.doc_text
                except ImportError:
                    # D8: don't silently swallow. The user wrote ``dlm_source``
                    # in their YAML expecting the bridge to populate sections;
                    # warn loudly so they know why downstream attribution
                    # probes are SKIPping.
                    _warn(
                        f"warning: spec sets dlm_source={spec.dlm_source!r} but the "
                        f"[dlm] extra is not installed — sections not provided "
                        f"(pip install 'dlm-sway[dlm]')"
                    )
                    sections = None
                except SwayError as exc:
                    # The bridge imported but failed (no adapter, malformed
                    # .dlm, etc). Same surface — warn, don't crash the suite.
                    _warn(f"warning: dlm_source={spec.dlm_source!r} did not resolve: {exc}")
                    sections = None

            # Differential mode builds one backend from the fine-tuned model
            # entry; otherwise a backend is built from the whole ``models`` block.
            backend: Any = (
                build_backend(spec.models.ft)
                if spec.defaults.differential
                else build_two_separate(spec.models)
            )
            try:
                result = run_suite(
                    spec,
                    backend,
                    spec_path=str(path),
                    sections=sections,
                    doc_text=doc_text,
                    trace_path=trace_path,
                )
            finally:
                # Release the backend whether the run succeeded or raised.
                _close_if_possible(backend)

            score_obj = compute_score(
                result,
                weights=weights_override or spec.defaults.score_weights,
            )
            return result, score_obj
      
        1174
        
        1175
        
        1176
        def _parse_weights_flag(raw: str | None) -> dict[str, float] | None:
      
        1177
            """Parse ``--weights k=v,k=v`` into a dict; pydantic validates on use.
      
        1178
        
        1179
            Returns ``None`` when the flag is empty / unset. Pydantic's
      
        1180
            ``SuiteDefaults._validate_weights`` is re-invoked indirectly via
      
        1181
            ``SwayScore`` — so any unknown category or negative value surfaces
      
        1182
            the same error whether set in YAML or on the command line.
      
        1183
            """
      
        1184
            if not raw:
      
        1185
                return None
      
        1186
            out: dict[str, float] = {}
      
        1187
            for pair in raw.split(","):
      
        1188
                pair = pair.strip()
      
        1189
                if not pair:
      
        1190
                    continue
      
        1191
                if "=" not in pair:
      
        1192
                    raise typer.BadParameter(
      
        1193
                        f"--weights: expected 'key=value' pairs, got {pair!r}. "
      
        1194
                        "Example: --weights adherence=0.4,attribution=0.3"
      
        1195
                    )
      
        1196
                key, _, value = pair.partition("=")
      
        1197
                key = key.strip()
      
        1198
                try:
      
        1199
                    out[key] = float(value.strip())
      
        1200
                except ValueError as exc:
      
        1201
                    raise typer.BadParameter(f"--weights: {value!r} for {key!r} is not a number") from exc
      
        1202
            return out or None
      
        1203
        
        1204
        
        1205
        def _close_if_possible(backend: object) -> None:
      
        1206
            close = getattr(backend, "close", None)
      
        1207
            if callable(close):
      
        1208
                close()
      
        1209
        
        1210
        
        1211
        # --- convert-adapter (S24, F01) ------------------------------------------
      
        1212
        
        1213
        
        1214
        class ConvertTarget(StrEnum):
      
        1215
            MLX = "mlx"
      
        1216
        
        1217
        
        1218
        def convert_adapter_cmd(
            src: Annotated[Path, typer.Argument(help="PEFT adapter directory to convert.")],
            dst: Annotated[Path, typer.Argument(help="Output directory for the converted adapter.")],
            target: Annotated[
                ConvertTarget,
                typer.Option("--target", help="Output adapter format. Currently only 'mlx'."),
            ] = ConvertTarget.MLX,
            overwrite: Annotated[
                bool,
                typer.Option("--overwrite", help="Replace any existing adapter at dst."),
            ] = False,
        ) -> None:
            """Convert a PEFT LoRA adapter to another backend's format.

            Today the only target is ``mlx`` — converts ``adapter_model.safetensors`` +
            ``adapter_config.json`` (PEFT) to ``adapters.safetensors`` +
            ``adapter_config.json`` (mlx-lm). Closes the F01 doc-vs-code gap so
            the MLX backend works on dlm-trained / any PEFT-trained adapters
            without manual conversion.
            """
            # The docstring above doubles as the command's --help text; keep
            # maintainer-only notes in comments like this one.
            from dlm_sway.backends._mlx_convert import MlxConvertError, convert_peft_to_mlx

            # Defensive: the enum has a single member today, so this only fires
            # if a new target is added without a matching converter.
            if target is not ConvertTarget.MLX:
                raise typer.BadParameter(f"unsupported target {target!r}")

            try:
                report = convert_peft_to_mlx(src, dst, overwrite=overwrite)
            except (MlxConvertError, SwayError) as exc:
                # Converter-specific and generic sway failures are reported the
                # same way: a red one-liner on stderr and exit status 1.
                typer.secho(f"convert-adapter: {exc}", fg=typer.colors.RED, err=True)
                raise typer.Exit(code=1) from exc

            # Summary line: sizes are reported in KiB-sized units.
            src_kb = report["src_bytes"] / 1024
            dst_kb = report["dst_bytes"] / 1024
            typer.echo(
                f"converted: {src} → {dst}  rank={report['rank']}  "
                f"scale={report['scale']:.3f}  num_keys={report['num_keys']}  "
                f"({src_kb:.1f} KB → {dst_kb:.1f} KB)"
            )

            if not report["modules_to_save_skipped"]:
                return
            # Some tensors were left out of the converted adapter; tell the user
            # how many and show one sample name.
            typer.secho(
                f"warning: {len(report['modules_to_save_skipped'])} modules_to_save tensor(s) "
                f"skipped (mlx-lm's LoRA loader doesn't apply full-weight overrides). "
                f"Sample: {report['modules_to_save_skipped'][:1]!r}",
                fg=typer.colors.YELLOW,
                err=True,
            )
      
        1266
        
        1267
        
        1268
        # --- pack / unpack (S26, X3) ----------------------------------------------
      
        1269
        
        1270
        
        1271
        def pack_cmd(
            spec_path: Annotated[Path, typer.Argument(help="Path to a sway.yaml to pack.")],
            out: Annotated[
                Path | None,
                typer.Option(
                    "--out",
                    "-o",
                    help="Output tarball path. Defaults to <spec-stem>.swaypack.tar.gz next to spec.",
                ),
            ] = None,
            include_golden: Annotated[
                Path | None,
                typer.Option(
                    "--include-golden",
                    help=(
                        "Path to a JSON sway-run report (from `sway run --json` or "
                        "`sway report --format json`) to bundle for verification."
                    ),
                ),
            ] = None,
            include_null_cache: Annotated[
                bool,
                typer.Option(
                    "--include-null-cache/--no-include-null-cache",
                    help=(
                        "Bundle ~/.dlm-sway/null-stats/*.json into the pack so the "
                        "consumer doesn't need to re-calibrate. Default: include."
                    ),
                ),
            ] = True,
            max_size_mb: Annotated[
                int,
                typer.Option(
                    "--max-size-mb",
                    help="Refuse to write a pack larger than this (default 50 MB).",
                ),
            ] = 50,
        ) -> None:
            """Bundle a spec + its inputs + null-stats cache into a portable swaypack tarball.

            The result is a single ``.swaypack.tar.gz`` you can share with a
            coworker or check into a release repo. The receiver runs
            ``sway unpack <pack>`` and then ``sway run`` against the unpacked
            spec — identical verdict to the original run, no live dlm or
            network needed.
            """
            # The docstring above doubles as the command's --help text; keep
            # maintainer-only notes in comments like this one.
            from dlm_sway.cli._pack import PackError, pack_spec

            if out is None:
                # Default: sibling of the spec, named after the spec with its
                # last suffix dropped (``sway.yaml`` -> ``sway.swaypack.tar.gz``).
                default_name = f"{spec_path.stem}.swaypack.tar.gz"
                out = spec_path.with_name(default_name)

            # The packer takes its size limit in bytes; the flag is in MiB-sized units.
            size_limit_bytes = max_size_mb * 1024 * 1024
            try:
                report = pack_spec(
                    spec_path,
                    out_path=out,
                    include_golden=include_golden,
                    include_null_cache=include_null_cache,
                    max_size_bytes=size_limit_bytes,
                )
            except (PackError, SwayError) as exc:
                # Pack-specific and generic sway failures are reported the same
                # way: a red one-liner on stderr and exit status 1.
                typer.secho(f"pack: {exc}", fg=typer.colors.RED, err=True)
                raise typer.Exit(code=1) from exc

            size_kb = report.size_bytes / 1024
            typer.echo(
                f"wrote {report.out_path} ({size_kb:.1f} KB)  "
                f"sections={report.section_bytes}b  "
                f"null_stats={report.null_stats_count}  "
                f"golden={'yes' if report.golden_included else 'no'}"
            )
      
        1346
        
        1347
        
        1348
        def unpack_cmd(
            pack_path: Annotated[Path, typer.Argument(help="Path to a *.swaypack.tar.gz.")],
            out: Annotated[
                Path | None,
                typer.Option(
                    "--out",
                    "-o",
                    help="Parent directory to extract into. Default: cwd.",
                ),
            ] = None,
        ) -> None:
            """Extract a swaypack into ``out``; print the next ``sway run`` invocation.

            The pack lands at ``<out>/swaypack/`` containing ``sway.yaml`` plus
            bundled artifacts. A ready-to-run command line is printed at the
            end including the ``SWAY_NULL_CACHE_DIR=...`` env var that
            redirects null-stats lookups at the bundled cache instead of the
            user's home directory.
            """
            # The docstring above doubles as the command's --help text; keep
            # maintainer-only notes in comments like this one.
            import shlex

            from dlm_sway.cli._unpack import UnpackError, unpack_swaypack

            target_dir = Path.cwd() if out is None else out
            try:
                report = unpack_swaypack(pack_path, target_dir=target_dir)
            except (UnpackError, SwayError) as exc:
                # Unpack-specific and generic sway failures are reported the
                # same way: a red one-liner on stderr and exit status 1.
                typer.secho(f"unpack: {exc}", fg=typer.colors.RED, err=True)
                raise typer.Exit(code=1) from exc

            typer.echo(f"extracted: {report.out_dir}")
            typer.echo(f"  spec_path: {report.spec_path}")
            typer.echo(f"  null_stats: {report.null_stats_dir or '(none in pack)'}")
            typer.echo(f"  swaypack_version: {report.manifest.get('swaypack_version')}")
            typer.echo(f"  packed_at: {report.manifest.get('packed_at')}")
            typer.echo("")
            typer.echo("To run the bundled spec:")

            # The suggested command is meant to be pasted into a shell, so the
            # paths are shell-quoted. ``shlex.quote`` leaves plain paths as they
            # are and only wraps those with spaces or metacharacters.
            spec_arg = shlex.quote(str(report.spec_path))
            if report.null_stats_dir is not None:
                cache_arg = shlex.quote(str(report.null_stats_dir))
                typer.echo(f"  SWAY_NULL_CACHE_DIR={cache_arg} sway run {spec_arg}")
            else:
                typer.echo(f"  sway run {spec_arg}")
      
        1390
        
        1391
        
        1392
        def serve_cmd(
            host: Annotated[
                str,
                typer.Option(
                    "--host",
                    help=(
                        "Interface to bind. Default 127.0.0.1 (localhost only). "
                        "Binding to 0.0.0.0 requires --api-key."
                    ),
                ),
            ] = "127.0.0.1",
            port: Annotated[
                int,
                typer.Option("--port", help="TCP port to bind."),
            ] = 8787,
            max_loaded_models: Annotated[
                int,
                typer.Option(
                    "--max-loaded-models",
                    help=(
                        "How many backends to keep warm in memory. Each loaded "
                        "model holds its own VRAM/RAM; default 2 fits a 16 GB GPU "
                        "with two ~1.5B fp16 adapters."
                    ),
                ),
            ] = 2,
            api_key: Annotated[
                str | None,
                typer.Option(
                    "--api-key",
                    help=(
                        "Bearer token required on every non-/health request. "
                        "Required when --host is not loopback."
                    ),
                ),
            ] = None,
            log_level: Annotated[
                str,
                typer.Option("--log-level", help="uvicorn log level."),
            ] = "info",
        ) -> None:
            """Run the warm-backend HTTP daemon (S36).

            First call loads the backend (~15s); subsequent calls reuse it
            (~2s). See ``sway run`` for the equivalent one-shot CLI.
            """
            # The docstring above doubles as the command's --help text; keep
            # maintainer-only notes in comments like this one.
            #
            # Exit status 2 is used for every pre-flight refusal below (missing
            # extra, unsafe bind, bad --max-loaded-models).
            try:
                import uvicorn  # noqa: F401  — presence check
            except ImportError as exc:
                typer.secho(
                    "sway serve requires the [serve] extra: pip install 'dlm-sway[serve]'",
                    fg=typer.colors.RED,
                    err=True,
                )
                raise typer.Exit(code=2) from exc

            from dlm_sway.serve.app import create_app, parse_host_port
            from dlm_sway.serve.cache import BackendCache

            # Public-bind safety — refuse before any uvicorn startup work.
            loopback = host in ("127.0.0.1", "::1", "localhost")
            # Test truthiness rather than ``is None``: an empty ``--api-key ""``
            # is not a usable credential (the banner below reports it as
            # ``auth=no``), so it must not satisfy the non-loopback requirement.
            if not loopback and not api_key:
                typer.secho(
                    f"refusing to bind {host}:{port} without --api-key. "
                    "Either pass --api-key <key> or use --host 127.0.0.1.",
                    fg=typer.colors.RED,
                    err=True,
                )
                raise typer.Exit(code=2)

            # Called for its checks only; the return value is not used here.
            parse_host_port(host, port)
            if max_loaded_models < 1:
                typer.secho("--max-loaded-models must be >= 1", fg=typer.colors.RED, err=True)
                raise typer.Exit(code=2)

            cache = BackendCache(max_size=max_loaded_models)
            app = create_app(cache=cache, api_key=api_key)

            typer.echo(f"sway serve {__version__} listening on http://{host}:{port}")
            typer.echo(f"  max_loaded_models={max_loaded_models}  auth={'yes' if api_key else 'no'}")
            if not loopback:
                typer.secho(
                    "  WARNING: bound to a non-loopback interface — anyone on "
                    "this network with the API key can drive your GPU.",
                    fg=typer.colors.YELLOW,
                )

            import uvicorn as _uvicorn

            # Blocks until the server shuts down.
            _uvicorn.run(app, host=host, port=port, log_level=log_level)