| 1 | """Report emitters: terminal (rich), JSON, JUnit XML, markdown. |
| 2 | |
| 3 | The terminal renderer is the one a user sees; it's the product surface. |
| 4 | It must communicate the verdict *and* the supporting evidence without |
| 5 | forcing the user to open the JSON. |
| 6 | |
| 7 | JSON is the machine-readable source of truth — same fields as the |
| 8 | :class:`SuiteResult` dataclass but flattened for easy downstream parsing |
| 9 | (dashboards, diff tools, history tracking). |
| 10 | |
| 11 | JUnit XML exists to drop into CI pipelines so ``sway gate`` |
| 12 | integrates with existing test dashboards with no extra glue. |
| 13 | """ |
| 14 | |
| 15 | from __future__ import annotations |
| 16 | |
| 17 | import json |
| 18 | import math |
| 19 | import re |
| 20 | import xml.etree.ElementTree as ET |
| 21 | from io import StringIO |
| 22 | from typing import Any |
| 23 | |
| 24 | from rich.console import Console |
| 25 | from rich.panel import Panel |
| 26 | from rich.table import Table |
| 27 | from rich.text import Text |
| 28 | |
| 29 | from dlm_sway.core.result import ProbeResult, SuiteResult, SwayScore, Verdict |
| 30 | from dlm_sway.probes._zscore import format_z_profile |
| 31 | |
#: Rich style string applied to each verdict cell in the per-probe
#: terminal table — a single shared lookup keeps verdict coloring
#: consistent wherever a verdict is rendered.
_VERDICT_STYLE = {
    Verdict.PASS: "bold green",
    Verdict.FAIL: "bold red",
    Verdict.WARN: "bold yellow",
    Verdict.SKIP: "dim",
    Verdict.ERROR: "bold magenta",
}

#: Sentinel character all renderers use for "no numeric value available."
#: Single source prevents drift between surfaces (terminal vs markdown
#: vs JSON downstream consumers that copy the rendered strings).
_NONE_GLYPH = "—"
| 44 | |
| 45 | |
| 46 | # -- unified number formatters (S06.10) -------------------------------- |
| 47 | # |
| 48 | # Every surface that prints a number routes through one of these. A |
| 49 | # tests snapshot that locks report output catches any drift before it |
| 50 | # ships. Typing is wide (``float | int | None``) so callers don't have |
| 51 | # to special-case ``None`` at every site. |
| 52 | |
| 53 | |
| 54 | def format_score(v: float | int | None) -> str: |
| 55 | """Two-decimal score, ``—`` when missing or non-finite.""" |
| 56 | if v is None or not math.isfinite(float(v)): |
| 57 | return _NONE_GLYPH |
| 58 | return f"{float(v):.2f}" |
| 59 | |
| 60 | |
| 61 | def format_raw(v: float | int | None) -> str: |
| 62 | """Three-decimal raw metric, ``—`` when missing or non-finite. |
| 63 | |
| 64 | Uses thousands separators at magnitude ≥ 1 000 so half-life outputs |
| 65 | from ``prompt_collapse`` don't render as ``1945.473`` (hard to eyeball). |
| 66 | """ |
| 67 | if v is None or not math.isfinite(float(v)): |
| 68 | return _NONE_GLYPH |
| 69 | return f"{float(v):,.3f}" |
| 70 | |
| 71 | |
| 72 | def format_z(v: float | int | None) -> str: |
| 73 | """Signed z-score with ``σ`` suffix and thousands separator, ``—`` on None.""" |
| 74 | if v is None or not math.isfinite(float(v)): |
| 75 | return _NONE_GLYPH |
| 76 | return f"{float(v):+,.2f}σ" |
| 77 | |
| 78 | |
| 79 | def format_ci(ci: tuple[float, float] | None) -> str: |
| 80 | """Percentile-bootstrap 95% CI as ``[lo, hi]``; ``—`` on None / non-finite.""" |
| 81 | if ci is None: |
| 82 | return _NONE_GLYPH |
| 83 | lo, hi = ci |
| 84 | if not (math.isfinite(float(lo)) and math.isfinite(float(hi))): |
| 85 | return _NONE_GLYPH |
| 86 | return f"[{float(lo):.3f}, {float(hi):.3f}]" |
| 87 | |
| 88 | |
| 89 | def _message_with_rank_profile(r: ProbeResult) -> str: |
| 90 | """Append the per-rank z-profile to a probe's message when present. |
| 91 | |
| 92 | Renders as ``"<message> | rank profile: +4.2σ @ 1x / +6.8σ @ 0.5x"``. |
| 93 | When the probe didn't run under multi-rank calibration (``z_by_rank`` |
| 94 | is ``None`` or has a single rank), returns the message unchanged. |
| 95 | """ |
| 96 | base = r.message or "" |
| 97 | z_by_rank = r.evidence.get("z_by_rank") |
| 98 | if not z_by_rank or len(z_by_rank) < 2: |
| 99 | return base |
| 100 | profile = format_z_profile(z_by_rank) |
| 101 | if not profile: |
| 102 | return base |
| 103 | return f"{base} | rank profile: {profile}" if base else f"rank profile: {profile}" |
| 104 | |
| 105 | |
| 106 | def format_duration_s(v: float | int | None) -> str: |
| 107 | """Wall-time display. ``1.23s`` for sub-second, ``12.3s`` above 10, ``—`` on None.""" |
| 108 | if v is None or not math.isfinite(float(v)): |
| 109 | return _NONE_GLYPH |
| 110 | f = float(v) |
| 111 | if f < 10.0: |
| 112 | return f"{f:.2f}s" |
| 113 | if f < 100.0: |
| 114 | return f"{f:.1f}s" |
| 115 | return f"{f:,.0f}s" |
| 116 | |
| 117 | |
| 118 | # -- extras-rollup helpers (S06.6) ------------------------------------- |
| 119 | |
#: Captures the bracketed extra name from ``install the [X] extra`` hint
#: text (case-insensitive); group 1 is the extra name. Consumed by
#: :func:`collect_missing_extras`.
_MISSING_EXTRA_RE = re.compile(r"install the \[([^\]]+)\] extra", re.IGNORECASE)
| 121 | |
| 122 | |
def collect_missing_extras(suite: SuiteResult) -> list[str]:
    """Harvest extra names from SKIP messages saying ``install the [X] extra``.

    ``BackendNotAvailableError`` embeds ``install the [<extra>] extra``
    in its message, so the extra names that would unskip probes can be
    scraped straight out of skipped probes' messages without wiring a
    dedicated field through. Result is deduplicated and sorted.
    """
    names = {
        m.group(1)
        for p in suite.probes
        if p.verdict == Verdict.SKIP and p.message
        for m in _MISSING_EXTRA_RE.finditer(p.message)
    }
    return sorted(names)
| 138 | |
| 139 | |
def collect_degenerate_null_kinds(suite: SuiteResult) -> list[str]:
    """Probe kinds whose null-calibration stats were flagged degenerate.

    ``null_adapter`` marks a kind's stats with ``degenerate: 1.0`` when
    the calibration ran but the baseline was too narrow for the z-score
    path to fire (``runs: 1``, or a multi-seed run whose raws collapsed
    to an effectively-zero variance — F02 from Audit 03). Unlike
    :func:`collect_null_opt_outs` (which surfaces probes that opted
    out at spec-build time), this surface catches the case where the
    null *did* run but wasn't useful. Both cases fall back to fixed
    thresholds; the report distinguishes them so users can act:
    ``opt_out`` → expected for probes like ``adapter_revert``;
    ``degenerate`` → bump ``runs:`` in the spec.
    """
    # Only report when a null_adapter probe actually ran in this suite —
    # the rollup is meaningless without a calibration probe present.
    if not any(p.kind == "null_adapter" for p in suite.probes):
        return []
    # ``null_adapter`` writes per-kind stats into ``SuiteResult.null_stats``
    # (the canonical place the runner threads calibration across probes),
    # not the probe's evidence — so scan the suite-level field exactly
    # once. The previous version re-ran this identical scan once per
    # null_adapter probe; the inner work never depended on the probe.
    found: set[str] = set()
    for kind, kind_stats in (suite.null_stats or {}).items():
        if isinstance(kind_stats, dict) and kind_stats.get("degenerate", 0.0) >= 0.5:
            found.add(kind)
    return sorted(found)
| 169 | |
| 170 | |
def collect_null_opt_outs(suite: SuiteResult) -> list[str]:
    """Probe kinds that opted out of null calibration.

    ``null_adapter`` publishes ``evidence["skipped_kinds"]`` listing the
    probe kinds whose ``calibrate_spec`` returned ``None`` (e.g.
    ``adapter_revert`` — no embedder on the null proxy;
    ``prompt_collapse`` — noise can't fit an exponential decay).
    Returns those kinds deduplicated and sorted, or an empty list when
    no null_adapter ran in the suite.
    """
    kinds: set[str] = set()
    for probe in suite.probes:
        if probe.kind != "null_adapter":
            continue
        for entry in probe.evidence.get("skipped_kinds") or ():
            # Defensive: only accept string kind names from evidence.
            if isinstance(entry, str):
                kinds.add(entry)
    return sorted(kinds)
| 192 | |
| 193 | |
def to_terminal(suite: SuiteResult, score: SwayScore, *, console: Console | None = None) -> None:
    """Render the report to a rich Console (stdout by default).

    Layout, top to bottom: header panel (base model vs adapter label),
    overall score + band, weighted component breakdown, per-probe
    detail table, top findings, three actionable rollup footers
    (missing extras, null-calibration opt-outs, degenerate null
    baselines), and a dim wall-time / version / determinism / cache
    footer line.

    ``console`` is injectable so tests can capture output; ``None``
    constructs a default stdout-backed :class:`rich.console.Console`.
    """
    c = console or Console()

    # Header: base model vs (possibly truncated/quoted) adapter label.
    header = Text.assemble(
        ("sway report — ", "bold"),
        (suite.base_model_id, "cyan"),
        (" vs ", "dim"),
        (_adapter_label(suite.adapter_id), "cyan"),
    )
    c.print(Panel(header, expand=False, border_style="blue"))

    c.print()
    c.print(
        Text.assemble(
            ("overall: ", "bold"),
            (format_score(score.overall), _score_style(score.overall)),
            (" ", ""),
            (f"[ {score.band} ]", _band_style(score.band)),
        )
    )

    # Component breakdown. Order matches ``DEFAULT_COMPONENT_WEIGHTS``
    # (the extensibility point) and appends any categories present in
    # ``score.components`` but not in the default weights — so a custom
    # Probe subclass with a new category still renders.
    comp_table = Table.grid(padding=(0, 2))
    comp_table.add_column(justify="left")
    comp_table.add_column(justify="right")
    comp_table.add_column()
    comp_table.add_column(style="dim")
    for cat in _category_order(score):
        if cat not in score.components:
            continue
        v = score.components[cat]
        weight = score.weights.get(cat, 0.0)
        # S03 / B18: a zero-weight category contributes nothing to the
        # composite; label explicitly so users don't mistake the visible
        # bar for judgment.
        label = "(informational, weight=0)" if weight == 0.0 else ""
        comp_table.add_row(cat, format_score(v), _bar(v), label)
    c.print(comp_table)

    c.print()
    # Per-probe detail.
    detail = Table(show_header=True, header_style="bold", box=None, padding=(0, 1))
    detail.add_column("name", style="cyan")
    detail.add_column("kind", style="dim")
    detail.add_column("verdict")
    detail.add_column("score", justify="right")
    detail.add_column("raw", justify="right")
    detail.add_column("ci95", justify="right", style="dim")
    detail.add_column("z", justify="right")
    # D15: let Rich wrap long messages instead of hard-truncating at 80
    # chars with an ellipsis. ``overflow="fold"`` + ``no_wrap=False``
    # preserves the full text across multiple terminal lines.
    detail.add_column("note", style="dim", overflow="fold", no_wrap=False)
    for r in suite.probes:
        detail.add_row(
            r.name,
            r.kind,
            Text(r.verdict.value, style=_VERDICT_STYLE[r.verdict]),
            format_score(r.score),
            format_raw(r.raw),
            format_ci(r.ci_95),
            format_z(r.z_score),
            Text(_message_with_rank_profile(r)),
        )
    c.print(detail)

    if score.findings:
        c.print()
        c.print(Text("top findings:", style="bold"))
        for i, f in enumerate(score.findings, start=1):
            c.print(f" {i}. {f}")

    # D3: missing-extras rollup. When probes SKIPped because their
    # backend extras aren't installed, collapse the hints into one
    # actionable footer rather than forcing the user to scan per-row.
    extras = collect_missing_extras(suite)
    if extras:
        c.print()
        skipped_ct = sum(1 for p in suite.probes if p.verdict == Verdict.SKIP)
        c.print(
            Text(
                f"{skipped_ct} probe(s) skipped due to missing extras: "
                f"pip install 'dlm-sway[{','.join(extras)}]'",
                style="dim",
            )
        )

    # F15: null-calibration opt-outs rollup. Probes whose
    # ``calibrate_spec`` returns ``None`` fall back to fixed-threshold
    # verdicts. Surface the list in the footer so users understand
    # why those rows read ``(no calibration)`` in the message column.
    opt_outs = collect_null_opt_outs(suite)
    if opt_outs:
        c.print()
        c.print(
            Text(
                f"{len(opt_outs)} probe(s) opted out of null calibration "
                f"(using fixed thresholds): {', '.join(opt_outs)}",
                style="dim",
            )
        )

    # F02 (Audit 03): null-calibration-degenerate rollup. Distinct from
    # opt-outs — the null *did* run, but its baseline was too narrow
    # (``runs: 1`` or coincidentally-identical seeds). Users see this
    # and bump ``runs:`` in the spec; the fix is actionable.
    degenerate = collect_degenerate_null_kinds(suite)
    if degenerate:
        c.print()
        c.print(
            Text(
                f"{len(degenerate)} probe kind(s) had a degenerate null "
                f"baseline (std ≈ 0, insufficient for z-scoring): "
                f"{', '.join(degenerate)} — bump ``runs:`` in null_adapter spec.",
                style="dim",
            )
        )

    c.print()
    # Footer: wall time + version, plus optional determinism class and
    # cache hit-rate segments, joined with " | ".
    footer_parts = [f"wall: {format_duration_s(suite.wall_seconds)}", f"sway {suite.sway_version}"]
    if suite.determinism is not None:
        footer_parts.append(f"det: {suite.determinism.class_} (seed={suite.determinism.seed})")
    cache_line = _cache_line(suite)
    if cache_line is not None:
        footer_parts.append(cache_line)
    c.print(Text(" | ".join(footer_parts), style="dim"))
| 324 | |
| 325 | |
def to_json(suite: SuiteResult, score: SwayScore) -> str:
    """Serialize the suite + composite score as JSON.

    Stable schema; downstream tools rely on it. The payload carries a
    ``schema_version`` field (currently ``1``, set in
    :func:`_to_jsonable`) — breaking changes bump it. Keys are sorted
    and the output indented for diff-friendly snapshots.

    (Previous docstring claimed ``schema_version`` was "not yet
    present", which contradicted the emitted payload — corrected.)
    """
    return json.dumps(_to_jsonable(suite, score), indent=2, sort_keys=True)
| 333 | |
| 334 | |
def _to_jsonable(suite: SuiteResult, score: SwayScore) -> dict[str, Any]:
    """Flatten the suite + score into plain JSON-serializable types."""
    det = suite.determinism
    det_payload: dict[str, Any] | None = (
        None
        if det is None
        else {"class": det.class_, "seed": det.seed, "notes": list(det.notes)}
    )
    return {
        "schema_version": 1,
        "sway_version": suite.sway_version,
        "spec_path": suite.spec_path,
        "base_model_id": suite.base_model_id,
        "adapter_id": suite.adapter_id,
        "started_at": suite.started_at.isoformat(),
        "finished_at": suite.finished_at.isoformat(),
        "wall_seconds": suite.wall_seconds,
        "determinism": det_payload,
        "backend_stats": dict(suite.backend_stats) if suite.backend_stats else {},
        "score": {
            "overall": score.overall,
            "band": score.band,
            "components": score.components,
            "weights": score.weights,
            "findings": list(score.findings),
        },
        "null_stats": suite.null_stats,
        "probes": [_probe_to_jsonable(p) for p in suite.probes],
    }
| 364 | |
| 365 | |
| 366 | def _probe_to_jsonable(r: ProbeResult) -> dict[str, Any]: |
| 367 | return { |
| 368 | "name": r.name, |
| 369 | "kind": r.kind, |
| 370 | "verdict": r.verdict.value, |
| 371 | "score": r.score, |
| 372 | "raw": r.raw, |
| 373 | "z_score": r.z_score, |
| 374 | "base_value": r.base_value, |
| 375 | "ft_value": r.ft_value, |
| 376 | "evidence": r.evidence, |
| 377 | "message": r.message, |
| 378 | "duration_s": r.duration_s, |
| 379 | # S14: bootstrap 95% CI on ``raw``. Serialized as a two-list |
| 380 | # [lo, hi] so JSON stays tuple-free (match numpy convention). |
| 381 | "ci_95": list(r.ci_95) if r.ci_95 is not None else None, |
| 382 | } |
| 383 | |
| 384 | |
def from_json(raw: dict[str, Any]) -> tuple[SuiteResult, SwayScore]:
    """Rebuild a ``(SuiteResult, SwayScore)`` pair from saved JSON.

    Inverse of :func:`to_json` for the fields the renderers consume.
    Missing fields are tolerated — older snapshots predate
    ``determinism`` and ``schema_version`` — so this helper stays
    backward-compatible by default. ``sway report --format X`` uses
    this so all four formats (terminal / md / junit / json) flow
    through the same renderers as a fresh ``sway run`` (B16).
    """
    from datetime import datetime

    from dlm_sway.core.result import (
        DEFAULT_COMPONENT_WEIGHTS,
        DeterminismReport,
        ProbeResult,
        SuiteResult,
        SwayScore,
        Verdict,
    )

    def _parse_ts(value: str | None) -> datetime:
        # Snapshots that predate the field get a well-defined epoch so
        # wall-time displays as 0.00s instead of raising.
        if not value:
            return datetime.fromtimestamp(0).astimezone()
        return datetime.fromisoformat(value)

    def _parse_ci(value: Any) -> tuple[float, float] | None:
        if value is None:
            return None
        try:
            lo, hi = value
            return (float(lo), float(hi))
        except (TypeError, ValueError):
            # Malformed CI payloads degrade to "no CI" rather than fail.
            return None

    def _parse_probe(p: dict[str, Any]) -> ProbeResult:
        return ProbeResult(
            name=p["name"],
            kind=p["kind"],
            verdict=Verdict(p["verdict"]),
            score=p.get("score"),
            raw=p.get("raw"),
            z_score=p.get("z_score"),
            base_value=p.get("base_value"),
            ft_value=p.get("ft_value"),
            evidence=dict(p.get("evidence") or {}),
            message=p.get("message", ""),
            duration_s=float(p.get("duration_s", 0.0)),
            ci_95=_parse_ci(p.get("ci_95")),
        )

    det_raw = raw.get("determinism")
    det = (
        DeterminismReport(
            class_=det_raw.get("class", "best_effort"),
            seed=int(det_raw.get("seed", 0)),
            notes=tuple(det_raw.get("notes") or ()),
        )
        if isinstance(det_raw, dict)
        else None
    )

    suite = SuiteResult(
        spec_path=raw.get("spec_path", ""),
        started_at=_parse_ts(raw.get("started_at")),
        finished_at=_parse_ts(raw.get("finished_at")),
        base_model_id=raw.get("base_model_id", ""),
        adapter_id=raw.get("adapter_id", ""),
        sway_version=raw.get("sway_version", "?"),
        probes=tuple(_parse_probe(p) for p in raw.get("probes", [])),
        null_stats=dict(raw.get("null_stats") or {}),
        determinism=det,
        backend_stats=dict(raw.get("backend_stats") or {}),
    )

    score_raw: dict[str, Any] = raw.get("score") or {}
    score = SwayScore(
        overall=float(score_raw.get("overall", 0.0)),
        components=dict(score_raw.get("components") or {}),
        weights=dict(score_raw.get("weights") or DEFAULT_COMPONENT_WEIGHTS),
        band=score_raw.get("band", ""),
        findings=tuple(score_raw.get("findings") or ()),
    )
    return suite, score
| 471 | |
| 472 | |
def to_junit(suite: SuiteResult, score: SwayScore) -> str:
    """Serialize as JUnit XML — one ``<testcase>`` per probe.

    Suite-level counts (tests/failures/errors/skipped) go on the
    ``<testsuite>`` element; the composite score and per-category
    breakdown ride along as ``<property>`` entries for dashboards
    that surface them.
    """
    verdicts = [p.verdict for p in suite.probes]
    root = ET.Element(
        "testsuite",
        {
            "name": "sway",
            "tests": str(len(suite.probes)),
            "failures": str(verdicts.count(Verdict.FAIL)),
            "errors": str(verdicts.count(Verdict.ERROR)),
            "skipped": str(verdicts.count(Verdict.SKIP)),
            "time": f"{suite.wall_seconds:.3f}",
        },
    )
    props = ET.SubElement(root, "properties")
    ET.SubElement(props, "property", {"name": "overall", "value": f"{score.overall:.4f}"})
    ET.SubElement(props, "property", {"name": "band", "value": score.band})
    for cat, comp in score.components.items():
        ET.SubElement(props, "property", {"name": f"component.{cat}", "value": f"{comp:.4f}"})

    # Non-PASS verdicts map onto the JUnit child element of the same
    # meaning, with a fallback message when the probe gave none.
    child_for = {
        Verdict.FAIL: ("failure", "failed"),
        Verdict.ERROR: ("error", "errored"),
        Verdict.SKIP: ("skipped", "skipped"),
    }
    for probe in suite.probes:
        case = ET.SubElement(
            root,
            "testcase",
            {"classname": probe.kind, "name": probe.name, "time": f"{probe.duration_s:.3f}"},
        )
        mapped = child_for.get(probe.verdict)
        if mapped is not None:
            tag, fallback = mapped
            ET.SubElement(case, tag, {"message": probe.message or fallback})

    return ET.tostring(root, encoding="unicode")
| 507 | |
| 508 | |
def to_markdown(suite: SuiteResult, score: SwayScore) -> str:
    """A portable, CI-friendly markdown report, returned as one string.

    The single source of the markdown emit (B16): both
    ``sway run --markdown`` and ``sway report --format md`` route
    through this function. No second ``_render_markdown_from_json``.

    Sections, in order: header metadata, component table, per-probe
    table (D9 parity with the terminal table), top findings, rollup
    sections (missing extras, null opt-outs, degenerate nulls), and a
    per-cluster ``cluster_kl`` breakdown when those probes carry
    evidence.
    """
    buf = StringIO()
    buf.write("# sway report\n\n")
    buf.write(f"**Overall:** {format_score(score.overall)} (`{score.band}`) \n")
    buf.write(f"**Base:** `{suite.base_model_id}` \n")
    buf.write(f"**Adapter:** `{_adapter_label(suite.adapter_id)}` \n")
    buf.write(f"**Wall:** {format_duration_s(suite.wall_seconds)} \n")
    if suite.determinism is not None:
        buf.write(
            f"**Determinism:** `{suite.determinism.class_}` (seed={suite.determinism.seed}) \n"
        )
    cache_line = _cache_line(suite)
    if cache_line is not None:
        buf.write(f"**Backend:** {cache_line} \n")
    buf.write("\n")

    buf.write("## Components\n\n")
    buf.write("| category | score | weight | |\n|---|---:|---:|---|\n")
    for cat in _category_order(score):
        if cat not in score.components:
            continue
        v = score.components[cat]
        weight = score.weights.get(cat, 0.0)
        # Mirrors the terminal's zero-weight labeling (S03 / B18).
        label = "(informational, weight=0)" if weight == 0.0 else ""
        buf.write(f"| {cat} | {format_score(v)} | {format_score(weight)} | {label} |\n")

    # D9: markdown must reach parity with the terminal table — raw,
    # z_score, duration_s all shown. Findings are appended as a section
    # below so CI log consumers can see them without opening the JSON.
    buf.write("\n## Probes\n\n")
    buf.write(
        "| name | kind | verdict | score | raw | ci95 | z | duration | note |\n"
        "|---|---|---|---:|---:|---:|---:|---:|---|\n"
    )
    for r in suite.probes:
        # Escape pipes in messages so markdown doesn't treat them as
        # column separators. Leading/trailing whitespace collapsed.
        note = _message_with_rank_profile(r).replace("|", "\\|").replace("\n", " ").strip()
        buf.write(
            f"| {r.name} | `{r.kind}` | {r.verdict.value} | "
            f"{format_score(r.score)} | {format_raw(r.raw)} | "
            f"{format_ci(r.ci_95)} | {format_z(r.z_score)} | "
            f"{format_duration_s(r.duration_s)} | {note} |\n"
        )

    if score.findings:
        buf.write("\n## Top findings\n\n")
        for f in score.findings:
            buf.write(f"- {f}\n")

    # D3: missing-extras rollup.
    extras = collect_missing_extras(suite)
    if extras:
        skipped_ct = sum(1 for p in suite.probes if p.verdict == Verdict.SKIP)
        buf.write("\n## Skipped probes\n\n")
        buf.write(f"{skipped_ct} probe(s) skipped due to missing extras. Install with:\n\n")
        buf.write(f"```\npip install 'dlm-sway[{','.join(extras)}]'\n```\n")

    # F15: null-calibration opt-outs rollup.
    opt_outs = collect_null_opt_outs(suite)
    if opt_outs:
        buf.write("\n## Null-calibration opt-outs\n\n")
        buf.write(
            f"{len(opt_outs)} probe(s) fall back to fixed thresholds because "
            f"their `calibrate_spec` returns `None`:\n\n"
        )
        for kind in opt_outs:
            buf.write(f"- `{kind}`\n")

    # F02 (Audit 03) — degenerate null-calibration rollup.
    degenerate = collect_degenerate_null_kinds(suite)
    if degenerate:
        buf.write("\n## Degenerate null calibration\n\n")
        buf.write(
            f"{len(degenerate)} probe kind(s) ran null_adapter but the "
            f"resulting baseline was too narrow for z-scoring "
            f"(std ≈ 0, typically `runs: 1` or coincidentally-matched "
            f"seeds). Fix: bump `runs:` in the `null_adapter` spec "
            f"entry. Affected kinds:\n\n"
        )
        for kind in degenerate:
            buf.write(f"- `{kind}`\n")

    # F07 — cluster_kl sub-line: expand the per-cluster breakdown so
    # the reader can answer "which topic moved?" without cracking open
    # the JSON. The row itself already carries ``k=N, spec=X.XX`` in
    # the message; this section adds the per-cluster mean KL + top
    # exemplars.
    ck_probes = [p for p in suite.probes if p.kind == "cluster_kl" and p.evidence]
    if ck_probes:
        buf.write("\n## Cluster breakdown (cluster_kl)\n\n")
        for p in ck_probes:
            per_cluster = p.evidence.get("per_cluster_mean_kl", [])
            sizes = p.evidence.get("per_cluster_size", [])
            exemplars = p.evidence.get("cluster_exemplars", [])
            buf.write(f"### `{p.name}`\n\n")
            buf.write("| cluster | size | mean KL | exemplars |\n")
            buf.write("|---:|---:|---:|---|\n")
            # ``strict=False``: the three evidence lists may differ in
            # length; zip stops at the shortest rather than raising.
            for i, (mean, size, ex) in enumerate(zip(per_cluster, sizes, exemplars, strict=False)):
                # NOTE(review): literal "—" here rather than _NONE_GLYPH —
                # consider routing through the shared sentinel.
                mean_str = "—" if not isinstance(mean, int | float) else f"{mean:.3f}"
                ex_str = "; ".join(e.replace("|", "\\|") for e in (ex or [])) or "—"
                buf.write(f"| {i} | {size} | {mean_str} | {ex_str} |\n")
            buf.write("\n")

    return buf.getvalue()
| 620 | |
| 621 | |
| 622 | # -- helpers ----------------------------------------------------------- |
| 623 | |
| 624 | |
def _category_order(score: SwayScore) -> list[str]:
    """Unified render order for component categories.

    Priority order:

    1. Keys of :data:`core.result.DEFAULT_COMPONENT_WEIGHTS` — the
       canonical category list every first-party probe slots into.
    2. Any category in ``score.components`` missing from the default
       weights — so a custom :class:`Probe` subclass declaring a
       brand-new category still renders (F16).

    Terminal and markdown renderers share this so future category
    additions flow through both surfaces without a second code path.
    """
    from dlm_sway.core.result import DEFAULT_COMPONENT_WEIGHTS

    canonical = list(DEFAULT_COMPONENT_WEIGHTS)
    extras = [cat for cat in score.components if cat not in DEFAULT_COMPONENT_WEIGHTS]
    return canonical + extras
| 644 | |
| 645 | |
| 646 | def _cache_line(suite: SuiteResult) -> str | None: |
| 647 | """Format the cache-hit-rate footer line, or ``None`` when no stats. |
| 648 | |
| 649 | S23 — suffixes a ``batches: N (avg=K)`` segment when the suite |
| 650 | fired any batched forward calls. Runs that only use single-prompt |
| 651 | scoring (older probes, opt-out probes) render the cache line |
| 652 | alone, preserving pre-S23 footer shape. |
| 653 | """ |
| 654 | stats = suite.backend_stats |
| 655 | if not stats: |
| 656 | return None |
| 657 | hits = int(stats.get("cache_hits", 0)) |
| 658 | misses = int(stats.get("cache_misses", 0)) |
| 659 | total = hits + misses |
| 660 | if total == 0: |
| 661 | return None |
| 662 | pct = 100.0 * hits / total |
| 663 | line = f"cache: {hits}/{total} = {pct:.0f}%" |
| 664 | batches = int(stats.get("batches_sent", 0)) |
| 665 | if batches > 0: |
| 666 | avg = float(stats.get("avg_batch_size", 0.0)) |
| 667 | line = f"{line} | batches: {batches} (avg={avg:.1f})" |
| 668 | return line |
| 669 | |
| 670 | |
| 671 | def _adapter_label(adapter_id: str) -> str: |
| 672 | """Truncate the adapter path for display; quote when whitespace is present. |
| 673 | |
| 674 | D14: a path containing spaces (``/Users/me/My Adapters/v1``) was |
| 675 | rendering ambiguously in the header. Quote it whenever any |
| 676 | whitespace appears so the trailing path is unmistakable. |
| 677 | """ |
| 678 | if not adapter_id: |
| 679 | return "(base only)" |
| 680 | parts = adapter_id.rstrip("/").split("/") |
| 681 | label = "/".join(parts[-3:]) if len(parts) > 3 else adapter_id |
| 682 | if any(ch.isspace() for ch in label): |
| 683 | # Use double quotes so the result drops cleanly into a CLI |
| 684 | # invocation if a user copy-pastes it. |
| 685 | return f'"{label}"' |
| 686 | return label |
| 687 | |
| 688 | |
| 689 | def _score_style(v: float) -> str: |
| 690 | if v >= 0.6: |
| 691 | return "bold green" |
| 692 | if v >= 0.3: |
| 693 | return "bold yellow" |
| 694 | return "bold red" |
| 695 | |
| 696 | |
| 697 | def _band_style(band: str) -> str: |
| 698 | return { |
| 699 | "noise": "red", |
| 700 | "partial": "yellow", |
| 701 | "healthy": "green", |
| 702 | "suspicious": "magenta", |
| 703 | }.get(band, "white") |
| 704 | |
| 705 | |
| 706 | def _bar(v: float, *, width: int = 10) -> str: |
| 707 | clamped = max(0.0, min(1.0, v)) |
| 708 | filled = int(round(clamped * width)) |
| 709 | return "█" * filled + "░" * (width - filled) |
| 710 | |
| 711 | |
# Public API. ``format_ci`` was previously missing even though it is a
# public formatter exactly like ``format_raw`` / ``format_z``; added so
# ``from ... import *`` and API docs pick it up. Kept sorted.
__all__ = [
    "collect_degenerate_null_kinds",
    "collect_missing_extras",
    "collect_null_opt_outs",
    "format_ci",
    "format_duration_s",
    "format_raw",
    "format_score",
    "format_z",
    "from_json",
    "to_json",
    "to_junit",
    "to_markdown",
    "to_terminal",
]