1 """Report emitters: terminal (rich), JSON, JUnit XML, markdown.
2
3 The terminal renderer is the one a user sees; it's the product surface.
4 It must communicate the verdict *and* the supporting evidence without
5 forcing the user to open the JSON.
6
7 JSON is the machine-readable source of truth — same fields as the
8 :class:`SuiteResult` dataclass but flattened for easy downstream parsing
9 (dashboards, diff tools, history tracking).
10
11 JUnit XML exists to drop into CI pipelines so ``sway gate``
12 integrates with existing test dashboards with no extra glue.
13 """
14
15 from __future__ import annotations
16
17 import json
18 import math
19 import re
20 import xml.etree.ElementTree as ET
21 from io import StringIO
22 from typing import Any
23
24 from rich.console import Console
25 from rich.panel import Panel
26 from rich.table import Table
27 from rich.text import Text
28
29 from dlm_sway.core.result import ProbeResult, SuiteResult, SwayScore, Verdict
30 from dlm_sway.probes._zscore import format_z_profile
31
#: Rich style string per verdict; consumed by the terminal detail table
#: when rendering the ``verdict`` column (see ``to_terminal``).
_VERDICT_STYLE: dict[Verdict, str] = {
    Verdict.PASS: "bold green",
    Verdict.FAIL: "bold red",
    Verdict.WARN: "bold yellow",
    Verdict.SKIP: "dim",
    Verdict.ERROR: "bold magenta",
}

#: Sentinel character all renderers use for "no numeric value available."
#: Single source prevents drift between surfaces (terminal vs markdown
#: vs JSON downstream consumers that copy the rendered strings).
_NONE_GLYPH: str = "—"
44
45
46 # -- unified number formatters (S06.10) --------------------------------
47 #
48 # Every surface that prints a number routes through one of these. A
49 # tests snapshot that locks report output catches any drift before it
50 # ships. Typing is wide (``float | int | None``) so callers don't have
51 # to special-case ``None`` at every site.
52
53
def format_score(v: float | int | None) -> str:
    """Render a score to two decimals; ``—`` for missing or non-finite values."""
    if v is not None:
        value = float(v)
        if math.isfinite(value):
            return f"{value:.2f}"
    return _NONE_GLYPH
59
60
def format_raw(v: float | int | None) -> str:
    """Three-decimal raw metric with thousands separators; ``—`` when absent.

    The separator keeps large magnitudes (e.g. ``prompt_collapse``
    half-lives) readable: ``1,945.473`` rather than ``1945.473``.
    """
    if v is not None:
        value = float(v)
        if math.isfinite(value):
            return f"{value:,.3f}"
    return _NONE_GLYPH
70
71
def format_z(v: float | int | None) -> str:
    """Signed, comma-grouped z-score with a ``σ`` suffix; ``—`` when absent."""
    if v is None:
        return _NONE_GLYPH
    z = float(v)
    return f"{z:+,.2f}σ" if math.isfinite(z) else _NONE_GLYPH
77
78
def format_ci(ci: tuple[float, float] | None) -> str:
    """Render a percentile-bootstrap 95% CI as ``[lo, hi]``; ``—`` on None/non-finite."""
    if ci is None:
        return _NONE_GLYPH
    lo, hi = (float(bound) for bound in ci)
    if math.isfinite(lo) and math.isfinite(hi):
        return f"[{lo:.3f}, {hi:.3f}]"
    return _NONE_GLYPH
87
88
89 def _message_with_rank_profile(r: ProbeResult) -> str:
90 """Append the per-rank z-profile to a probe's message when present.
91
92 Renders as ``"<message> | rank profile: +4.2σ @ 1x / +6.8σ @ 0.5x"``.
93 When the probe didn't run under multi-rank calibration (``z_by_rank``
94 is ``None`` or has a single rank), returns the message unchanged.
95 """
96 base = r.message or ""
97 z_by_rank = r.evidence.get("z_by_rank")
98 if not z_by_rank or len(z_by_rank) < 2:
99 return base
100 profile = format_z_profile(z_by_rank)
101 if not profile:
102 return base
103 return f"{base} | rank profile: {profile}" if base else f"rank profile: {profile}"
104
105
def format_duration_s(v: float | int | None) -> str:
    """Wall-time display with precision scaled to magnitude.

    Two decimals below 10 s (``1.23s``), one decimal from 10 s up to
    100 s (``12.3s``), thousands-separated whole seconds at 100 s and
    above (``1,234s``); ``—`` when ``v`` is ``None`` or non-finite.
    """
    if v is None or not math.isfinite(float(v)):
        return _NONE_GLYPH
    seconds = float(v)
    if seconds < 10.0:
        return f"{seconds:.2f}s"
    return f"{seconds:.1f}s" if seconds < 100.0 else f"{seconds:,.0f}s"
116
117
118 # -- extras-rollup helpers (S06.6) -------------------------------------
119
#: Matches the ``install the [X] extra`` hint that SKIP messages carry;
#: group 1 captures the extra's name. Case-insensitive.
_MISSING_EXTRA_RE: re.Pattern[str] = re.compile(r"install the \[([^\]]+)\] extra", re.IGNORECASE)
121
122
def collect_missing_extras(suite: SuiteResult) -> list[str]:
    """Harvest ``install the [X] extra`` hints out of SKIP messages.

    ``BackendNotAvailableError`` formats its messages with
    ``install the [<extra>] extra``, so the extra names can be lifted
    straight out of the text without threading a new field through.
    Returns a deduplicated, sorted list of extras that would unskip
    probes.
    """
    extras: set[str] = set()
    skip_messages = (
        p.message for p in suite.probes if p.verdict == Verdict.SKIP and p.message
    )
    for message in skip_messages:
        extras.update(m.group(1) for m in _MISSING_EXTRA_RE.finditer(message))
    return sorted(extras)
138
139
def collect_degenerate_null_kinds(suite: SuiteResult) -> list[str]:
    """Probe kinds whose null-calibration stats were flagged degenerate.

    ``null_adapter`` marks a kind's stats with ``degenerate: 1.0`` when
    the calibration ran but the baseline was too narrow for the z-score
    path to fire (``runs: 1``, or a multi-seed run whose raws collapsed
    to an effectively-zero variance — F02 from Audit 03). Unlike
    :func:`collect_null_opt_outs` (which surfaces probes that opted
    out at spec-build time), this surface catches the case where the
    null *did* run but wasn't useful. Both cases fall back to fixed
    thresholds; the report distinguishes them so users can act:
    ``opt_out`` → expected for probes like ``adapter_revert``;
    ``degenerate`` → bump ``runs:`` in the spec.

    The per-kind stats live in ``SuiteResult.null_stats`` (the
    canonical field the runner threads calibration through), not the
    probe's evidence — so gate on a ``null_adapter`` probe being
    present, then scan the mapping exactly once instead of re-scanning
    it for every null_adapter probe in the suite.
    """
    # Without a null_adapter probe in the run there is nothing to
    # report — preserves the original behavior of returning [] even if
    # ``null_stats`` happens to be populated.
    if not any(p.kind == "null_adapter" for p in suite.probes):
        return []
    stats_by_kind = suite.null_stats or {}
    found = {
        kind
        for kind, kind_stats in stats_by_kind.items()
        # ``degenerate`` is a float flag (1.0 = true); >= 0.5 tolerates
        # round-tripping through JSON.
        if isinstance(kind_stats, dict) and kind_stats.get("degenerate", 0.0) >= 0.5
    }
    return sorted(found)
169
170
def collect_null_opt_outs(suite: SuiteResult) -> list[str]:
    """Probe kinds that opted out of null calibration.

    ``null_adapter`` publishes ``evidence["skipped_kinds"]`` with the
    probe kinds whose ``calibrate_spec`` returned ``None`` (e.g.
    ``adapter_revert`` — no embedder on the null proxy;
    ``prompt_collapse`` — noise can't fit an exponential decay).
    Returns a deduplicated, sorted list of those kinds; empty when no
    null_adapter ran in the suite.
    """
    kinds: set[str] = set()
    for probe in suite.probes:
        if probe.kind != "null_adapter":
            continue
        skipped = probe.evidence.get("skipped_kinds") or ()
        # Tolerate junk entries: only string kinds make it through.
        kinds.update(k for k in skipped if isinstance(k, str))
    return sorted(kinds)
192
193
def to_terminal(suite: SuiteResult, score: SwayScore, *, console: Console | None = None) -> None:
    """Render the report to a rich Console (stdout by default).

    Layout, top to bottom: header panel (base model vs adapter label),
    overall score + band line, component breakdown grid, per-probe
    detail table, numbered findings, three actionable rollup footers
    (missing extras / null opt-outs / degenerate null baselines), and
    a dim wall-time/version footer. Purely presentational: reads
    ``suite`` and ``score``, writes only to the console.

    :param suite: the completed probe-suite result to render.
    :param score: the composite score derived from ``suite``.
    :param console: target Console; a fresh stdout Console when None.
    """
    c = console or Console()

    header = Text.assemble(
        ("sway report — ", "bold"),
        (suite.base_model_id, "cyan"),
        (" vs ", "dim"),
        (_adapter_label(suite.adapter_id), "cyan"),
    )
    c.print(Panel(header, expand=False, border_style="blue"))

    # Headline: score styled by magnitude, band styled by its own map.
    c.print()
    c.print(
        Text.assemble(
            ("overall: ", "bold"),
            (format_score(score.overall), _score_style(score.overall)),
            (" ", ""),
            (f"[ {score.band} ]", _band_style(score.band)),
        )
    )

    # Component breakdown. Order matches ``DEFAULT_COMPONENT_WEIGHTS``
    # (the extensibility point) and appends any categories present in
    # ``score.components`` but not in the default weights — so a custom
    # Probe subclass with a new category still renders.
    comp_table = Table.grid(padding=(0, 2))
    comp_table.add_column(justify="left")
    comp_table.add_column(justify="right")
    comp_table.add_column()
    comp_table.add_column(style="dim")
    for cat in _category_order(score):
        if cat not in score.components:
            continue
        v = score.components[cat]
        weight = score.weights.get(cat, 0.0)
        # S03 / B18: a zero-weight category contributes nothing to the
        # composite; label explicitly so users don't mistake the visible
        # bar for judgment.
        label = "(informational, weight=0)" if weight == 0.0 else ""
        comp_table.add_row(cat, format_score(v), _bar(v), label)
    c.print(comp_table)

    c.print()
    # Per-probe detail.
    detail = Table(show_header=True, header_style="bold", box=None, padding=(0, 1))
    detail.add_column("name", style="cyan")
    detail.add_column("kind", style="dim")
    detail.add_column("verdict")
    detail.add_column("score", justify="right")
    detail.add_column("raw", justify="right")
    detail.add_column("ci95", justify="right", style="dim")
    detail.add_column("z", justify="right")
    # D15: let Rich wrap long messages instead of hard-truncating at 80
    # chars with an ellipsis. ``overflow="fold"`` + ``no_wrap=False``
    # preserves the full text across multiple terminal lines.
    detail.add_column("note", style="dim", overflow="fold", no_wrap=False)
    for r in suite.probes:
        detail.add_row(
            r.name,
            r.kind,
            Text(r.verdict.value, style=_VERDICT_STYLE[r.verdict]),
            format_score(r.score),
            format_raw(r.raw),
            format_ci(r.ci_95),
            format_z(r.z_score),
            Text(_message_with_rank_profile(r)),
        )
    c.print(detail)

    if score.findings:
        c.print()
        c.print(Text("top findings:", style="bold"))
        for i, f in enumerate(score.findings, start=1):
            c.print(f"  {i}. {f}")

    # D3: missing-extras rollup. When probes SKIPped because their
    # backend extras aren't installed, collapse the hints into one
    # actionable footer rather than forcing the user to scan per-row.
    extras = collect_missing_extras(suite)
    if extras:
        c.print()
        skipped_ct = sum(1 for p in suite.probes if p.verdict == Verdict.SKIP)
        c.print(
            Text(
                f"{skipped_ct} probe(s) skipped due to missing extras: "
                f"pip install 'dlm-sway[{','.join(extras)}]'",
                style="dim",
            )
        )

    # F15: null-calibration opt-outs rollup. Probes whose
    # ``calibrate_spec`` returns ``None`` fall back to fixed-threshold
    # verdicts. Surface the list in the footer so users understand
    # why those rows read ``(no calibration)`` in the message column.
    opt_outs = collect_null_opt_outs(suite)
    if opt_outs:
        c.print()
        c.print(
            Text(
                f"{len(opt_outs)} probe(s) opted out of null calibration "
                f"(using fixed thresholds): {', '.join(opt_outs)}",
                style="dim",
            )
        )

    # F02 (Audit 03): null-calibration-degenerate rollup. Distinct from
    # opt-outs — the null *did* run, but its baseline was too narrow
    # (``runs: 1`` or coincidentally-identical seeds). Users see this
    # and bump ``runs:`` in the spec; the fix is actionable.
    degenerate = collect_degenerate_null_kinds(suite)
    if degenerate:
        c.print()
        c.print(
            Text(
                f"{len(degenerate)} probe kind(s) had a degenerate null "
                f"baseline (std ≈ 0, insufficient for z-scoring): "
                f"{', '.join(degenerate)} — bump ``runs:`` in null_adapter spec.",
                style="dim",
            )
        )

    # Footer: wall time, version, determinism class, cache stats — all
    # dim so the verdict above stays the visual anchor.
    c.print()
    footer_parts = [f"wall: {format_duration_s(suite.wall_seconds)}", f"sway {suite.sway_version}"]
    if suite.determinism is not None:
        footer_parts.append(f"det: {suite.determinism.class_} (seed={suite.determinism.seed})")
    cache_line = _cache_line(suite)
    if cache_line is not None:
        footer_parts.append(cache_line)
    c.print(Text(" | ".join(footer_parts), style="dim"))
324
325
def to_json(suite: SuiteResult, score: SwayScore) -> str:
    """Serialize the suite + composite score as JSON.

    Stable schema; downstream tools rely on it. The payload carries a
    ``schema_version`` field (currently ``1``, emitted by
    :func:`_to_jsonable`) — breaking changes bump it. Keys are sorted
    so output diffs cleanly run-over-run.
    """
    return json.dumps(_to_jsonable(suite, score), indent=2, sort_keys=True)
333
334
335 def _to_jsonable(suite: SuiteResult, score: SwayScore) -> dict[str, Any]:
336 determinism: dict[str, Any] | None = None
337 if suite.determinism is not None:
338 determinism = {
339 "class": suite.determinism.class_,
340 "seed": suite.determinism.seed,
341 "notes": list(suite.determinism.notes),
342 }
343 return {
344 "schema_version": 1,
345 "sway_version": suite.sway_version,
346 "spec_path": suite.spec_path,
347 "base_model_id": suite.base_model_id,
348 "adapter_id": suite.adapter_id,
349 "started_at": suite.started_at.isoformat(),
350 "finished_at": suite.finished_at.isoformat(),
351 "wall_seconds": suite.wall_seconds,
352 "determinism": determinism,
353 "backend_stats": dict(suite.backend_stats) if suite.backend_stats else {},
354 "score": {
355 "overall": score.overall,
356 "band": score.band,
357 "components": score.components,
358 "weights": score.weights,
359 "findings": list(score.findings),
360 },
361 "null_stats": suite.null_stats,
362 "probes": [_probe_to_jsonable(p) for p in suite.probes],
363 }
364
365
366 def _probe_to_jsonable(r: ProbeResult) -> dict[str, Any]:
367 return {
368 "name": r.name,
369 "kind": r.kind,
370 "verdict": r.verdict.value,
371 "score": r.score,
372 "raw": r.raw,
373 "z_score": r.z_score,
374 "base_value": r.base_value,
375 "ft_value": r.ft_value,
376 "evidence": r.evidence,
377 "message": r.message,
378 "duration_s": r.duration_s,
379 # S14: bootstrap 95% CI on ``raw``. Serialized as a two-list
380 # [lo, hi] so JSON stays tuple-free (match numpy convention).
381 "ci_95": list(r.ci_95) if r.ci_95 is not None else None,
382 }
383
384
def from_json(raw: dict[str, Any]) -> tuple[SuiteResult, SwayScore]:
    """Reconstruct a ``(SuiteResult, SwayScore)`` pair from saved JSON.

    Inverse of :func:`to_json` for the fields the renderers consume.
    Missing fields are tolerated — older snapshots predate
    ``determinism`` and ``schema_version`` — so this helper stays
    backward-compatible by default. ``sway report --format X`` uses
    this so all four formats (terminal / md / junit / json) flow
    through the same renderers as a fresh ``sway run`` (B16).

    :param raw: a dict parsed from :func:`to_json` output.
    :returns: the rehydrated suite and composite score.
    """
    # NOTE(review): ProbeResult/SuiteResult/SwayScore/Verdict are already
    # imported at module scope; only DEFAULT_COMPONENT_WEIGHTS,
    # DeterminismReport, and datetime are new here. Presumably kept as a
    # local import for symmetry/isolation — confirm before hoisting.
    from datetime import datetime

    from dlm_sway.core.result import (
        DEFAULT_COMPONENT_WEIGHTS,
        DeterminismReport,
        ProbeResult,
        SuiteResult,
        SwayScore,
        Verdict,
    )

    # Parse an ISO-8601 timestamp; missing/empty → epoch in local tz.
    def _ts(s: str | None) -> datetime:
        if s:
            return datetime.fromisoformat(s)
        # Snapshots that predate the field — give the renderer a
        # well-defined zero so wall-time displays as 0.00s.
        return datetime.fromtimestamp(0).astimezone()

    # Coerce a serialized [lo, hi] back to a float tuple; anything
    # malformed (wrong arity, non-numeric) degrades to None rather
    # than failing the whole report load.
    def _ci_95(v: Any) -> tuple[float, float] | None:
        if v is None:
            return None
        try:
            lo, hi = v
            return (float(lo), float(hi))
        except (TypeError, ValueError):
            return None

    probes = tuple(
        ProbeResult(
            name=p["name"],
            kind=p["kind"],
            verdict=Verdict(p["verdict"]),
            score=p.get("score"),
            raw=p.get("raw"),
            z_score=p.get("z_score"),
            base_value=p.get("base_value"),
            ft_value=p.get("ft_value"),
            evidence=dict(p.get("evidence") or {}),
            # NOTE(review): a snapshot carrying an explicit
            # ``"message": null`` passes None through — ``.get``'s
            # default only covers a *missing* key. Other renderers
            # guard with ``r.message or ""``, so presumably fine;
            # confirm ProbeResult accepts None here.
            message=p.get("message", ""),
            duration_s=float(p.get("duration_s", 0.0)),
            ci_95=_ci_95(p.get("ci_95")),
        )
        for p in raw.get("probes", [])
    )

    determinism: DeterminismReport | None = None
    det_raw = raw.get("determinism")
    if isinstance(det_raw, dict):
        determinism = DeterminismReport(
            class_=det_raw.get("class", "best_effort"),
            seed=int(det_raw.get("seed", 0)),
            notes=tuple(det_raw.get("notes") or ()),
        )

    suite = SuiteResult(
        spec_path=raw.get("spec_path", ""),
        started_at=_ts(raw.get("started_at")),
        finished_at=_ts(raw.get("finished_at")),
        base_model_id=raw.get("base_model_id", ""),
        adapter_id=raw.get("adapter_id", ""),
        sway_version=raw.get("sway_version", "?"),
        probes=probes,
        null_stats=dict(raw.get("null_stats") or {}),
        determinism=determinism,
        backend_stats=dict(raw.get("backend_stats") or {}),
    )

    # Absent weights fall back to the canonical defaults so a composite
    # re-rendered from an old snapshot still has a full weight map.
    score_raw: dict[str, Any] = raw.get("score") or {}
    score = SwayScore(
        overall=float(score_raw.get("overall", 0.0)),
        components=dict(score_raw.get("components") or {}),
        weights=dict(score_raw.get("weights") or DEFAULT_COMPONENT_WEIGHTS),
        band=score_raw.get("band", ""),
        findings=tuple(score_raw.get("findings") or ()),
    )
    return suite, score
471
472
def to_junit(suite: SuiteResult, score: SwayScore) -> str:
    """Serialize as JUnit XML: one ``<testcase>`` per probe.

    The composite score and per-category breakdown travel in the
    ``<properties>`` element so CI dashboards can surface them.
    """
    root = ET.Element(
        "testsuite",
        {
            "name": "sway",
            "tests": str(len(suite.probes)),
            # sum() over booleans counts the matching verdicts.
            "failures": str(sum(p.verdict == Verdict.FAIL for p in suite.probes)),
            "errors": str(sum(p.verdict == Verdict.ERROR for p in suite.probes)),
            "skipped": str(sum(p.verdict == Verdict.SKIP for p in suite.probes)),
            "time": f"{suite.wall_seconds:.3f}",
        },
    )
    props = ET.SubElement(root, "properties")
    ET.SubElement(props, "property", {"name": "overall", "value": f"{score.overall:.4f}"})
    ET.SubElement(props, "property", {"name": "band", "value": score.band})
    for category, value in score.components.items():
        ET.SubElement(
            props, "property", {"name": f"component.{category}", "value": f"{value:.4f}"}
        )

    for probe in suite.probes:
        case = ET.SubElement(
            root,
            "testcase",
            {"classname": probe.kind, "name": probe.name, "time": f"{probe.duration_s:.3f}"},
        )
        # Non-passing verdicts map onto the JUnit child element CI tools
        # expect; PASS/WARN rows stay bare testcases.
        if probe.verdict == Verdict.FAIL:
            ET.SubElement(case, "failure", {"message": probe.message or "failed"})
        elif probe.verdict == Verdict.ERROR:
            ET.SubElement(case, "error", {"message": probe.message or "errored"})
        elif probe.verdict == Verdict.SKIP:
            ET.SubElement(case, "skipped", {"message": probe.message or "skipped"})

    return ET.tostring(root, encoding="unicode")
507
508
def to_markdown(suite: SuiteResult, score: SwayScore) -> str:
    """A portable, CI-friendly markdown report.

    The single source of the markdown emit (B16): both
    ``sway run --markdown`` and ``sway report --format md`` route
    through this function. No second ``_render_markdown_from_json``.

    Sections, top to bottom: header block, component table, per-probe
    table (parity with the terminal table — D9), findings, the three
    actionable rollups (missing extras / null opt-outs / degenerate
    null baselines), and the ``cluster_kl`` per-cluster breakdown.
    """
    buf = StringIO()
    buf.write("# sway report\n\n")
    buf.write(f"**Overall:** {format_score(score.overall)} (`{score.band}`) \n")
    buf.write(f"**Base:** `{suite.base_model_id}` \n")
    buf.write(f"**Adapter:** `{_adapter_label(suite.adapter_id)}` \n")
    buf.write(f"**Wall:** {format_duration_s(suite.wall_seconds)} \n")
    if suite.determinism is not None:
        buf.write(
            f"**Determinism:** `{suite.determinism.class_}` (seed={suite.determinism.seed}) \n"
        )
    cache_line = _cache_line(suite)
    if cache_line is not None:
        buf.write(f"**Backend:** {cache_line} \n")
    buf.write("\n")

    buf.write("## Components\n\n")
    buf.write("| category | score | weight | |\n|---|---:|---:|---|\n")
    for cat in _category_order(score):
        if cat not in score.components:
            continue
        v = score.components[cat]
        weight = score.weights.get(cat, 0.0)
        # S03 / B18: flag zero-weight categories as informational.
        label = "(informational, weight=0)" if weight == 0.0 else ""
        buf.write(f"| {cat} | {format_score(v)} | {format_score(weight)} | {label} |\n")

    # D9: markdown must reach parity with the terminal table — raw,
    # z_score, duration_s all shown. Findings are appended as a section
    # below so CI log consumers can see them without opening the JSON.
    buf.write("\n## Probes\n\n")
    buf.write(
        "| name | kind | verdict | score | raw | ci95 | z | duration | note |\n"
        "|---|---|---|---:|---:|---:|---:|---:|---|\n"
    )
    for r in suite.probes:
        # Escape pipes in messages so markdown doesn't treat them as
        # column separators. Leading/trailing whitespace collapsed.
        note = _message_with_rank_profile(r).replace("|", "\\|").replace("\n", " ").strip()
        buf.write(
            f"| {r.name} | `{r.kind}` | {r.verdict.value} | "
            f"{format_score(r.score)} | {format_raw(r.raw)} | "
            f"{format_ci(r.ci_95)} | {format_z(r.z_score)} | "
            f"{format_duration_s(r.duration_s)} | {note} |\n"
        )

    if score.findings:
        buf.write("\n## Top findings\n\n")
        for f in score.findings:
            buf.write(f"- {f}\n")

    # D3: missing-extras rollup.
    extras = collect_missing_extras(suite)
    if extras:
        skipped_ct = sum(1 for p in suite.probes if p.verdict == Verdict.SKIP)
        buf.write("\n## Skipped probes\n\n")
        buf.write(f"{skipped_ct} probe(s) skipped due to missing extras. Install with:\n\n")
        buf.write(f"```\npip install 'dlm-sway[{','.join(extras)}]'\n```\n")

    # F15: null-calibration opt-outs rollup.
    opt_outs = collect_null_opt_outs(suite)
    if opt_outs:
        buf.write("\n## Null-calibration opt-outs\n\n")
        buf.write(
            f"{len(opt_outs)} probe(s) fall back to fixed thresholds because "
            f"their `calibrate_spec` returns `None`:\n\n"
        )
        for kind in opt_outs:
            buf.write(f"- `{kind}`\n")

    # F02 (Audit 03) — degenerate null-calibration rollup.
    degenerate = collect_degenerate_null_kinds(suite)
    if degenerate:
        buf.write("\n## Degenerate null calibration\n\n")
        buf.write(
            f"{len(degenerate)} probe kind(s) ran null_adapter but the "
            f"resulting baseline was too narrow for z-scoring "
            f"(std ≈ 0, typically `runs: 1` or coincidentally-matched "
            f"seeds). Fix: bump `runs:` in the `null_adapter` spec "
            f"entry. Affected kinds:\n\n"
        )
        for kind in degenerate:
            buf.write(f"- `{kind}`\n")

    # F07 — cluster_kl sub-line: expand the per-cluster breakdown so
    # the reader can answer "which topic moved?" without cracking open
    # the JSON. The row itself already carries ``k=N, spec=X.XX`` in
    # the message; this section adds the per-cluster mean KL + top
    # exemplars.
    ck_probes = [p for p in suite.probes if p.kind == "cluster_kl" and p.evidence]
    if ck_probes:
        buf.write("\n## Cluster breakdown (cluster_kl)\n\n")
        for p in ck_probes:
            per_cluster = p.evidence.get("per_cluster_mean_kl", [])
            sizes = p.evidence.get("per_cluster_size", [])
            exemplars = p.evidence.get("cluster_exemplars", [])
            buf.write(f"### `{p.name}`\n\n")
            buf.write("| cluster | size | mean KL | exemplars |\n")
            buf.write("|---:|---:|---:|---|\n")
            for i, (mean, size, ex) in enumerate(zip(per_cluster, sizes, exemplars, strict=False)):
                # S06.10: route the mean through the unified formatter
                # (it also maps inf/nan to the glyph), and use the shared
                # ``_NONE_GLYPH`` sentinel rather than re-hard-coding the
                # character — the module's single-source contract.
                mean_str = format_raw(mean) if isinstance(mean, int | float) else _NONE_GLYPH
                ex_str = "; ".join(e.replace("|", "\\|") for e in (ex or [])) or _NONE_GLYPH
                buf.write(f"| {i} | {size} | {mean_str} | {ex_str} |\n")
            buf.write("\n")

    return buf.getvalue()
620
621
622 # -- helpers -----------------------------------------------------------
623
624
def _category_order(score: SwayScore) -> list[str]:
    """Unified render order for component categories.

    Two sources, in priority order:

    1. Keys of :data:`core.result.DEFAULT_COMPONENT_WEIGHTS` — the
       canonical category list every first-party probe slots into.
    2. Any category present in ``score.components`` that isn't in the
       default weights — so a custom :class:`Probe` subclass declaring
       a brand-new category still renders (F16).

    Shared by the terminal and markdown renderers so future additions
    flow through both surfaces without a second code path.
    """
    from dlm_sway.core.result import DEFAULT_COMPONENT_WEIGHTS

    known = list(DEFAULT_COMPONENT_WEIGHTS)
    extras = [cat for cat in score.components if cat not in DEFAULT_COMPONENT_WEIGHTS]
    return known + extras
644
645
646 def _cache_line(suite: SuiteResult) -> str | None:
647 """Format the cache-hit-rate footer line, or ``None`` when no stats.
648
649 S23 — suffixes a ``batches: N (avg=K)`` segment when the suite
650 fired any batched forward calls. Runs that only use single-prompt
651 scoring (older probes, opt-out probes) render the cache line
652 alone, preserving pre-S23 footer shape.
653 """
654 stats = suite.backend_stats
655 if not stats:
656 return None
657 hits = int(stats.get("cache_hits", 0))
658 misses = int(stats.get("cache_misses", 0))
659 total = hits + misses
660 if total == 0:
661 return None
662 pct = 100.0 * hits / total
663 line = f"cache: {hits}/{total} = {pct:.0f}%"
664 batches = int(stats.get("batches_sent", 0))
665 if batches > 0:
666 avg = float(stats.get("avg_batch_size", 0.0))
667 line = f"{line} | batches: {batches} (avg={avg:.1f})"
668 return line
669
670
671 def _adapter_label(adapter_id: str) -> str:
672 """Truncate the adapter path for display; quote when whitespace is present.
673
674 D14: a path containing spaces (``/Users/me/My Adapters/v1``) was
675 rendering ambiguously in the header. Quote it whenever any
676 whitespace appears so the trailing path is unmistakable.
677 """
678 if not adapter_id:
679 return "(base only)"
680 parts = adapter_id.rstrip("/").split("/")
681 label = "/".join(parts[-3:]) if len(parts) > 3 else adapter_id
682 if any(ch.isspace() for ch in label):
683 # Use double quotes so the result drops cleanly into a CLI
684 # invocation if a user copy-pastes it.
685 return f'"{label}"'
686 return label
687
688
689 def _score_style(v: float) -> str:
690 if v >= 0.6:
691 return "bold green"
692 if v >= 0.3:
693 return "bold yellow"
694 return "bold red"
695
696
697 def _band_style(band: str) -> str:
698 return {
699 "noise": "red",
700 "partial": "yellow",
701 "healthy": "green",
702 "suspicious": "magenta",
703 }.get(band, "white")
704
705
706 def _bar(v: float, *, width: int = 10) -> str:
707 clamped = max(0.0, min(1.0, v))
708 filled = int(round(clamped * width))
709 return "█" * filled + "░" * (width - filled)
710
711
# Public API. Every public formatter and emitter is listed — including
# ``format_ci``, which was previously omitted even though it is public
# and documented like its ``format_*`` siblings.
__all__ = [
    "collect_degenerate_null_kinds",
    "collect_missing_extras",
    "collect_null_opt_outs",
    "format_ci",
    "format_duration_s",
    "format_raw",
    "format_score",
    "format_z",
    "from_json",
    "to_json",
    "to_junit",
    "to_markdown",
    "to_terminal",
]