| 1 | """Report emitters: terminal (rich), JSON, JUnit XML, markdown. |
| 2 | |
| 3 | The terminal renderer is the one a user sees; it's the product surface. |
| 4 | It must communicate the verdict *and* the supporting evidence without |
| 5 | forcing the user to open the JSON. |
| 6 | |
| 7 | JSON is the machine-readable source of truth — same fields as the |
| 8 | :class:`SuiteResult` dataclass but flattened for easy downstream parsing |
| 9 | (dashboards, diff tools, history tracking). |
| 10 | |
| 11 | JUnit XML exists to drop into CI pipelines so ``sway gate`` |
| 12 | integrates with existing test dashboards with no extra glue. |
| 13 | """ |
| 14 | |
| 15 | from __future__ import annotations |
| 16 | |
| 17 | import json |
| 18 | import math |
| 19 | import re |
| 20 | import xml.etree.ElementTree as ET |
| 21 | from io import StringIO |
| 22 | from typing import Any |
| 23 | |
| 24 | from rich.console import Console |
| 25 | from rich.panel import Panel |
| 26 | from rich.table import Table |
| 27 | from rich.text import Text |
| 28 | |
| 29 | from dlm_sway.core.result import ProbeResult, SuiteResult, SwayScore, Verdict |
| 30 | from dlm_sway.probes._zscore import format_z_profile |
| 31 | |
#: Rich style string applied to each verdict cell in the per-probe
#: terminal table — a single shared lookup keeps verdict coloring
#: consistent wherever a verdict is rendered.
_VERDICT_STYLE = {
    Verdict.PASS: "bold green",
    Verdict.FAIL: "bold red",
    Verdict.WARN: "bold yellow",
    Verdict.SKIP: "dim",
    Verdict.ERROR: "bold magenta",
}

#: Sentinel character all renderers use for "no numeric value available."
#: Single source prevents drift between surfaces (terminal vs markdown
#: vs JSON downstream consumers that copy the rendered strings).
_NONE_GLYPH = "—"
| 44 | |
| 45 | |
| 46 | # -- unified number formatters (S06.10) -------------------------------- |
| 47 | # |
| 48 | # Every surface that prints a number routes through one of these. A |
| 49 | # tests snapshot that locks report output catches any drift before it |
| 50 | # ships. Typing is wide (``float | int | None``) so callers don't have |
| 51 | # to special-case ``None`` at every site. |
| 52 | |
| 53 | |
| 54 | def format_score(v: float | int | None) -> str: |
| 55 | """Two-decimal score, ``—`` when missing or non-finite.""" |
| 56 | if v is None or not math.isfinite(float(v)): |
| 57 | return _NONE_GLYPH |
| 58 | return f"{float(v):.2f}" |
| 59 | |
| 60 | |
| 61 | def format_raw(v: float | int | None) -> str: |
| 62 | """Three-decimal raw metric, ``—`` when missing or non-finite. |
| 63 | |
| 64 | Uses thousands separators at magnitude ≥ 1 000 so half-life outputs |
| 65 | from ``prompt_collapse`` don't render as ``1945.473`` (hard to eyeball). |
| 66 | """ |
| 67 | if v is None or not math.isfinite(float(v)): |
| 68 | return _NONE_GLYPH |
| 69 | return f"{float(v):,.3f}" |
| 70 | |
| 71 | |
| 72 | def format_z(v: float | int | None) -> str: |
| 73 | """Signed z-score with ``σ`` suffix and thousands separator, ``—`` on None.""" |
| 74 | if v is None or not math.isfinite(float(v)): |
| 75 | return _NONE_GLYPH |
| 76 | return f"{float(v):+,.2f}σ" |
| 77 | |
| 78 | |
| 79 | def format_ci(ci: tuple[float, float] | None) -> str: |
| 80 | """Percentile-bootstrap 95% CI as ``[lo, hi]``; ``—`` on None / non-finite.""" |
| 81 | if ci is None: |
| 82 | return _NONE_GLYPH |
| 83 | lo, hi = ci |
| 84 | if not (math.isfinite(float(lo)) and math.isfinite(float(hi))): |
| 85 | return _NONE_GLYPH |
| 86 | return f"[{float(lo):.3f}, {float(hi):.3f}]" |
| 87 | |
| 88 | |
| 89 | def _message_with_rank_profile(r: ProbeResult) -> str: |
| 90 | """Append the per-rank z-profile to a probe's message when present. |
| 91 | |
| 92 | Renders as ``"<message> | rank profile: +4.2σ @ 1x / +6.8σ @ 0.5x"``. |
| 93 | When the probe didn't run under multi-rank calibration (``z_by_rank`` |
| 94 | is ``None`` or has a single rank), returns the message unchanged. |
| 95 | """ |
| 96 | base = r.message or "" |
| 97 | z_by_rank = r.evidence.get("z_by_rank") |
| 98 | if not z_by_rank or len(z_by_rank) < 2: |
| 99 | return base |
| 100 | profile = format_z_profile(z_by_rank) |
| 101 | if not profile: |
| 102 | return base |
| 103 | return f"{base} | rank profile: {profile}" if base else f"rank profile: {profile}" |
| 104 | |
| 105 | |
| 106 | def format_duration_s(v: float | int | None) -> str: |
| 107 | """Wall-time display. ``1.23s`` for sub-second, ``12.3s`` above 10, ``—`` on None.""" |
| 108 | if v is None or not math.isfinite(float(v)): |
| 109 | return _NONE_GLYPH |
| 110 | f = float(v) |
| 111 | if f < 10.0: |
| 112 | return f"{f:.2f}s" |
| 113 | if f < 100.0: |
| 114 | return f"{f:.1f}s" |
| 115 | return f"{f:,.0f}s" |
| 116 | |
| 117 | |
| 118 | # -- extras-rollup helpers (S06.6) ------------------------------------- |
| 119 | |
#: Captures the bracketed extra name from ``install the [X] extra`` hint
#: text (case-insensitive); group 1 is the extra name. Consumed by
#: :func:`collect_missing_extras`.
_MISSING_EXTRA_RE = re.compile(r"install the \[([^\]]+)\] extra", re.IGNORECASE)
| 121 | |
| 122 | |
def collect_missing_extras(suite: SuiteResult) -> list[str]:
    """Harvest extra names from SKIP messages saying ``install the [X] extra``.

    ``BackendNotAvailableError`` embeds ``install the [<extra>] extra``
    in its message, so the extra names that would unskip probes can be
    scraped straight out of skipped probes' messages without wiring a
    dedicated field through. Result is deduplicated and sorted.
    """
    names = {
        m.group(1)
        for p in suite.probes
        if p.verdict == Verdict.SKIP and p.message
        for m in _MISSING_EXTRA_RE.finditer(p.message)
    }
    return sorted(names)
| 138 | |
| 139 | |
def collect_degenerate_null_kinds(suite: SuiteResult) -> list[str]:
    """Probe kinds whose null-calibration stats were flagged degenerate.

    ``null_adapter`` marks a kind's stats with ``degenerate: 1.0`` when
    the calibration ran but the baseline was too narrow for the z-score
    path to fire (``runs: 1``, or a multi-seed run whose raws collapsed
    to an effectively-zero variance — F02 from Audit 03). Unlike
    :func:`collect_null_opt_outs` (which surfaces probes that opted
    out at spec-build time), this surface catches the case where the
    null *did* run but wasn't useful. Both cases fall back to fixed
    thresholds; the report distinguishes them so users can act:
    ``opt_out`` → expected for probes like ``adapter_revert``;
    ``degenerate`` → bump ``runs:`` in the spec.
    """
    # Only report when a null_adapter probe actually ran in this suite —
    # the rollup is meaningless without a calibration probe present.
    if not any(p.kind == "null_adapter" for p in suite.probes):
        return []
    # ``null_adapter`` writes per-kind stats into ``SuiteResult.null_stats``
    # (the canonical place the runner threads calibration across probes),
    # not the probe's evidence — so scan the suite-level field exactly
    # once. The previous version re-ran this identical scan once per
    # null_adapter probe; the inner work never depended on the probe.
    found: set[str] = set()
    for kind, kind_stats in (suite.null_stats or {}).items():
        if isinstance(kind_stats, dict) and kind_stats.get("degenerate", 0.0) >= 0.5:
            found.add(kind)
    return sorted(found)
| 169 | |
| 170 | |
def collect_null_opt_outs(suite: SuiteResult) -> list[str]:
    """Probe kinds that opted out of null calibration.

    ``null_adapter`` publishes ``evidence["skipped_kinds"]`` listing the
    probe kinds whose ``calibrate_spec`` returned ``None`` (e.g.
    ``adapter_revert`` — no embedder on the null proxy;
    ``prompt_collapse`` — noise can't fit an exponential decay).
    Returns those kinds deduplicated and sorted, or an empty list when
    no null_adapter ran in the suite.
    """
    kinds: set[str] = set()
    for probe in suite.probes:
        if probe.kind != "null_adapter":
            continue
        for entry in probe.evidence.get("skipped_kinds") or ():
            # Defensive: only accept string kind names from evidence.
            if isinstance(entry, str):
                kinds.add(entry)
    return sorted(kinds)
| 192 | |
| 193 | |
def to_terminal(suite: SuiteResult, score: SwayScore, *, console: Console | None = None) -> None:
    """Render the report to a rich Console (stdout by default).

    Layout, top to bottom: header panel (base model vs adapter label),
    overall score + band, weighted component breakdown, per-probe
    detail table, top findings, three actionable rollup footers
    (missing extras, null-calibration opt-outs, degenerate null
    baselines), and a dim wall-time / version / determinism / cache
    footer line.

    ``console`` is injectable so tests can capture output; ``None``
    constructs a default stdout-backed :class:`rich.console.Console`.
    """
    c = console or Console()

    # Header: base model vs (possibly truncated/quoted) adapter label.
    header = Text.assemble(
        ("sway report — ", "bold"),
        (suite.base_model_id, "cyan"),
        (" vs ", "dim"),
        (_adapter_label(suite.adapter_id), "cyan"),
    )
    c.print(Panel(header, expand=False, border_style="blue"))

    c.print()
    c.print(
        Text.assemble(
            ("overall: ", "bold"),
            (format_score(score.overall), _score_style(score.overall)),
            (" ", ""),
            (f"[ {score.band} ]", _band_style(score.band)),
        )
    )

    # Component breakdown. Order matches ``DEFAULT_COMPONENT_WEIGHTS``
    # (the extensibility point) and appends any categories present in
    # ``score.components`` but not in the default weights — so a custom
    # Probe subclass with a new category still renders.
    comp_table = Table.grid(padding=(0, 2))
    comp_table.add_column(justify="left")
    comp_table.add_column(justify="right")
    comp_table.add_column()
    comp_table.add_column(style="dim")
    for cat in _category_order(score):
        if cat not in score.components:
            continue
        v = score.components[cat]
        weight = score.weights.get(cat, 0.0)
        # S03 / B18: a zero-weight category contributes nothing to the
        # composite; label explicitly so users don't mistake the visible
        # bar for judgment.
        label = "(informational, weight=0)" if weight == 0.0 else ""
        comp_table.add_row(cat, format_score(v), _bar(v), label)
    c.print(comp_table)

    c.print()
    # Per-probe detail.
    detail = Table(show_header=True, header_style="bold", box=None, padding=(0, 1))
    detail.add_column("name", style="cyan")
    detail.add_column("kind", style="dim")
    detail.add_column("verdict")
    detail.add_column("score", justify="right")
    detail.add_column("raw", justify="right")
    detail.add_column("ci95", justify="right", style="dim")
    detail.add_column("z", justify="right")
    # D15: let Rich wrap long messages instead of hard-truncating at 80
    # chars with an ellipsis. ``overflow="fold"`` + ``no_wrap=False``
    # preserves the full text across multiple terminal lines.
    detail.add_column("note", style="dim", overflow="fold", no_wrap=False)
    for r in suite.probes:
        detail.add_row(
            r.name,
            r.kind,
            Text(r.verdict.value, style=_VERDICT_STYLE[r.verdict]),
            format_score(r.score),
            format_raw(r.raw),
            format_ci(r.ci_95),
            format_z(r.z_score),
            Text(_message_with_rank_profile(r)),
        )
    c.print(detail)

    if score.findings:
        c.print()
        c.print(Text("top findings:", style="bold"))
        for i, f in enumerate(score.findings, start=1):
            c.print(f" {i}. {f}")

    # D3: missing-extras rollup. When probes SKIPped because their
    # backend extras aren't installed, collapse the hints into one
    # actionable footer rather than forcing the user to scan per-row.
    extras = collect_missing_extras(suite)
    if extras:
        c.print()
        skipped_ct = sum(1 for p in suite.probes if p.verdict == Verdict.SKIP)
        c.print(
            Text(
                f"{skipped_ct} probe(s) skipped due to missing extras: "
                f"pip install 'dlm-sway[{','.join(extras)}]'",
                style="dim",
            )
        )

    # F15: null-calibration opt-outs rollup. Probes whose
    # ``calibrate_spec`` returns ``None`` fall back to fixed-threshold
    # verdicts. Surface the list in the footer so users understand
    # why those rows read ``(no calibration)`` in the message column.
    opt_outs = collect_null_opt_outs(suite)
    if opt_outs:
        c.print()
        c.print(
            Text(
                f"{len(opt_outs)} probe(s) opted out of null calibration "
                f"(using fixed thresholds): {', '.join(opt_outs)}",
                style="dim",
            )
        )

    # F02 (Audit 03): null-calibration-degenerate rollup. Distinct from
    # opt-outs — the null *did* run, but its baseline was too narrow
    # (``runs: 1`` or coincidentally-identical seeds). Users see this
    # and bump ``runs:`` in the spec; the fix is actionable.
    degenerate = collect_degenerate_null_kinds(suite)
    if degenerate:
        c.print()
        c.print(
            Text(
                f"{len(degenerate)} probe kind(s) had a degenerate null "
                f"baseline (std ≈ 0, insufficient for z-scoring): "
                f"{', '.join(degenerate)} — bump ``runs:`` in null_adapter spec.",
                style="dim",
            )
        )

    c.print()
    # Footer: wall time + version, plus optional determinism class and
    # cache hit-rate segments, joined with " | ".
    footer_parts = [f"wall: {format_duration_s(suite.wall_seconds)}", f"sway {suite.sway_version}"]
    if suite.determinism is not None:
        footer_parts.append(f"det: {suite.determinism.class_} (seed={suite.determinism.seed})")
    cache_line = _cache_line(suite)
    if cache_line is not None:
        footer_parts.append(cache_line)
    c.print(Text(" | ".join(footer_parts), style="dim"))
| 324 | |
| 325 | |
def to_json(suite: SuiteResult, score: SwayScore) -> str:
    """Serialize the suite + composite score as JSON.

    Stable schema; downstream tools rely on it. The payload carries a
    ``schema_version`` field (currently ``1``, set in
    :func:`_to_jsonable`) — breaking changes bump it. Keys are sorted
    and the output indented for diff-friendly snapshots.

    (Previous docstring claimed ``schema_version`` was "not yet
    present", which contradicted the emitted payload — corrected.)
    """
    return json.dumps(_to_jsonable(suite, score), indent=2, sort_keys=True)
| 333 | |
| 334 | |
def _to_jsonable(suite: SuiteResult, score: SwayScore) -> dict[str, Any]:
    """Flatten the suite + score into plain JSON-serializable types."""
    det = suite.determinism
    det_payload: dict[str, Any] | None = (
        None
        if det is None
        else {"class": det.class_, "seed": det.seed, "notes": list(det.notes)}
    )
    return {
        "schema_version": 1,
        "sway_version": suite.sway_version,
        "spec_path": suite.spec_path,
        "base_model_id": suite.base_model_id,
        "adapter_id": suite.adapter_id,
        "started_at": suite.started_at.isoformat(),
        "finished_at": suite.finished_at.isoformat(),
        "wall_seconds": suite.wall_seconds,
        "determinism": det_payload,
        "backend_stats": dict(suite.backend_stats) if suite.backend_stats else {},
        "score": {
            "overall": score.overall,
            "band": score.band,
            "components": score.components,
            "weights": score.weights,
            "findings": list(score.findings),
        },
        "null_stats": suite.null_stats,
        "probes": [_probe_to_jsonable(p) for p in suite.probes],
    }
| 364 | |
| 365 | |
| 366 | def _probe_to_jsonable(r: ProbeResult) -> dict[str, Any]: |
| 367 | return { |
| 368 | "name": r.name, |
| 369 | "kind": r.kind, |
| 370 | "verdict": r.verdict.value, |
| 371 | "score": r.score, |
| 372 | "raw": r.raw, |
| 373 | "z_score": r.z_score, |
| 374 | "base_value": r.base_value, |
| 375 | "ft_value": r.ft_value, |
| 376 | "evidence": r.evidence, |
| 377 | "message": r.message, |
| 378 | "duration_s": r.duration_s, |
| 379 | # S14: bootstrap 95% CI on ``raw``. Serialized as a two-list |
| 380 | # [lo, hi] so JSON stays tuple-free (match numpy convention). |
| 381 | "ci_95": list(r.ci_95) if r.ci_95 is not None else None, |
| 382 | } |
| 383 | |
| 384 | |
def from_json(raw: dict[str, Any]) -> tuple[SuiteResult, SwayScore]:
    """Rebuild a ``(SuiteResult, SwayScore)`` pair from saved JSON.

    Inverse of :func:`to_json` for the fields the renderers consume.
    Missing fields are tolerated — older snapshots predate
    ``determinism`` and ``schema_version`` — so this helper stays
    backward-compatible by default. ``sway report --format X`` uses
    this so all four formats (terminal / md / junit / json) flow
    through the same renderers as a fresh ``sway run`` (B16).
    """
    from datetime import datetime

    from dlm_sway.core.result import (
        DEFAULT_COMPONENT_WEIGHTS,
        DeterminismReport,
        ProbeResult,
        SuiteResult,
        SwayScore,
        Verdict,
    )

    def _parse_ts(value: str | None) -> datetime:
        # Snapshots that predate the field get a well-defined epoch so
        # wall-time displays as 0.00s instead of raising.
        if not value:
            return datetime.fromtimestamp(0).astimezone()
        return datetime.fromisoformat(value)

    def _parse_ci(value: Any) -> tuple[float, float] | None:
        if value is None:
            return None
        try:
            lo, hi = value
            return (float(lo), float(hi))
        except (TypeError, ValueError):
            # Malformed CI payloads degrade to "no CI" rather than fail.
            return None

    def _parse_probe(p: dict[str, Any]) -> ProbeResult:
        return ProbeResult(
            name=p["name"],
            kind=p["kind"],
            verdict=Verdict(p["verdict"]),
            score=p.get("score"),
            raw=p.get("raw"),
            z_score=p.get("z_score"),
            base_value=p.get("base_value"),
            ft_value=p.get("ft_value"),
            evidence=dict(p.get("evidence") or {}),
            message=p.get("message", ""),
            duration_s=float(p.get("duration_s", 0.0)),
            ci_95=_parse_ci(p.get("ci_95")),
        )

    det_raw = raw.get("determinism")
    det = (
        DeterminismReport(
            class_=det_raw.get("class", "best_effort"),
            seed=int(det_raw.get("seed", 0)),
            notes=tuple(det_raw.get("notes") or ()),
        )
        if isinstance(det_raw, dict)
        else None
    )

    suite = SuiteResult(
        spec_path=raw.get("spec_path", ""),
        started_at=_parse_ts(raw.get("started_at")),
        finished_at=_parse_ts(raw.get("finished_at")),
        base_model_id=raw.get("base_model_id", ""),
        adapter_id=raw.get("adapter_id", ""),
        sway_version=raw.get("sway_version", "?"),
        probes=tuple(_parse_probe(p) for p in raw.get("probes", [])),
        null_stats=dict(raw.get("null_stats") or {}),
        determinism=det,
        backend_stats=dict(raw.get("backend_stats") or {}),
    )

    score_raw: dict[str, Any] = raw.get("score") or {}
    score = SwayScore(
        overall=float(score_raw.get("overall", 0.0)),
        components=dict(score_raw.get("components") or {}),
        weights=dict(score_raw.get("weights") or DEFAULT_COMPONENT_WEIGHTS),
        band=score_raw.get("band", ""),
        findings=tuple(score_raw.get("findings") or ()),
    )
    return suite, score
| 471 | |
| 472 | |
def to_junit(suite: SuiteResult, score: SwayScore) -> str:
    """Serialize as JUnit XML — one ``<testcase>`` per probe.

    Suite-level counts (tests/failures/errors/skipped) go on the
    ``<testsuite>`` element; the composite score and per-category
    breakdown ride along as ``<property>`` entries for dashboards
    that surface them.
    """
    verdicts = [p.verdict for p in suite.probes]
    root = ET.Element(
        "testsuite",
        {
            "name": "sway",
            "tests": str(len(suite.probes)),
            "failures": str(verdicts.count(Verdict.FAIL)),
            "errors": str(verdicts.count(Verdict.ERROR)),
            "skipped": str(verdicts.count(Verdict.SKIP)),
            "time": f"{suite.wall_seconds:.3f}",
        },
    )
    props = ET.SubElement(root, "properties")
    ET.SubElement(props, "property", {"name": "overall", "value": f"{score.overall:.4f}"})
    ET.SubElement(props, "property", {"name": "band", "value": score.band})
    for cat, comp in score.components.items():
        ET.SubElement(props, "property", {"name": f"component.{cat}", "value": f"{comp:.4f}"})

    # Non-PASS verdicts map onto the JUnit child element of the same
    # meaning, with a fallback message when the probe gave none.
    child_for = {
        Verdict.FAIL: ("failure", "failed"),
        Verdict.ERROR: ("error", "errored"),
        Verdict.SKIP: ("skipped", "skipped"),
    }
    for probe in suite.probes:
        case = ET.SubElement(
            root,
            "testcase",
            {"classname": probe.kind, "name": probe.name, "time": f"{probe.duration_s:.3f}"},
        )
        mapped = child_for.get(probe.verdict)
        if mapped is not None:
            tag, fallback = mapped
            ET.SubElement(case, tag, {"message": probe.message or fallback})

    return ET.tostring(root, encoding="unicode")
| 507 | |
| 508 | |
def to_markdown(suite: SuiteResult, score: SwayScore) -> str:
    """A portable, CI-friendly markdown report, returned as one string.

    The single source of the markdown emit (B16): both
    ``sway run --markdown`` and ``sway report --format md`` route
    through this function. No second ``_render_markdown_from_json``.

    Sections, in order: header metadata, component table, per-probe
    table (D9 parity with the terminal table), top findings, rollup
    sections (missing extras, null opt-outs, degenerate nulls), and a
    per-cluster ``cluster_kl`` breakdown when those probes carry
    evidence.
    """
    buf = StringIO()
    buf.write("# sway report\n\n")
    buf.write(f"**Overall:** {format_score(score.overall)} (`{score.band}`) \n")
    buf.write(f"**Base:** `{suite.base_model_id}` \n")
    buf.write(f"**Adapter:** `{_adapter_label(suite.adapter_id)}` \n")
    buf.write(f"**Wall:** {format_duration_s(suite.wall_seconds)} \n")
    if suite.determinism is not None:
        buf.write(
            f"**Determinism:** `{suite.determinism.class_}` (seed={suite.determinism.seed}) \n"
        )
    cache_line = _cache_line(suite)
    if cache_line is not None:
        buf.write(f"**Backend:** {cache_line} \n")
    buf.write("\n")

    buf.write("## Components\n\n")
    buf.write("| category | score | weight | |\n|---|---:|---:|---|\n")
    for cat in _category_order(score):
        if cat not in score.components:
            continue
        v = score.components[cat]
        weight = score.weights.get(cat, 0.0)
        # Mirrors the terminal's zero-weight labeling (S03 / B18).
        label = "(informational, weight=0)" if weight == 0.0 else ""
        buf.write(f"| {cat} | {format_score(v)} | {format_score(weight)} | {label} |\n")

    # D9: markdown must reach parity with the terminal table — raw,
    # z_score, duration_s all shown. Findings are appended as a section
    # below so CI log consumers can see them without opening the JSON.
    buf.write("\n## Probes\n\n")
    buf.write(
        "| name | kind | verdict | score | raw | ci95 | z | duration | note |\n"
        "|---|---|---|---:|---:|---:|---:|---:|---|\n"
    )
    for r in suite.probes:
        # Escape pipes in messages so markdown doesn't treat them as
        # column separators. Leading/trailing whitespace collapsed.
        note = _message_with_rank_profile(r).replace("|", "\\|").replace("\n", " ").strip()
        buf.write(
            f"| {r.name} | `{r.kind}` | {r.verdict.value} | "
            f"{format_score(r.score)} | {format_raw(r.raw)} | "
            f"{format_ci(r.ci_95)} | {format_z(r.z_score)} | "
            f"{format_duration_s(r.duration_s)} | {note} |\n"
        )

    if score.findings:
        buf.write("\n## Top findings\n\n")
        for f in score.findings:
            buf.write(f"- {f}\n")

    # D3: missing-extras rollup.
    extras = collect_missing_extras(suite)
    if extras:
        skipped_ct = sum(1 for p in suite.probes if p.verdict == Verdict.SKIP)
        buf.write("\n## Skipped probes\n\n")
        buf.write(f"{skipped_ct} probe(s) skipped due to missing extras. Install with:\n\n")
        buf.write(f"```\npip install 'dlm-sway[{','.join(extras)}]'\n```\n")

    # F15: null-calibration opt-outs rollup.
    opt_outs = collect_null_opt_outs(suite)
    if opt_outs:
        buf.write("\n## Null-calibration opt-outs\n\n")
        buf.write(
            f"{len(opt_outs)} probe(s) fall back to fixed thresholds because "
            f"their `calibrate_spec` returns `None`:\n\n"
        )
        for kind in opt_outs:
            buf.write(f"- `{kind}`\n")

    # F02 (Audit 03) — degenerate null-calibration rollup.
    degenerate = collect_degenerate_null_kinds(suite)
    if degenerate:
        buf.write("\n## Degenerate null calibration\n\n")
        buf.write(
            f"{len(degenerate)} probe kind(s) ran null_adapter but the "
            f"resulting baseline was too narrow for z-scoring "
            f"(std ≈ 0, typically `runs: 1` or coincidentally-matched "
            f"seeds). Fix: bump `runs:` in the `null_adapter` spec "
            f"entry. Affected kinds:\n\n"
        )
        for kind in degenerate:
            buf.write(f"- `{kind}`\n")

    # F07 — cluster_kl sub-line: expand the per-cluster breakdown so
    # the reader can answer "which topic moved?" without cracking open
    # the JSON. The row itself already carries ``k=N, spec=X.XX`` in
    # the message; this section adds the per-cluster mean KL + top
    # exemplars.
    ck_probes = [p for p in suite.probes if p.kind == "cluster_kl" and p.evidence]
    if ck_probes:
        buf.write("\n## Cluster breakdown (cluster_kl)\n\n")
        for p in ck_probes:
            per_cluster = p.evidence.get("per_cluster_mean_kl", [])
            sizes = p.evidence.get("per_cluster_size", [])
            exemplars = p.evidence.get("cluster_exemplars", [])
            buf.write(f"### `{p.name}`\n\n")
            buf.write("| cluster | size | mean KL | exemplars |\n")
            buf.write("|---:|---:|---:|---|\n")
            # ``strict=False``: the three evidence lists may differ in
            # length; zip stops at the shortest rather than raising.
            for i, (mean, size, ex) in enumerate(zip(per_cluster, sizes, exemplars, strict=False)):
                # NOTE(review): literal "—" here rather than _NONE_GLYPH —
                # consider routing through the shared sentinel.
                mean_str = "—" if not isinstance(mean, int | float) else f"{mean:.3f}"
                ex_str = "; ".join(e.replace("|", "\\|") for e in (ex or [])) or "—"
                buf.write(f"| {i} | {size} | {mean_str} | {ex_str} |\n")
            buf.write("\n")

    return buf.getvalue()
| 620 | |
| 621 | |
| 622 | # -- helpers ----------------------------------------------------------- |
| 623 | |
| 624 | |
def _category_order(score: SwayScore) -> list[str]:
    """Unified render order for component categories.

    Priority order:

    1. Keys of :data:`core.result.DEFAULT_COMPONENT_WEIGHTS` — the
       canonical category list every first-party probe slots into.
    2. Any category in ``score.components`` missing from the default
       weights — so a custom :class:`Probe` subclass declaring a
       brand-new category still renders (F16).

    Terminal and markdown renderers share this so future category
    additions flow through both surfaces without a second code path.
    """
    from dlm_sway.core.result import DEFAULT_COMPONENT_WEIGHTS

    canonical = list(DEFAULT_COMPONENT_WEIGHTS)
    extras = [cat for cat in score.components if cat not in DEFAULT_COMPONENT_WEIGHTS]
    return canonical + extras
| 644 | |
| 645 | |
| 646 | def _cache_line(suite: SuiteResult) -> str | None: |
| 647 | """Format the cache-hit-rate footer line, or ``None`` when no stats. |
| 648 | |
| 649 | S23 — suffixes a ``batches: N (avg=K)`` segment when the suite |
| 650 | fired any batched forward calls. Runs that only use single-prompt |
| 651 | scoring (older probes, opt-out probes) render the cache line |
| 652 | alone, preserving pre-S23 footer shape. |
| 653 | """ |
| 654 | stats = suite.backend_stats |
| 655 | if not stats: |
| 656 | return None |
| 657 | hits = int(stats.get("cache_hits", 0)) |
| 658 | misses = int(stats.get("cache_misses", 0)) |
| 659 | total = hits + misses |
| 660 | if total == 0: |
| 661 | return None |
| 662 | pct = 100.0 * hits / total |
| 663 | line = f"cache: {hits}/{total} = {pct:.0f}%" |
| 664 | batches = int(stats.get("batches_sent", 0)) |
| 665 | if batches > 0: |
| 666 | avg = float(stats.get("avg_batch_size", 0.0)) |
| 667 | line = f"{line} | batches: {batches} (avg={avg:.1f})" |
| 668 | return line |
| 669 | |
| 670 | |
| 671 | def _adapter_label(adapter_id: str) -> str: |
| 672 | """Truncate the adapter path for display; quote when whitespace is present. |
| 673 | |
| 674 | D14: a path containing spaces (``/Users/me/My Adapters/v1``) was |
| 675 | rendering ambiguously in the header. Quote it whenever any |
| 676 | whitespace appears so the trailing path is unmistakable. |
| 677 | """ |
| 678 | if not adapter_id: |
| 679 | return "(base only)" |
| 680 | parts = adapter_id.rstrip("/").split("/") |
| 681 | label = "/".join(parts[-3:]) if len(parts) > 3 else adapter_id |
| 682 | if any(ch.isspace() for ch in label): |
| 683 | # Use double quotes so the result drops cleanly into a CLI |
| 684 | # invocation if a user copy-pastes it. |
| 685 | return f'"{label}"' |
| 686 | return label |
| 687 | |
| 688 | |
| 689 | def _score_style(v: float) -> str: |
| 690 | if v >= 0.6: |
| 691 | return "bold green" |
| 692 | if v >= 0.3: |
| 693 | return "bold yellow" |
| 694 | return "bold red" |
| 695 | |
| 696 | |
| 697 | def _band_style(band: str) -> str: |
| 698 | return { |
| 699 | "noise": "red", |
| 700 | "partial": "yellow", |
| 701 | "healthy": "green", |
| 702 | "suspicious": "magenta", |
| 703 | }.get(band, "white") |
| 704 | |
| 705 | |
| 706 | def _bar(v: float, *, width: int = 10) -> str: |
| 707 | clamped = max(0.0, min(1.0, v)) |
| 708 | filled = int(round(clamped * width)) |
| 709 | return "█" * filled + "░" * (width - filled) |
| 710 | |
| 711 | |
# Public API. ``format_ci`` was previously missing even though it is a
# public formatter exactly like ``format_raw`` / ``format_z``; added so
# ``from ... import *`` and API docs pick it up. Kept sorted.
__all__ = [
    "collect_degenerate_null_kinds",
    "collect_missing_extras",
    "collect_null_opt_outs",
    "format_ci",
    "format_duration_s",
    "format_raw",
    "format_score",
    "format_z",
    "from_json",
    "to_json",
    "to_junit",
    "to_markdown",
    "to_terminal",
]