"""Report emitters: terminal (rich), JSON, JUnit XML, markdown. The terminal renderer is the one a user sees; it's the product surface. It must communicate the verdict *and* the supporting evidence without forcing the user to open the JSON. JSON is the machine-readable source of truth — same fields as the :class:`SuiteResult` dataclass but flattened for easy downstream parsing (dashboards, diff tools, history tracking). JUnit XML exists to drop into CI pipelines so ``sway gate`` integrates with existing test dashboards with no extra glue. """ from __future__ import annotations import json import math import re import xml.etree.ElementTree as ET from io import StringIO from typing import Any from rich.console import Console from rich.panel import Panel from rich.table import Table from rich.text import Text from dlm_sway.core.result import ProbeResult, SuiteResult, SwayScore, Verdict from dlm_sway.probes._zscore import format_z_profile _VERDICT_STYLE = { Verdict.PASS: "bold green", Verdict.FAIL: "bold red", Verdict.WARN: "bold yellow", Verdict.SKIP: "dim", Verdict.ERROR: "bold magenta", } #: Sentinel character all renderers use for "no numeric value available." #: Single source prevents drift between surfaces (terminal vs markdown #: vs JSON downstream consumers that copy the rendered strings). _NONE_GLYPH = "—" # -- unified number formatters (S06.10) -------------------------------- # # Every surface that prints a number routes through one of these. A # tests snapshot that locks report output catches any drift before it # ships. Typing is wide (``float | int | None``) so callers don't have # to special-case ``None`` at every site. def format_score(v: float | int | None) -> str: """Two-decimal score, ``—`` when missing or non-finite.""" if v is None or not math.isfinite(float(v)): return _NONE_GLYPH return f"{float(v):.2f}" def format_raw(v: float | int | None) -> str: """Three-decimal raw metric, ``—`` when missing or non-finite. Uses thousands separators at magnitude ≥ 1 000 so half-life outputs from ``prompt_collapse`` don't render as ``1945.473`` (hard to eyeball). """ if v is None or not math.isfinite(float(v)): return _NONE_GLYPH return f"{float(v):,.3f}" def format_z(v: float | int | None) -> str: """Signed z-score with ``σ`` suffix and thousands separator, ``—`` on None.""" if v is None or not math.isfinite(float(v)): return _NONE_GLYPH return f"{float(v):+,.2f}σ" def format_ci(ci: tuple[float, float] | None) -> str: """Percentile-bootstrap 95% CI as ``[lo, hi]``; ``—`` on None / non-finite.""" if ci is None: return _NONE_GLYPH lo, hi = ci if not (math.isfinite(float(lo)) and math.isfinite(float(hi))): return _NONE_GLYPH return f"[{float(lo):.3f}, {float(hi):.3f}]" def _message_with_rank_profile(r: ProbeResult) -> str: """Append the per-rank z-profile to a probe's message when present. Renders as ``" | rank profile: +4.2σ @ 1x / +6.8σ @ 0.5x"``. When the probe didn't run under multi-rank calibration (``z_by_rank`` is ``None`` or has a single rank), returns the message unchanged. """ base = r.message or "" z_by_rank = r.evidence.get("z_by_rank") if not z_by_rank or len(z_by_rank) < 2: return base profile = format_z_profile(z_by_rank) if not profile: return base return f"{base} | rank profile: {profile}" if base else f"rank profile: {profile}" def format_duration_s(v: float | int | None) -> str: """Wall-time display. ``1.23s`` for sub-second, ``12.3s`` above 10, ``—`` on None.""" if v is None or not math.isfinite(float(v)): return _NONE_GLYPH f = float(v) if f < 10.0: return f"{f:.2f}s" if f < 100.0: return f"{f:.1f}s" return f"{f:,.0f}s" # -- extras-rollup helpers (S06.6) ------------------------------------- _MISSING_EXTRA_RE = re.compile(r"install the \[([^\]]+)\] extra", re.IGNORECASE) def collect_missing_extras(suite: SuiteResult) -> list[str]: """Parse SKIP messages for ``install the [X] extra`` hints. Returns a deduplicated, sorted list of extra names that would unskip probes. ``BackendNotAvailableError`` formats messages with ``install the [] extra`` so we can lift them out without wiring a new field through. """ found: set[str] = set() for p in suite.probes: if p.verdict != Verdict.SKIP or not p.message: continue for match in _MISSING_EXTRA_RE.finditer(p.message): found.add(match.group(1)) return sorted(found) def collect_degenerate_null_kinds(suite: SuiteResult) -> list[str]: """Probe kinds whose null-calibration stats were flagged degenerate. ``null_adapter`` marks a kind's stats with ``degenerate: 1.0`` when the calibration ran but the baseline was too narrow for the z-score path to fire (``runs: 1``, or a multi-seed run whose raws collapsed to an effectively-zero variance — F02 from Audit 03). Unlike :func:`collect_null_opt_outs` (which surfaces probes that opted out at spec-build time), this surface catches the case where the null *did* run but wasn't useful. Both cases fall back to fixed thresholds; the report distinguishes them so users can act: ``opt_out`` → expected for probes like ``adapter_revert``; ``degenerate`` → bump ``runs:`` in the spec. """ found: set[str] = set() for probe in suite.probes: if probe.kind != "null_adapter": continue # ``null_adapter`` writes per-kind stats into # ``SuiteResult.null_stats``, not the probe's evidence — the # suite-level field is the canonical place the runner threads # calibration across probes. stats_by_kind = suite.null_stats or {} for kind, kind_stats in stats_by_kind.items(): if not isinstance(kind_stats, dict): continue if kind_stats.get("degenerate", 0.0) >= 0.5: found.add(kind) return sorted(found) def collect_null_opt_outs(suite: SuiteResult) -> list[str]: """Probe kinds that opted out of null calibration. ``null_adapter`` publishes ``evidence["skipped_kinds"]`` with the probe kinds whose ``calibrate_spec`` returned ``None`` (e.g. ``adapter_revert`` — no embedder on the null proxy; ``prompt_collapse`` — noise can't fit an exponential decay). Returns a deduplicated, sorted list of those kinds, or an empty list when no null_adapter ran in the suite. """ found: set[str] = set() for p in suite.probes: if p.kind != "null_adapter": continue skipped = p.evidence.get("skipped_kinds") if not skipped: continue for kind in skipped: if isinstance(kind, str): found.add(kind) return sorted(found) def to_terminal(suite: SuiteResult, score: SwayScore, *, console: Console | None = None) -> None: """Render the report to a rich Console (stdout by default).""" c = console or Console() header = Text.assemble( ("sway report — ", "bold"), (suite.base_model_id, "cyan"), (" vs ", "dim"), (_adapter_label(suite.adapter_id), "cyan"), ) c.print(Panel(header, expand=False, border_style="blue")) c.print() c.print( Text.assemble( ("overall: ", "bold"), (format_score(score.overall), _score_style(score.overall)), (" ", ""), (f"[ {score.band} ]", _band_style(score.band)), ) ) # Component breakdown. Order matches ``DEFAULT_COMPONENT_WEIGHTS`` # (the extensibility point) and appends any categories present in # ``score.components`` but not in the default weights — so a custom # Probe subclass with a new category still renders. comp_table = Table.grid(padding=(0, 2)) comp_table.add_column(justify="left") comp_table.add_column(justify="right") comp_table.add_column() comp_table.add_column(style="dim") for cat in _category_order(score): if cat not in score.components: continue v = score.components[cat] weight = score.weights.get(cat, 0.0) # S03 / B18: a zero-weight category contributes nothing to the # composite; label explicitly so users don't mistake the visible # bar for judgment. label = "(informational, weight=0)" if weight == 0.0 else "" comp_table.add_row(cat, format_score(v), _bar(v), label) c.print(comp_table) c.print() # Per-probe detail. detail = Table(show_header=True, header_style="bold", box=None, padding=(0, 1)) detail.add_column("name", style="cyan") detail.add_column("kind", style="dim") detail.add_column("verdict") detail.add_column("score", justify="right") detail.add_column("raw", justify="right") detail.add_column("ci95", justify="right", style="dim") detail.add_column("z", justify="right") # D15: let Rich wrap long messages instead of hard-truncating at 80 # chars with an ellipsis. ``overflow="fold"`` + ``no_wrap=False`` # preserves the full text across multiple terminal lines. detail.add_column("note", style="dim", overflow="fold", no_wrap=False) for r in suite.probes: detail.add_row( r.name, r.kind, Text(r.verdict.value, style=_VERDICT_STYLE[r.verdict]), format_score(r.score), format_raw(r.raw), format_ci(r.ci_95), format_z(r.z_score), Text(_message_with_rank_profile(r)), ) c.print(detail) if score.findings: c.print() c.print(Text("top findings:", style="bold")) for i, f in enumerate(score.findings, start=1): c.print(f" {i}. {f}") # D3: missing-extras rollup. When probes SKIPped because their # backend extras aren't installed, collapse the hints into one # actionable footer rather than forcing the user to scan per-row. extras = collect_missing_extras(suite) if extras: c.print() skipped_ct = sum(1 for p in suite.probes if p.verdict == Verdict.SKIP) c.print( Text( f"{skipped_ct} probe(s) skipped due to missing extras: " f"pip install 'dlm-sway[{','.join(extras)}]'", style="dim", ) ) # F15: null-calibration opt-outs rollup. Probes whose # ``calibrate_spec`` returns ``None`` fall back to fixed-threshold # verdicts. Surface the list in the footer so users understand # why those rows read ``(no calibration)`` in the message column. opt_outs = collect_null_opt_outs(suite) if opt_outs: c.print() c.print( Text( f"{len(opt_outs)} probe(s) opted out of null calibration " f"(using fixed thresholds): {', '.join(opt_outs)}", style="dim", ) ) # F02 (Audit 03): null-calibration-degenerate rollup. Distinct from # opt-outs — the null *did* run, but its baseline was too narrow # (``runs: 1`` or coincidentally-identical seeds). Users see this # and bump ``runs:`` in the spec; the fix is actionable. degenerate = collect_degenerate_null_kinds(suite) if degenerate: c.print() c.print( Text( f"{len(degenerate)} probe kind(s) had a degenerate null " f"baseline (std ≈ 0, insufficient for z-scoring): " f"{', '.join(degenerate)} — bump ``runs:`` in null_adapter spec.", style="dim", ) ) c.print() footer_parts = [f"wall: {format_duration_s(suite.wall_seconds)}", f"sway {suite.sway_version}"] if suite.determinism is not None: footer_parts.append(f"det: {suite.determinism.class_} (seed={suite.determinism.seed})") cache_line = _cache_line(suite) if cache_line is not None: footer_parts.append(cache_line) c.print(Text(" | ".join(footer_parts), style="dim")) def to_json(suite: SuiteResult, score: SwayScore) -> str: """Serialize the suite + composite score as JSON. Stable schema; downstream tools rely on it. Breaking changes bump a ``schema_version`` field (not yet present — this is v0.1). """ return json.dumps(_to_jsonable(suite, score), indent=2, sort_keys=True) def _to_jsonable(suite: SuiteResult, score: SwayScore) -> dict[str, Any]: determinism: dict[str, Any] | None = None if suite.determinism is not None: determinism = { "class": suite.determinism.class_, "seed": suite.determinism.seed, "notes": list(suite.determinism.notes), } return { "schema_version": 1, "sway_version": suite.sway_version, "spec_path": suite.spec_path, "base_model_id": suite.base_model_id, "adapter_id": suite.adapter_id, "started_at": suite.started_at.isoformat(), "finished_at": suite.finished_at.isoformat(), "wall_seconds": suite.wall_seconds, "determinism": determinism, "backend_stats": dict(suite.backend_stats) if suite.backend_stats else {}, "score": { "overall": score.overall, "band": score.band, "components": score.components, "weights": score.weights, "findings": list(score.findings), }, "null_stats": suite.null_stats, "probes": [_probe_to_jsonable(p) for p in suite.probes], } def _probe_to_jsonable(r: ProbeResult) -> dict[str, Any]: return { "name": r.name, "kind": r.kind, "verdict": r.verdict.value, "score": r.score, "raw": r.raw, "z_score": r.z_score, "base_value": r.base_value, "ft_value": r.ft_value, "evidence": r.evidence, "message": r.message, "duration_s": r.duration_s, # S14: bootstrap 95% CI on ``raw``. Serialized as a two-list # [lo, hi] so JSON stays tuple-free (match numpy convention). "ci_95": list(r.ci_95) if r.ci_95 is not None else None, } def from_json(raw: dict[str, Any]) -> tuple[SuiteResult, SwayScore]: """Reconstruct a ``(SuiteResult, SwayScore)`` pair from saved JSON. Inverse of :func:`to_json` for the fields the renderers consume. Missing fields are tolerated — older snapshots predate ``determinism`` and ``schema_version`` — so this helper stays backward-compatible by default. ``sway report --format X`` uses this so all four formats (terminal / md / junit / json) flow through the same renderers as a fresh ``sway run`` (B16). """ from datetime import datetime from dlm_sway.core.result import ( DEFAULT_COMPONENT_WEIGHTS, DeterminismReport, ProbeResult, SuiteResult, SwayScore, Verdict, ) def _ts(s: str | None) -> datetime: if s: return datetime.fromisoformat(s) # Snapshots that predate the field — give the renderer a # well-defined zero so wall-time displays as 0.00s. return datetime.fromtimestamp(0).astimezone() def _ci_95(v: Any) -> tuple[float, float] | None: if v is None: return None try: lo, hi = v return (float(lo), float(hi)) except (TypeError, ValueError): return None probes = tuple( ProbeResult( name=p["name"], kind=p["kind"], verdict=Verdict(p["verdict"]), score=p.get("score"), raw=p.get("raw"), z_score=p.get("z_score"), base_value=p.get("base_value"), ft_value=p.get("ft_value"), evidence=dict(p.get("evidence") or {}), message=p.get("message", ""), duration_s=float(p.get("duration_s", 0.0)), ci_95=_ci_95(p.get("ci_95")), ) for p in raw.get("probes", []) ) determinism: DeterminismReport | None = None det_raw = raw.get("determinism") if isinstance(det_raw, dict): determinism = DeterminismReport( class_=det_raw.get("class", "best_effort"), seed=int(det_raw.get("seed", 0)), notes=tuple(det_raw.get("notes") or ()), ) suite = SuiteResult( spec_path=raw.get("spec_path", ""), started_at=_ts(raw.get("started_at")), finished_at=_ts(raw.get("finished_at")), base_model_id=raw.get("base_model_id", ""), adapter_id=raw.get("adapter_id", ""), sway_version=raw.get("sway_version", "?"), probes=probes, null_stats=dict(raw.get("null_stats") or {}), determinism=determinism, backend_stats=dict(raw.get("backend_stats") or {}), ) score_raw: dict[str, Any] = raw.get("score") or {} score = SwayScore( overall=float(score_raw.get("overall", 0.0)), components=dict(score_raw.get("components") or {}), weights=dict(score_raw.get("weights") or DEFAULT_COMPONENT_WEIGHTS), band=score_raw.get("band", ""), findings=tuple(score_raw.get("findings") or ()), ) return suite, score def to_junit(suite: SuiteResult, score: SwayScore) -> str: """Serialize as JUnit XML. One ```` per probe.""" testsuite = ET.Element( "testsuite", { "name": "sway", "tests": str(len(suite.probes)), "failures": str(sum(1 for p in suite.probes if p.verdict == Verdict.FAIL)), "errors": str(sum(1 for p in suite.probes if p.verdict == Verdict.ERROR)), "skipped": str(sum(1 for p in suite.probes if p.verdict == Verdict.SKIP)), "time": f"{suite.wall_seconds:.3f}", }, ) # Properties — the composite score and category breakdown. props = ET.SubElement(testsuite, "properties") ET.SubElement(props, "property", {"name": "overall", "value": f"{score.overall:.4f}"}) ET.SubElement(props, "property", {"name": "band", "value": score.band}) for cat, v in score.components.items(): ET.SubElement(props, "property", {"name": f"component.{cat}", "value": f"{v:.4f}"}) for r in suite.probes: tc = ET.SubElement( testsuite, "testcase", {"classname": r.kind, "name": r.name, "time": f"{r.duration_s:.3f}"}, ) if r.verdict == Verdict.FAIL: ET.SubElement(tc, "failure", {"message": r.message or "failed"}) elif r.verdict == Verdict.ERROR: ET.SubElement(tc, "error", {"message": r.message or "errored"}) elif r.verdict == Verdict.SKIP: ET.SubElement(tc, "skipped", {"message": r.message or "skipped"}) return ET.tostring(testsuite, encoding="unicode") def to_markdown(suite: SuiteResult, score: SwayScore) -> str: """A portable, CI-friendly markdown report. The single source of the markdown emit (B16): both ``sway run --markdown`` and ``sway report --format md`` route through this function. No second ``_render_markdown_from_json``. """ buf = StringIO() buf.write("# sway report\n\n") buf.write(f"**Overall:** {format_score(score.overall)} (`{score.band}`) \n") buf.write(f"**Base:** `{suite.base_model_id}` \n") buf.write(f"**Adapter:** `{_adapter_label(suite.adapter_id)}` \n") buf.write(f"**Wall:** {format_duration_s(suite.wall_seconds)} \n") if suite.determinism is not None: buf.write( f"**Determinism:** `{suite.determinism.class_}` (seed={suite.determinism.seed}) \n" ) cache_line = _cache_line(suite) if cache_line is not None: buf.write(f"**Backend:** {cache_line} \n") buf.write("\n") buf.write("## Components\n\n") buf.write("| category | score | weight | |\n|---|---:|---:|---|\n") for cat in _category_order(score): if cat not in score.components: continue v = score.components[cat] weight = score.weights.get(cat, 0.0) label = "(informational, weight=0)" if weight == 0.0 else "" buf.write(f"| {cat} | {format_score(v)} | {format_score(weight)} | {label} |\n") # D9: markdown must reach parity with the terminal table — raw, # z_score, duration_s all shown. Findings are appended as a section # below so CI log consumers can see them without opening the JSON. buf.write("\n## Probes\n\n") buf.write( "| name | kind | verdict | score | raw | ci95 | z | duration | note |\n" "|---|---|---|---:|---:|---:|---:|---:|---|\n" ) for r in suite.probes: # Escape pipes in messages so markdown doesn't treat them as # column separators. Leading/trailing whitespace collapsed. note = _message_with_rank_profile(r).replace("|", "\\|").replace("\n", " ").strip() buf.write( f"| {r.name} | `{r.kind}` | {r.verdict.value} | " f"{format_score(r.score)} | {format_raw(r.raw)} | " f"{format_ci(r.ci_95)} | {format_z(r.z_score)} | " f"{format_duration_s(r.duration_s)} | {note} |\n" ) if score.findings: buf.write("\n## Top findings\n\n") for f in score.findings: buf.write(f"- {f}\n") # D3: missing-extras rollup. extras = collect_missing_extras(suite) if extras: skipped_ct = sum(1 for p in suite.probes if p.verdict == Verdict.SKIP) buf.write("\n## Skipped probes\n\n") buf.write(f"{skipped_ct} probe(s) skipped due to missing extras. Install with:\n\n") buf.write(f"```\npip install 'dlm-sway[{','.join(extras)}]'\n```\n") # F15: null-calibration opt-outs rollup. opt_outs = collect_null_opt_outs(suite) if opt_outs: buf.write("\n## Null-calibration opt-outs\n\n") buf.write( f"{len(opt_outs)} probe(s) fall back to fixed thresholds because " f"their `calibrate_spec` returns `None`:\n\n" ) for kind in opt_outs: buf.write(f"- `{kind}`\n") # F02 (Audit 03) — degenerate null-calibration rollup. degenerate = collect_degenerate_null_kinds(suite) if degenerate: buf.write("\n## Degenerate null calibration\n\n") buf.write( f"{len(degenerate)} probe kind(s) ran null_adapter but the " f"resulting baseline was too narrow for z-scoring " f"(std ≈ 0, typically `runs: 1` or coincidentally-matched " f"seeds). Fix: bump `runs:` in the `null_adapter` spec " f"entry. Affected kinds:\n\n" ) for kind in degenerate: buf.write(f"- `{kind}`\n") # F07 — cluster_kl sub-line: expand the per-cluster breakdown so # the reader can answer "which topic moved?" without cracking open # the JSON. The row itself already carries ``k=N, spec=X.XX`` in # the message; this section adds the per-cluster mean KL + top # exemplars. ck_probes = [p for p in suite.probes if p.kind == "cluster_kl" and p.evidence] if ck_probes: buf.write("\n## Cluster breakdown (cluster_kl)\n\n") for p in ck_probes: per_cluster = p.evidence.get("per_cluster_mean_kl", []) sizes = p.evidence.get("per_cluster_size", []) exemplars = p.evidence.get("cluster_exemplars", []) buf.write(f"### `{p.name}`\n\n") buf.write("| cluster | size | mean KL | exemplars |\n") buf.write("|---:|---:|---:|---|\n") for i, (mean, size, ex) in enumerate(zip(per_cluster, sizes, exemplars, strict=False)): mean_str = "—" if not isinstance(mean, int | float) else f"{mean:.3f}" ex_str = "; ".join(e.replace("|", "\\|") for e in (ex or [])) or "—" buf.write(f"| {i} | {size} | {mean_str} | {ex_str} |\n") buf.write("\n") return buf.getvalue() # -- helpers ----------------------------------------------------------- def _category_order(score: SwayScore) -> list[str]: """Unified render order for component categories. Falls back through two sources, in priority order: 1. Keys of :data:`core.result.DEFAULT_COMPONENT_WEIGHTS` — the canonical category list every first-party probe slots into. 2. Any category present in ``score.components`` that isn't in the default weights — so a custom :class:`Probe` subclass declaring a brand-new category still renders (F16). Keeps the renderer loop in terminal + markdown identical so future additions flow through both surfaces without a second code path. """ from dlm_sway.core.result import DEFAULT_COMPONENT_WEIGHTS order: list[str] = list(DEFAULT_COMPONENT_WEIGHTS.keys()) order.extend(cat for cat in score.components if cat not in DEFAULT_COMPONENT_WEIGHTS) return order def _cache_line(suite: SuiteResult) -> str | None: """Format the cache-hit-rate footer line, or ``None`` when no stats. S23 — suffixes a ``batches: N (avg=K)`` segment when the suite fired any batched forward calls. Runs that only use single-prompt scoring (older probes, opt-out probes) render the cache line alone, preserving pre-S23 footer shape. """ stats = suite.backend_stats if not stats: return None hits = int(stats.get("cache_hits", 0)) misses = int(stats.get("cache_misses", 0)) total = hits + misses if total == 0: return None pct = 100.0 * hits / total line = f"cache: {hits}/{total} = {pct:.0f}%" batches = int(stats.get("batches_sent", 0)) if batches > 0: avg = float(stats.get("avg_batch_size", 0.0)) line = f"{line} | batches: {batches} (avg={avg:.1f})" return line def _adapter_label(adapter_id: str) -> str: """Truncate the adapter path for display; quote when whitespace is present. D14: a path containing spaces (``/Users/me/My Adapters/v1``) was rendering ambiguously in the header. Quote it whenever any whitespace appears so the trailing path is unmistakable. """ if not adapter_id: return "(base only)" parts = adapter_id.rstrip("/").split("/") label = "/".join(parts[-3:]) if len(parts) > 3 else adapter_id if any(ch.isspace() for ch in label): # Use double quotes so the result drops cleanly into a CLI # invocation if a user copy-pastes it. return f'"{label}"' return label def _score_style(v: float) -> str: if v >= 0.6: return "bold green" if v >= 0.3: return "bold yellow" return "bold red" def _band_style(band: str) -> str: return { "noise": "red", "partial": "yellow", "healthy": "green", "suspicious": "magenta", }.get(band, "white") def _bar(v: float, *, width: int = 10) -> str: clamped = max(0.0, min(1.0, v)) filled = int(round(clamped * width)) return "█" * filled + "░" * (width - filled) __all__ = [ "collect_degenerate_null_kinds", "collect_missing_extras", "collect_null_opt_outs", "format_duration_s", "format_raw", "format_score", "format_z", "from_json", "to_json", "to_junit", "to_markdown", "to_terminal", ]