Python · 5796 bytes Raw Blame History
1 """C11: snapshot tests for the three report formats.
2
3 JSON is the machine-readable contract downstream tools depend on;
4 markdown is the CI-friendly human report; JUnit is the CI-dashboard
5 plumbing. Silent schema drift in any of them breaks consumers.
6
7 We serialize a deterministic fixture suite + score through each
8 emitter and byte-compare against checked-in snapshots under
9 ``tests/snapshots/``. Intentional schema bumps update the snapshot in
10 the same commit (``SWAY_UPDATE_SNAPSHOTS=1 uv run pytest``); anything
11 else surfaces as a failed test.
12 """
13
14 from __future__ import annotations
15
16 import json
17 import os
18 import re
19 from datetime import UTC, datetime
20 from pathlib import Path
21
22 import pytest
23
24 from dlm_sway.core.result import (
25 DeterminismReport,
26 ProbeResult,
27 SuiteResult,
28 SwayScore,
29 Verdict,
30 )
31 from dlm_sway.suite import report
32
33 SNAPSHOT_DIR = Path(__file__).parent.parent / "snapshots"
34
35
def _fixture_suite_and_score() -> tuple[SuiteResult, SwayScore]:
    """Build the fixed ``SuiteResult`` / ``SwayScore`` pair fed to each emitter.

    Every field is a literal: nothing is read from the clock, the
    environment, or a random source. The suite carries one probe per
    verdict (PASS, FAIL, SKIP, ERROR).

    Returns:
        The ``(suite, score)`` tuple.
    """
    # One probe per verdict, declared in the order they appear in the suite.
    passing_probe = ProbeResult(
        name="dk",
        kind="delta_kl",
        verdict=Verdict.PASS,
        score=0.87,
        raw=0.456,
        z_score=5.12,
        evidence={"divergence_kind": "js", "num_prompts": 4, "weight": 1.0},
        message="mean js=0.4560, z=+5.12σ vs null",
        duration_s=0.123,
        ci_95=(0.412, 0.497),
    )
    failing_probe = ProbeResult(
        name="sis",
        kind="section_internalization",
        verdict=Verdict.FAIL,
        score=0.30,
        raw=0.012,
        z_score=0.5,
        evidence={"num_sections": 4, "passing_frac": 0.25, "weight": 1.0},
        message="1/4 sections cleared effective_sis≥0.05",
        duration_s=0.456,
    )
    skipped_probe = ProbeResult(
        name="lk",
        kind="leakage",
        verdict=Verdict.SKIP,
        score=None,
        message="no PROSE sections to test for leakage",
        duration_s=0.001,
    )
    errored_probe = ProbeResult(
        name="ablation",
        kind="adapter_ablation",
        verdict=Verdict.ERROR,
        score=None,
        raw=None,
        message="backend does not implement ScalableDifferentialBackend",
        duration_s=0.0,
    )

    determinism_report = DeterminismReport(
        class_="best_effort",
        seed=0,
        notes=("CPU-only backend: strict determinism depends on BLAS impl",),
    )
    fixture_suite = SuiteResult(
        spec_path="/fixture/sway.yaml",
        # Start and finish are 2.5 s apart.
        started_at=datetime(2026, 1, 1, 12, 0, 0, tzinfo=UTC),
        finished_at=datetime(2026, 1, 1, 12, 0, 2, 500000, tzinfo=UTC),
        base_model_id="HuggingFaceTB/SmolLM2-135M-Instruct",
        adapter_id="/fixture/runs/adapter/v0003",
        sway_version="0.1.0.dev0",
        probes=(passing_probe, failing_probe, skipped_probe, errored_probe),
        null_stats={"delta_kl": {"mean": 0.01, "std": 0.005, "n": 3.0}},
        determinism=determinism_report,
    )

    component_scores = {
        "adherence": 0.87,
        "attribution": 0.30,
        "calibration": 0.50,
        "ablation": 0.0,
        "baseline": 1.0,
    }
    component_weights = {
        "adherence": 0.30,
        "attribution": 0.35,
        "calibration": 0.20,
        "ablation": 0.15,
        "baseline": 0.0,
    }
    fixture_score = SwayScore(
        overall=0.65,
        components=component_scores,
        weights=component_weights,
        band="healthy",
        findings=(
            "sis (section_internalization) failed: 1/4 sections cleared effective_sis≥0.05",
            "ablation score is 0.00 — below the noise threshold",
        ),
    )
    return fixture_suite, fixture_score
120
121
def _compare_to_snapshot(actual: str, snapshot_name: str) -> None:
    """Check ``actual`` against the stored snapshot ``SNAPSHOT_DIR / snapshot_name``.

    If ``SWAY_UPDATE_SNAPSHOTS`` is exactly ``"1"``, or the snapshot file
    does not exist yet, ``actual`` is written to disk (creating parent
    directories as needed) and the calling test is skipped rather than
    compared.

    Args:
        actual: Freshly rendered report text.
        snapshot_name: Snapshot file name, relative to ``SNAPSHOT_DIR``.

    Raises:
        AssertionError: If the stored snapshot text differs from ``actual``.
    """
    snapshot_file = SNAPSHOT_DIR / snapshot_name
    update_requested = os.environ.get("SWAY_UPDATE_SNAPSHOTS") == "1"

    # Normal path: no update requested and a snapshot is already on disk.
    if not update_requested and snapshot_file.exists():
        expected = snapshot_file.read_text(encoding="utf-8")
        assert actual == expected, (
            f"{snapshot_name} drifted from snapshot.\n"
            "To accept the new output intentionally, run:\n"
            " SWAY_UPDATE_SNAPSHOTS=1 uv run pytest tests/unit/test_report_snapshot.py\n"
            "and commit the updated file.\n"
        )
        return

    # Update/bootstrap path: persist the new text, then skip so a freshly
    # written snapshot is never reported as a pass.
    snapshot_file.parent.mkdir(parents=True, exist_ok=True)
    snapshot_file.write_text(actual, encoding="utf-8")
    pytest.skip(
        f"snapshot {snapshot_name} written — re-run without SWAY_UPDATE_SNAPSHOTS to verify"
    )
138
139
def test_json_schema_snapshot() -> None:
    """Snapshot the JSON report after spot-checking its top-level fields."""
    rendered = report.to_json(*_fixture_suite_and_score())

    # Parse first so malformed JSON or a missing top-level field is reported
    # directly instead of as an opaque snapshot mismatch.
    document = json.loads(rendered)
    assert document["schema_version"] == 1
    determinism = document["determinism"]
    assert determinism is not None
    assert determinism["seed"] == 0

    # The stored snapshot ends with a newline; the emitter output is
    # compared with one appended.
    _compare_to_snapshot(rendered + "\n", "report.json")
149
150
def test_markdown_layout_snapshot() -> None:
    """Snapshot the markdown report rendered from the shared fixture."""
    _compare_to_snapshot(
        report.to_markdown(*_fixture_suite_and_score()),
        "report.md",
    )
155
156
def test_junit_layout_snapshot() -> None:
    """Snapshot the JUnit XML report with its first ``time`` attribute masked."""
    fixture_suite, fixture_score = _fixture_suite_and_score()
    xml_text = report.to_junit(fixture_suite, fixture_score)

    # The emitter output has no trailing newline (ElementTree ``tostring``
    # omits it), so trim surrounding whitespace and end with exactly one
    # "\n" to keep diffs independent of platform whitespace.
    normalized = f"{xml_text.strip()}\n"

    # Mask only the first ``time="..."`` attribute — the wall-clock value on
    # <testsuite>. The per-testcase times are deterministic and stay in the
    # snapshot unmasked.
    wall_time_attr = re.compile(r' time="[\d.]+"')
    masked = wall_time_attr.sub(' time="<wall>"', normalized, count=1)

    _compare_to_snapshot(masked, "report.junit.xml")
167 _compare_to_snapshot(actual, "report.junit.xml")