| 1 | """C11: snapshot tests for the three report formats. |
| 2 | |
| 3 | JSON is the machine-readable contract downstream tools depend on; |
| 4 | markdown is the CI-friendly human report; JUnit is the CI-dashboard |
| 5 | plumbing. Silent schema drift in any of them breaks consumers. |
| 6 | |
| 7 | We serialize a deterministic fixture suite + score through each |
| 8 | emitter and byte-compare against checked-in snapshots under |
| 9 | ``tests/snapshots/``. Intentional schema bumps update the snapshot in |
| 10 | the same commit (``SWAY_UPDATE_SNAPSHOTS=1 uv run pytest``); anything |
| 11 | else surfaces as a failed test. |
| 12 | """ |
| 13 | |
| 14 | from __future__ import annotations |
| 15 | |
| 16 | import json |
| 17 | import os |
| 18 | import re |
| 19 | from datetime import UTC, datetime |
| 20 | from pathlib import Path |
| 21 | |
| 22 | import pytest |
| 23 | |
| 24 | from dlm_sway.core.result import ( |
| 25 | DeterminismReport, |
| 26 | ProbeResult, |
| 27 | SuiteResult, |
| 28 | SwayScore, |
| 29 | Verdict, |
| 30 | ) |
| 31 | from dlm_sway.suite import report |
| 32 | |
# Checked-in snapshot files live in a ``snapshots`` directory located two
# levels above this module's file.
SNAPSHOT_DIR = Path(__file__).parent.parent.joinpath("snapshots")
| 34 | |
| 35 | |
def _fixture_suite_and_score() -> tuple[SuiteResult, SwayScore]:
    """Build the fixed ``(SuiteResult, SwayScore)`` pair the snapshot tests serialize.

    Every value below is a hard-coded literal (no clock reads, no random
    input), so the emitters receive the same input on every run. The suite
    carries four probes, one each with a PASS, FAIL, SKIP and ERROR verdict.

    Returns:
        The fixture suite result and its accompanying score.
    """
    run_start = datetime(2026, 1, 1, 12, 0, 0, tzinfo=UTC)
    # Same instant shifted to 12:00:02.5, i.e. a 2.5 s wall-clock run.
    run_end = run_start.replace(second=2, microsecond=500000)

    passing_probe = ProbeResult(
        name="dk",
        kind="delta_kl",
        verdict=Verdict.PASS,
        score=0.87,
        raw=0.456,
        z_score=5.12,
        evidence={"divergence_kind": "js", "num_prompts": 4, "weight": 1.0},
        message="mean js=0.4560, z=+5.12σ vs null",
        duration_s=0.123,
        ci_95=(0.412, 0.497),
    )
    failing_probe = ProbeResult(
        name="sis",
        kind="section_internalization",
        verdict=Verdict.FAIL,
        score=0.30,
        raw=0.012,
        z_score=0.5,
        evidence={"num_sections": 4, "passing_frac": 0.25, "weight": 1.0},
        message="1/4 sections cleared effective_sis≥0.05",
        duration_s=0.456,
    )
    # The SKIP and ERROR probes deliberately leave most optional fields unset.
    skipped_probe = ProbeResult(
        name="lk",
        kind="leakage",
        verdict=Verdict.SKIP,
        score=None,
        message="no PROSE sections to test for leakage",
        duration_s=0.001,
    )
    errored_probe = ProbeResult(
        name="ablation",
        kind="adapter_ablation",
        verdict=Verdict.ERROR,
        score=None,
        raw=None,
        message="backend does not implement ScalableDifferentialBackend",
        duration_s=0.0,
    )

    determinism_report = DeterminismReport(
        class_="best_effort",
        seed=0,
        notes=("CPU-only backend: strict determinism depends on BLAS impl",),
    )
    fixture_suite = SuiteResult(
        spec_path="/fixture/sway.yaml",
        started_at=run_start,
        finished_at=run_end,
        base_model_id="HuggingFaceTB/SmolLM2-135M-Instruct",
        adapter_id="/fixture/runs/adapter/v0003",
        sway_version="0.1.0.dev0",
        probes=(passing_probe, failing_probe, skipped_probe, errored_probe),
        null_stats={"delta_kl": {"mean": 0.01, "std": 0.005, "n": 3.0}},
        determinism=determinism_report,
    )

    # Key order in both dicts is kept exactly as in the checked-in snapshots.
    component_scores = {
        "adherence": 0.87,
        "attribution": 0.30,
        "calibration": 0.50,
        "ablation": 0.0,
        "baseline": 1.0,
    }
    component_weights = {
        "adherence": 0.30,
        "attribution": 0.35,
        "calibration": 0.20,
        "ablation": 0.15,
        "baseline": 0.0,
    }
    fixture_score = SwayScore(
        overall=0.65,
        components=component_scores,
        weights=component_weights,
        band="healthy",
        findings=(
            "sis (section_internalization) failed: 1/4 sections cleared effective_sis≥0.05",
            "ablation score is 0.00 — below the noise threshold",
        ),
    )
    return fixture_suite, fixture_score
| 120 | |
| 121 | |
def _compare_to_snapshot(actual: str, snapshot_name: str) -> None:
    """Compare ``actual`` against the stored snapshot, (re)writing it when asked.

    The snapshot file is written, and the calling test skipped, when
    ``SWAY_UPDATE_SNAPSHOTS`` is set to ``"1"`` or the file does not exist
    yet. Otherwise the file is read back and must equal ``actual`` exactly.

    Args:
        actual: Serialized report text produced by the emitter under test.
        snapshot_name: File name of the snapshot inside ``SNAPSHOT_DIR``.

    Raises:
        AssertionError: ``actual`` differs from the stored snapshot.
    """
    path = SNAPSHOT_DIR / snapshot_name
    if os.environ.get("SWAY_UPDATE_SNAPSHOTS") == "1" or not path.exists():
        path.parent.mkdir(parents=True, exist_ok=True)
        # newline="\n" turns off text-mode newline translation, so the bytes
        # written to disk are the same on every platform (no CRLF on Windows).
        path.write_text(actual, encoding="utf-8", newline="\n")
        # pytest.skip raises, so the comparison below never runs on this path.
        pytest.skip(
            f"snapshot {snapshot_name} written — re-run without SWAY_UPDATE_SNAPSHOTS to verify"
        )
    # read_text uses universal newlines, so a snapshot checked out with CRLF
    # line endings still compares equal to "\n"-terminated output.
    expected = path.read_text(encoding="utf-8")
    assert actual == expected, (
        f"{snapshot_name} drifted from snapshot.\n"
        f"To accept the new output intentionally, run:\n"
        f"  SWAY_UPDATE_SNAPSHOTS=1 uv run pytest tests/unit/test_report_snapshot.py\n"
        f"and commit the updated file.\n"
    )
| 138 | |
| 139 | |
def test_json_schema_snapshot() -> None:
    """Pin the JSON report: check key fields, then diff against ``report.json``."""
    payload = report.to_json(*_fixture_suite_and_score())

    # Sanity check before the snapshot diff: the output must parse as JSON
    # and expose the expected top-level fields.
    document = json.loads(payload)
    assert document["schema_version"] == 1
    determinism = document["determinism"]
    assert determinism is not None
    assert determinism["seed"] == 0

    # The snapshot is compared with a single trailing newline appended.
    _compare_to_snapshot(payload + "\n", "report.json")
| 149 | |
| 150 | |
def test_markdown_layout_snapshot() -> None:
    """Pin the markdown report layout against ``report.md``."""
    rendered = report.to_markdown(*_fixture_suite_and_score())
    _compare_to_snapshot(rendered, "report.md")
| 155 | |
| 156 | |
def test_junit_layout_snapshot() -> None:
    """Pin the JUnit XML layout against ``report.junit.xml``."""
    fixture_suite, fixture_score = _fixture_suite_and_score()
    xml_text = report.to_junit(fixture_suite, fixture_score)

    # Normalize to exactly one trailing newline so the diff does not hinge
    # on whether the emitter ends its output with whitespace.
    normalized = xml_text.strip() + "\n"

    # Mask only the first ``time`` attribute (count=1), which is intended to
    # be the wall-clock time on <testsuite>; the per-testcase times come
    # from the fixture and are left as they are.
    masked = re.sub(r' time="[\d.]+"', ' time="<wall>"', normalized, count=1)

    _compare_to_snapshot(masked, "report.junit.xml")