sway Public

Watch 0 Fork 0 Star 0

Python · 5796 bytes Raw Blame History

  
        1
        """C11: snapshot tests for the three report formats.
      
        2
        
        3
        JSON is the machine-readable contract downstream tools depend on;
      
        4
        markdown is the CI-friendly human report; JUnit is the CI-dashboard
      
        5
        plumbing. Silent schema drift in any of them breaks consumers.
      
        6
        
        7
        We serialize a deterministic fixture suite + score through each
      
        8
        emitter and byte-compare against checked-in snapshots under
      
        9
        ``tests/snapshots/``. Intentional schema bumps update the snapshot in
      
        10
        the same commit (``SWAY_UPDATE_SNAPSHOTS=1 uv run pytest``); anything
      
        11
        else surfaces as a failed test.
      
        12
        """
      
        13
        
        14
        from __future__ import annotations
      
        15
        
        16
        import json
      
        17
        import os
      
        18
        import re
      
        19
        from datetime import UTC, datetime
      
        20
        from pathlib import Path
      
        21
        
        22
        import pytest
      
        23
        
        24
        from dlm_sway.core.result import (
      
        25
            DeterminismReport,
      
        26
            ProbeResult,
      
        27
            SuiteResult,
      
        28
            SwayScore,
      
        29
            Verdict,
      
        30
        )
      
        31
        from dlm_sway.suite import report
      
        32
        
        33
        # Checked-in snapshot files live in a ``snapshots`` directory that is a
        # sibling of the directory containing this test module.
        SNAPSHOT_DIR = Path(__file__).parent.parent / "snapshots"
      
        34
        
        35
        
        36
        def _fixture_suite_and_score() -> tuple[SuiteResult, SwayScore]:
            """Build the fixed ``(SuiteResult, SwayScore)`` pair fed to every emitter.

            Every value is a hard-coded literal (timestamps, durations, scores),
            so the rendered reports are reproducible. The four probes carry one
            verdict each: PASS, FAIL, SKIP and ERROR.
            """
            t_start = datetime(2026, 1, 1, 12, 0, 0, tzinfo=UTC)
            # Same instant moved to 12:00:02.5, i.e. a 2.5s wall-clock run.
            t_end = t_start.replace(second=2, microsecond=500000)

            passing = ProbeResult(
                name="dk",
                kind="delta_kl",
                verdict=Verdict.PASS,
                score=0.87,
                raw=0.456,
                z_score=5.12,
                evidence={"divergence_kind": "js", "num_prompts": 4, "weight": 1.0},
                message="mean js=0.4560, z=+5.12σ vs null",
                duration_s=0.123,
                ci_95=(0.412, 0.497),
            )
            failing = ProbeResult(
                name="sis",
                kind="section_internalization",
                verdict=Verdict.FAIL,
                score=0.30,
                raw=0.012,
                z_score=0.5,
                evidence={"num_sections": 4, "passing_frac": 0.25, "weight": 1.0},
                message="1/4 sections cleared effective_sis≥0.05",
                duration_s=0.456,
            )
            skipped = ProbeResult(
                name="lk",
                kind="leakage",
                verdict=Verdict.SKIP,
                score=None,
                message="no PROSE sections to test for leakage",
                duration_s=0.001,
            )
            errored = ProbeResult(
                name="ablation",
                kind="adapter_ablation",
                verdict=Verdict.ERROR,
                score=None,
                raw=None,
                message="backend does not implement ScalableDifferentialBackend",
                duration_s=0.0,
            )

            run_determinism = DeterminismReport(
                class_="best_effort",
                seed=0,
                notes=("CPU-only backend: strict determinism depends on BLAS impl",),
            )
            suite_result = SuiteResult(
                spec_path="/fixture/sway.yaml",
                started_at=t_start,
                finished_at=t_end,
                base_model_id="HuggingFaceTB/SmolLM2-135M-Instruct",
                adapter_id="/fixture/runs/adapter/v0003",
                sway_version="0.1.0.dev0",
                probes=(passing, failing, skipped, errored),
                null_stats={"delta_kl": {"mean": 0.01, "std": 0.005, "n": 3.0}},
                determinism=run_determinism,
            )

            # Key order is kept as-is: it may show up in the serialized reports.
            component_scores = {
                "adherence": 0.87,
                "attribution": 0.30,
                "calibration": 0.50,
                "ablation": 0.0,
                "baseline": 1.0,
            }
            component_weights = {
                "adherence": 0.30,
                "attribution": 0.35,
                "calibration": 0.20,
                "ablation": 0.15,
                "baseline": 0.0,
            }
            sway_score = SwayScore(
                overall=0.65,
                components=component_scores,
                weights=component_weights,
                band="healthy",
                findings=(
                    "sis (section_internalization) failed: 1/4 sections cleared effective_sis≥0.05",
                    "ablation score is 0.00 — below the noise threshold",
                ),
            )
            return suite_result, sway_score
      
        120
        
        121
        
        122
        def _compare_to_snapshot(actual: str, snapshot_name: str) -> None:
            """Check ``actual`` against the file ``SNAPSHOT_DIR / snapshot_name``.

            If ``SWAY_UPDATE_SNAPSHOTS`` is ``"1"`` or the snapshot file does not
            exist yet, the file is (re)written from ``actual`` and the calling
            test is skipped rather than compared. Otherwise the file's text must
            equal ``actual`` exactly; a mismatch fails with update instructions.
            """
            snapshot_file = SNAPSHOT_DIR / snapshot_name
            update_requested = os.environ.get("SWAY_UPDATE_SNAPSHOTS") == "1"

            if not update_requested and snapshot_file.exists():
                stored = snapshot_file.read_text(encoding="utf-8")
                assert actual == stored, (
                    f"{snapshot_name} drifted from snapshot.\n"
                    "To accept the new output intentionally, run:\n"
                    "    SWAY_UPDATE_SNAPSHOTS=1 uv run pytest tests/unit/test_report_snapshot.py\n"
                    "and commit the updated file.\n"
                )
                return

            # Update mode or first run: write the snapshot, then skip so a
            # freshly written file is never reported as a verified pass.
            snapshot_file.parent.mkdir(parents=True, exist_ok=True)
            snapshot_file.write_text(actual, encoding="utf-8")
            pytest.skip(
                f"snapshot {snapshot_name} written — re-run without SWAY_UPDATE_SNAPSHOTS to verify"
            )
      
        138
        
        139
        
        140
        def test_json_schema_snapshot() -> None:
            """Render the fixture as JSON, sanity-check it, then compare to ``report.json``."""
            suite_result, sway_score = _fixture_suite_and_score()
            rendered = report.to_json(suite_result, sway_score)

            # Cheap structural checks first, so a broken payload fails with a
            # focused message before the full snapshot comparison runs.
            payload = json.loads(rendered)
            assert payload["schema_version"] == 1
            determinism = payload["determinism"]
            assert determinism is not None
            assert determinism["seed"] == 0

            # The snapshot text is the emitter output plus one trailing newline.
            _compare_to_snapshot(rendered + "\n", "report.json")
      
        149
        
        150
        
        151
        def test_markdown_layout_snapshot() -> None:
            """Render the fixture as markdown and compare it to ``report.md``."""
            suite_result, sway_score = _fixture_suite_and_score()
            rendered = report.to_markdown(suite_result, sway_score)
            _compare_to_snapshot(rendered, "report.md")
      
        155
        
        156
        
        157
        def test_junit_layout_snapshot() -> None:
            """Render the fixture as JUnit XML and compare it to ``report.junit.xml``."""
            suite_result, sway_score = _fixture_suite_and_score()
            xml_text = report.to_junit(suite_result, sway_score)

            # Collapse surrounding whitespace to exactly one trailing newline so
            # the comparison does not depend on how the emitter ends its output.
            normalized = xml_text.strip() + "\n"

            # Mask only the first ``time="..."`` attribute. Per the original
            # author's note this is the <testsuite> wall-clock time, while the
            # per-testcase times are deterministic and stay in the snapshot.
            time_attr = re.compile(r' time="[\d.]+"')
            masked = time_attr.sub(' time="<wall>"', normalized, count=1)

            _compare_to_snapshot(masked, "report.junit.xml")

1	"""C11: snapshot tests for the three report formats.
2
3	JSON is the machine-readable contract downstream tools depend on;
4	markdown is the CI-friendly human report; JUnit is the CI-dashboard
5	plumbing. Silent schema drift in any of them breaks consumers.
6
7	We serialize a deterministic fixture suite + score through each
8	emitter and byte-compare against checked-in snapshots under
9	``tests/snapshots/``. Intentional schema bumps update the snapshot in
10	the same commit (``SWAY_UPDATE_SNAPSHOTS=1 uv run pytest``); anything
11	else surfaces as a failed test.
12	"""
13
14	from __future__ import annotations
15
16	import json
17	import os
18	import re
19	from datetime import UTC, datetime
20	from pathlib import Path
21
22	import pytest
23
24	from dlm_sway.core.result import (
25	DeterminismReport,
26	ProbeResult,
27	SuiteResult,
28	SwayScore,
29	Verdict,
30	)
31	from dlm_sway.suite import report
32
33	SNAPSHOT_DIR = Path(__file__).parent.parent / "snapshots"
34
35
def _fixture_suite_and_score() -> tuple[SuiteResult, SwayScore]:
    """Build the fixed ``(SuiteResult, SwayScore)`` pair fed to every emitter.

    Every value is a hard-coded literal (timestamps, durations, scores),
    so the rendered reports are reproducible. The four probes carry one
    verdict each: PASS, FAIL, SKIP and ERROR.
    """
    t_start = datetime(2026, 1, 1, 12, 0, 0, tzinfo=UTC)
    # Same instant moved to 12:00:02.5, i.e. a 2.5s wall-clock run.
    t_end = t_start.replace(second=2, microsecond=500000)

    passing = ProbeResult(
        name="dk",
        kind="delta_kl",
        verdict=Verdict.PASS,
        score=0.87,
        raw=0.456,
        z_score=5.12,
        evidence={"divergence_kind": "js", "num_prompts": 4, "weight": 1.0},
        message="mean js=0.4560, z=+5.12σ vs null",
        duration_s=0.123,
        ci_95=(0.412, 0.497),
    )
    failing = ProbeResult(
        name="sis",
        kind="section_internalization",
        verdict=Verdict.FAIL,
        score=0.30,
        raw=0.012,
        z_score=0.5,
        evidence={"num_sections": 4, "passing_frac": 0.25, "weight": 1.0},
        message="1/4 sections cleared effective_sis≥0.05",
        duration_s=0.456,
    )
    skipped = ProbeResult(
        name="lk",
        kind="leakage",
        verdict=Verdict.SKIP,
        score=None,
        message="no PROSE sections to test for leakage",
        duration_s=0.001,
    )
    errored = ProbeResult(
        name="ablation",
        kind="adapter_ablation",
        verdict=Verdict.ERROR,
        score=None,
        raw=None,
        message="backend does not implement ScalableDifferentialBackend",
        duration_s=0.0,
    )

    run_determinism = DeterminismReport(
        class_="best_effort",
        seed=0,
        notes=("CPU-only backend: strict determinism depends on BLAS impl",),
    )
    suite_result = SuiteResult(
        spec_path="/fixture/sway.yaml",
        started_at=t_start,
        finished_at=t_end,
        base_model_id="HuggingFaceTB/SmolLM2-135M-Instruct",
        adapter_id="/fixture/runs/adapter/v0003",
        sway_version="0.1.0.dev0",
        probes=(passing, failing, skipped, errored),
        null_stats={"delta_kl": {"mean": 0.01, "std": 0.005, "n": 3.0}},
        determinism=run_determinism,
    )

    # Key order is kept as-is: it may show up in the serialized reports.
    component_scores = {
        "adherence": 0.87,
        "attribution": 0.30,
        "calibration": 0.50,
        "ablation": 0.0,
        "baseline": 1.0,
    }
    component_weights = {
        "adherence": 0.30,
        "attribution": 0.35,
        "calibration": 0.20,
        "ablation": 0.15,
        "baseline": 0.0,
    }
    sway_score = SwayScore(
        overall=0.65,
        components=component_scores,
        weights=component_weights,
        band="healthy",
        findings=(
            "sis (section_internalization) failed: 1/4 sections cleared effective_sis≥0.05",
            "ablation score is 0.00 — below the noise threshold",
        ),
    )
    return suite_result, sway_score
120
121
def _compare_to_snapshot(actual: str, snapshot_name: str) -> None:
    """Check ``actual`` against the file ``SNAPSHOT_DIR / snapshot_name``.

    If ``SWAY_UPDATE_SNAPSHOTS`` is ``"1"`` or the snapshot file does not
    exist yet, the file is (re)written from ``actual`` and the calling
    test is skipped rather than compared. Otherwise the file's text must
    equal ``actual`` exactly; a mismatch fails with update instructions.
    """
    snapshot_file = SNAPSHOT_DIR / snapshot_name
    update_requested = os.environ.get("SWAY_UPDATE_SNAPSHOTS") == "1"

    if not update_requested and snapshot_file.exists():
        stored = snapshot_file.read_text(encoding="utf-8")
        assert actual == stored, (
            f"{snapshot_name} drifted from snapshot.\n"
            "To accept the new output intentionally, run:\n"
            " SWAY_UPDATE_SNAPSHOTS=1 uv run pytest tests/unit/test_report_snapshot.py\n"
            "and commit the updated file.\n"
        )
        return

    # Update mode or first run: write the snapshot, then skip so a
    # freshly written file is never reported as a verified pass.
    snapshot_file.parent.mkdir(parents=True, exist_ok=True)
    snapshot_file.write_text(actual, encoding="utf-8")
    pytest.skip(
        f"snapshot {snapshot_name} written — re-run without SWAY_UPDATE_SNAPSHOTS to verify"
    )
138
139
def test_json_schema_snapshot() -> None:
    """Render the fixture as JSON, sanity-check it, then compare to ``report.json``."""
    suite_result, sway_score = _fixture_suite_and_score()
    rendered = report.to_json(suite_result, sway_score)

    # Cheap structural checks first, so a broken payload fails with a
    # focused message before the full snapshot comparison runs.
    payload = json.loads(rendered)
    assert payload["schema_version"] == 1
    determinism = payload["determinism"]
    assert determinism is not None
    assert determinism["seed"] == 0

    # The snapshot text is the emitter output plus one trailing newline.
    _compare_to_snapshot(rendered + "\n", "report.json")
149
150
def test_markdown_layout_snapshot() -> None:
    """Render the fixture as markdown and compare it to ``report.md``."""
    suite_result, sway_score = _fixture_suite_and_score()
    rendered = report.to_markdown(suite_result, sway_score)
    _compare_to_snapshot(rendered, "report.md")
155
156
def test_junit_layout_snapshot() -> None:
    """Render the fixture as JUnit XML and compare it to ``report.junit.xml``."""
    suite_result, sway_score = _fixture_suite_and_score()
    xml_text = report.to_junit(suite_result, sway_score)

    # Collapse surrounding whitespace to exactly one trailing newline so
    # the comparison does not depend on how the emitter ends its output.
    normalized = xml_text.strip() + "\n"

    # Mask only the first ``time="..."`` attribute. Per the original
    # author's note this is the <testsuite> wall-clock time, while the
    # per-testcase times are deterministic and stay in the snapshot.
    time_attr = re.compile(r' time="[\d.]+"')
    masked = time_attr.sub(' time="<wall>"', normalized, count=1)

    _compare_to_snapshot(masked, "report.junit.xml")