sway Public

Watch 0 Fork 0 Star 0

Python · 10302 bytes Raw Blame History

  
        1
        """Tests for :mod:`dlm_sway.suite.report_html` (S12 / F6)."""
      
        2
        
        3
        from __future__ import annotations
      
        4
        
        5
        import os
      
        6
        import re
      
        7
        from datetime import UTC, datetime
      
        8
        from html.parser import HTMLParser
      
        9
        from pathlib import Path
      
        10
        
        11
        import pytest
      
        12
        
        13
        from dlm_sway.core.result import (
      
        14
            ProbeResult,
      
        15
            SuiteResult,
      
        16
            SwayScore,
      
        17
            Verdict,
      
        18
        )
      
        19
        from dlm_sway.suite import report_html
      
        20
        
        21
        # Directory that holds the stored ``report.html`` snapshot, resolved
        # relative to this test file (two levels up, then ``snapshots/``).
        SNAPSHOT_DIR = Path(__file__).parent.parent / "snapshots"

        # Plotly is shipped via the optional [viz] extra. Skip the whole module
        # when it's not importable — the install hint path is covered by the
        # CLI test.
        pytest.importorskip("plotly")
      
        27
        
        28
        
        29
        def _fixture_suite_and_score() -> tuple[SuiteResult, SwayScore]:
            """Build the suite/score pair shared by the tests in this module.

            The suite exercises every panel: it carries a passing ``delta_kl``
            probe, a passing ``section_internalization`` probe (SIS bars), a
            passing ``adapter_ablation`` probe (response curve) and a skipped
            ``leakage`` probe that has no score. Timestamps are hard-coded
            rather than read from the clock.

            Returns:
                ``(suite, score)`` ready to hand to ``report_html.to_html``.
            """
            started = datetime(2026, 1, 1, 12, 0, 0, tzinfo=UTC)
            finished = datetime(2026, 1, 1, 12, 0, 5, tzinfo=UTC)
            probes = (
                ProbeResult(
                    name="dk",
                    kind="delta_kl",
                    verdict=Verdict.PASS,
                    score=0.87,
                    raw=0.456,
                    z_score=5.12,
                    evidence={},
                    message="mean js=0.4560, z=+5.12σ vs null",
                    duration_s=0.1,
                ),
                ProbeResult(
                    name="sis",
                    kind="section_internalization",
                    verdict=Verdict.PASS,
                    score=0.70,
                    raw=0.14,
                    z_score=3.8,
                    evidence={
                        # Three of the four sections pass; sec03 is the failing one.
                        "per_section": [
                            {"section_id": "sec01", "effective_sis": 0.18, "passed": True},
                            {"section_id": "sec02", "effective_sis": 0.21, "passed": True},
                            {"section_id": "sec03", "effective_sis": 0.03, "passed": False},
                            {"section_id": "sec04", "effective_sis": 0.10, "passed": True},
                        ],
                        "num_sections": 4,
                        "passing_frac": 0.75,
                    },
                    message="3/4 sections cleared",
                    duration_s=0.3,
                ),
                ProbeResult(
                    name="abl",
                    kind="adapter_ablation",
                    verdict=Verdict.PASS,
                    score=0.75,
                    raw=0.92,
                    z_score=3.5,
                    evidence={
                        # Parallel lists: one mean divergence per lambda value.
                        "lambdas": [0.0, 0.25, 0.5, 0.75, 1.0, 1.25],
                        "mean_divergence_per_lambda": [0.0, 0.05, 0.11, 0.16, 0.19, 0.20],
                        "linearity": 0.92,
                        "saturation_lambda": 0.75,
                        "saturation_reason": "found",
                        "overshoot": 1.05,
                    },
                    message="R²=0.92, sat_λ=0.75 (in band), overshoot=1.05",
                    duration_s=0.5,
                ),
                ProbeResult(
                    name="lk",
                    kind="leakage",
                    verdict=Verdict.SKIP,
                    score=None,
                    message="no PROSE sections to test for leakage",
                    duration_s=0.0,
                ),
            )
            suite = SuiteResult(
                spec_path="fixture.yaml",
                started_at=started,
                finished_at=finished,
                base_model_id="HuggingFaceTB/SmolLM2-135M",
                adapter_id="adapters/test/v1",
                sway_version="0.1.0",
                probes=probes,
            )
            score = SwayScore(
                overall=0.77,
                components={"adherence": 0.87, "attribution": 0.70, "calibration": 0.0, "ablation": 0.75},
                weights={"adherence": 0.30, "attribution": 0.35, "calibration": 0.20, "ablation": 0.15},
                band="healthy",
            )
            return suite, score
      
        109
        
        110
        
        111
        class _WellFormednessChecker(HTMLParser):
            """Trivial subclass: we only use HTMLParser to *not raise*.

            The stdlib parser is tolerant; the test is 'it doesn't blow up.'
            Strict XHTML well-formedness isn't what the browser enforces.
            """

            def error(self, message: str) -> None:  # pragma: no cover — never called with HTMLParser
                """Turn a parser-reported error into a test failure.

                Raises:
                    AssertionError: always, carrying ``message``.
                """
                raise AssertionError(f"HTMLParser rejected the output: {message}")
      
        120
        
        121
        
        122
        def _parse_ok(html_text: str) -> None:
            """Feed ``html_text`` through the tolerant stdlib parser.

            Passes silently when the parser consumes the whole document; any
            exception raised while feeding or closing propagates to the caller.
            """
            parser = _WellFormednessChecker(convert_charrefs=True)
            parser.feed(html_text)
            # close() forces the parser to process any buffered trailing data.
            parser.close()
      
        126
        
        127
        
        128
        class TestToHtml:
            """Checks on the HTML string returned by ``report_html.to_html``."""

            def test_parses_as_html(self) -> None:
                """The full-fixture report feeds through the stdlib HTML parser."""
                suite, score = _fixture_suite_and_score()
                out = report_html.to_html(suite, score)
                _parse_ok(out)

            def test_contains_all_probe_names(self) -> None:
                """Every probe name from the fixture appears in the output."""
                suite, score = _fixture_suite_and_score()
                out = report_html.to_html(suite, score)
                # NOTE(review): the names are 2-3 characters long and the output
                # also embeds the multi-megabyte Plotly bundle, so a plain
                # substring check is weak — consider matching rendered markup.
                for name in ("dk", "sis", "abl", "lk"):
                    assert name in out, f"probe {name!r} not in HTML"

            def test_contains_all_five_panel_divs(self) -> None:
                """With the full fixture, all five panel container ids are emitted."""
                suite, score = _fixture_suite_and_score()
                out = report_html.to_html(suite, score)
                for div_id in ("sway-gauge", "sway-category", "sway-sis", "sway-ablation", "sway-scatter"):
                    assert f'id="{div_id}"' in out, f"panel div {div_id!r} missing"

            def test_plotly_js_inlined_once(self) -> None:
                """The ~3 MB Plotly bundle is embedded, not linked externally.

                Guard: no ``<script src="http..."`` tags exist — everything
                loads from the inline bundle so the page works offline.
                Plotly's bundle body *does* carry the string ``cdn.plot.ly`` as
                an internal default for mapbox config; that's data, not a fetch,
                so we only care about ``<script src=...>`` tags.
                """
                suite, score = _fixture_suite_and_score()
                out = report_html.to_html(suite, score)
                external_scripts = re.findall(r'<script\s+[^>]*src\s*=\s*["\'](https?:[^"\']+)["\']', out)
                assert external_scripts == [], (
                    f"HTML pulls in external scripts (should all be inlined): {external_scripts}"
                )
                # Sanity: output is >1 MB (JS bundle is ~3-5 MB — gives us room
                # if Plotly slims down a bit between releases).
                assert len(out) > 1_000_000, f"HTML output suspiciously small: {len(out)} bytes"

            def test_no_sis_panel_when_probe_absent(self) -> None:
                """A suite without section_internalization skips the SIS panel but
                still renders the other four."""
                suite, score = _fixture_suite_and_score()
                pruned_probes = tuple(p for p in suite.probes if p.kind != "section_internalization")
                suite = SuiteResult(
                    spec_path=suite.spec_path,
                    started_at=suite.started_at,
                    finished_at=suite.finished_at,
                    base_model_id=suite.base_model_id,
                    adapter_id=suite.adapter_id,
                    sway_version=suite.sway_version,
                    probes=pruned_probes,
                )
                out = report_html.to_html(suite, score)
                assert 'id="sway-sis"' not in out
                # Assert all four remaining panels so the check matches the docstring.
                for div_id in ("sway-gauge", "sway-category", "sway-ablation", "sway-scatter"):
                    assert f'id="{div_id}"' in out, f"panel div {div_id!r} missing"

            def test_zero_probe_suite_still_renders(self) -> None:
                """Empty probes — gauge/category/scatter still emit; no crashes."""
                started = datetime(2026, 1, 1, 12, 0, 0, tzinfo=UTC)
                suite = SuiteResult(
                    spec_path="empty.yaml",
                    started_at=started,
                    finished_at=started,
                    base_model_id="base",
                    adapter_id="",
                    sway_version="0.1.0",
                    probes=(),
                )
                score = SwayScore(overall=0.0, components={}, band="noise")
                out = report_html.to_html(suite, score)
                _parse_ok(out)
                assert 'id="sway-gauge"' in out
                assert "no probes ran" in out

            def test_raises_when_plotly_missing(self, monkeypatch: pytest.MonkeyPatch) -> None:
                """Simulated ImportError surfaces the install hint."""
                import builtins

                real_import = builtins.__import__

                def fake_import(name, *args, **kwargs):  # type: ignore[no-untyped-def]
                    # Fail only plotly imports; delegate everything else unchanged.
                    if name.startswith("plotly"):
                        raise ImportError("simulated missing plotly")
                    return real_import(name, *args, **kwargs)

                monkeypatch.setattr(builtins, "__import__", fake_import)
                suite, score = _fixture_suite_and_score()
                with pytest.raises(RuntimeError, match=r"plotly.*\[viz\]"):
                    report_html.to_html(suite, score)
      
        217
        
        218
        
        219
        class TestWrapperSnapshot:
            """Snapshot the Sway-owned wrapper, strip the Plotly bundle JS so the
            snapshot doesn't churn on Plotly point releases.
            """

            #: Matches the single ``<script>...plotly_bundle...</script>`` we emit
            #: in ``<head>``. Plotly's per-figure scripts live in the body and
            #: carry the stable chart data — those we *do* want in the snapshot.
            #: The script body must start with ``/**``; the lazy ``.*?`` under
            #: DOTALL stops at the first closing ``</script>``.
            _HEAD_SCRIPT_RE = re.compile(
                r'<script type="text/javascript">\s*/\*\*.*?</script>',
                re.DOTALL,
            )

            def test_snapshot(self) -> None:
                """Run
                ``SWAY_UPDATE_SNAPSHOTS=1 uv run pytest tests/unit/test_report_html.py``
                to regenerate after an intentional wrapper change. Plotly JS
                bundle bumps should NOT drift this — it's stripped before compare.
                """
                suite, score = _fixture_suite_and_score()
                raw = report_html.to_html(suite, score)

                # Strip the Plotly JS bundle; confirm we actually removed it.
                stripped = self._HEAD_SCRIPT_RE.sub(
                    '<script type="text/javascript">/* plotly bundle — stripped for snapshot */</script>',
                    raw,
                    count=1,
                )
                assert stripped != raw, (
                    "failed to strip the Plotly JS bundle from the head — regex didn't match"
                )
                # Further shrink: replace per-figure ``"uid": "..."`` values (Plotly
                # sprinkles them in some payloads) to keep the snapshot stable
                # across minor Plotly versions.
                stripped = re.sub(r'"uid": ?"[^"]*"', '"uid": "<stripped>"', stripped)

                path = SNAPSHOT_DIR / "report.html"
                # Write the snapshot on explicit request or when none exists yet,
                # then skip: a freshly written file cannot verify anything.
                if os.environ.get("SWAY_UPDATE_SNAPSHOTS") == "1" or not path.exists():
                    path.parent.mkdir(parents=True, exist_ok=True)
                    path.write_text(stripped, encoding="utf-8")
                    pytest.skip(
                        "snapshot report.html written — re-run without SWAY_UPDATE_SNAPSHOTS to verify"
                    )
                expected = path.read_text(encoding="utf-8")
                assert stripped == expected, (
                    "report.html drifted from snapshot.\n"
                    "To accept the new output intentionally, run:\n"
                    "    SWAY_UPDATE_SNAPSHOTS=1 uv run pytest tests/unit/test_report_html.py\n"
                    "and commit the updated file.\n"
                )

1	"""Tests for :mod:`dlm_sway.suite.report_html` (S12 / F6)."""
2
3	from __future__ import annotations
4
5	import os
6	import re
7	from datetime import UTC, datetime
8	from html.parser import HTMLParser
9	from pathlib import Path
10
11	import pytest
12
13	from dlm_sway.core.result import (
14	ProbeResult,
15	SuiteResult,
16	SwayScore,
17	Verdict,
18	)
19	from dlm_sway.suite import report_html
20
# Directory that holds the stored ``report.html`` snapshot, resolved
# relative to this test file (two levels up, then ``snapshots/``).
SNAPSHOT_DIR = Path(__file__).parent.parent / "snapshots"

# Plotly is shipped via the optional [viz] extra. Skip the whole module
# when it's not importable — the install hint path is covered by the
# CLI test.
pytest.importorskip("plotly")
27
28
def _fixture_suite_and_score() -> tuple[SuiteResult, SwayScore]:
    """Build the suite/score pair shared by the tests in this module.

    The suite exercises every panel: it carries a passing ``delta_kl``
    probe, a passing ``section_internalization`` probe (SIS bars), a
    passing ``adapter_ablation`` probe (response curve) and a skipped
    ``leakage`` probe that has no score. Timestamps are hard-coded
    rather than read from the clock.

    Returns:
        ``(suite, score)`` ready to hand to ``report_html.to_html``.
    """
    started = datetime(2026, 1, 1, 12, 0, 0, tzinfo=UTC)
    finished = datetime(2026, 1, 1, 12, 0, 5, tzinfo=UTC)
    probes = (
        ProbeResult(
            name="dk",
            kind="delta_kl",
            verdict=Verdict.PASS,
            score=0.87,
            raw=0.456,
            z_score=5.12,
            evidence={},
            message="mean js=0.4560, z=+5.12σ vs null",
            duration_s=0.1,
        ),
        ProbeResult(
            name="sis",
            kind="section_internalization",
            verdict=Verdict.PASS,
            score=0.70,
            raw=0.14,
            z_score=3.8,
            evidence={
                # Three of the four sections pass; sec03 is the failing one.
                "per_section": [
                    {"section_id": "sec01", "effective_sis": 0.18, "passed": True},
                    {"section_id": "sec02", "effective_sis": 0.21, "passed": True},
                    {"section_id": "sec03", "effective_sis": 0.03, "passed": False},
                    {"section_id": "sec04", "effective_sis": 0.10, "passed": True},
                ],
                "num_sections": 4,
                "passing_frac": 0.75,
            },
            message="3/4 sections cleared",
            duration_s=0.3,
        ),
        ProbeResult(
            name="abl",
            kind="adapter_ablation",
            verdict=Verdict.PASS,
            score=0.75,
            raw=0.92,
            z_score=3.5,
            evidence={
                # Parallel lists: one mean divergence per lambda value.
                "lambdas": [0.0, 0.25, 0.5, 0.75, 1.0, 1.25],
                "mean_divergence_per_lambda": [0.0, 0.05, 0.11, 0.16, 0.19, 0.20],
                "linearity": 0.92,
                "saturation_lambda": 0.75,
                "saturation_reason": "found",
                "overshoot": 1.05,
            },
            message="R²=0.92, sat_λ=0.75 (in band), overshoot=1.05",
            duration_s=0.5,
        ),
        ProbeResult(
            name="lk",
            kind="leakage",
            verdict=Verdict.SKIP,
            score=None,
            message="no PROSE sections to test for leakage",
            duration_s=0.0,
        ),
    )
    suite = SuiteResult(
        spec_path="fixture.yaml",
        started_at=started,
        finished_at=finished,
        base_model_id="HuggingFaceTB/SmolLM2-135M",
        adapter_id="adapters/test/v1",
        sway_version="0.1.0",
        probes=probes,
    )
    score = SwayScore(
        overall=0.77,
        components={"adherence": 0.87, "attribution": 0.70, "calibration": 0.0, "ablation": 0.75},
        weights={"adherence": 0.30, "attribution": 0.35, "calibration": 0.20, "ablation": 0.15},
        band="healthy",
    )
    return suite, score
109
110
class _WellFormednessChecker(HTMLParser):
    """Trivial subclass: we only use HTMLParser to *not raise*.

    The stdlib parser is tolerant; the test is 'it doesn't blow up.'
    Strict XHTML well-formedness isn't what the browser enforces.
    """

    def error(self, message: str) -> None:  # pragma: no cover — never called with HTMLParser
        """Turn a parser-reported error into a test failure.

        Raises:
            AssertionError: always, carrying ``message``.
        """
        raise AssertionError(f"HTMLParser rejected the output: {message}")
120
121
def _parse_ok(html_text: str) -> None:
    """Feed ``html_text`` through the tolerant stdlib parser.

    Passes silently when the parser consumes the whole document; any
    exception raised while feeding or closing propagates to the caller.
    """
    parser = _WellFormednessChecker(convert_charrefs=True)
    parser.feed(html_text)
    # close() forces the parser to process any buffered trailing data.
    parser.close()
126
127
class TestToHtml:
    """Checks on the HTML string returned by ``report_html.to_html``."""

    def test_parses_as_html(self) -> None:
        """The full-fixture report feeds through the stdlib HTML parser."""
        suite, score = _fixture_suite_and_score()
        out = report_html.to_html(suite, score)
        _parse_ok(out)

    def test_contains_all_probe_names(self) -> None:
        """Every probe name from the fixture appears in the output."""
        suite, score = _fixture_suite_and_score()
        out = report_html.to_html(suite, score)
        # NOTE(review): the names are 2-3 characters long and the output
        # also embeds the multi-megabyte Plotly bundle, so a plain
        # substring check is weak — consider matching rendered markup.
        for name in ("dk", "sis", "abl", "lk"):
            assert name in out, f"probe {name!r} not in HTML"

    def test_contains_all_five_panel_divs(self) -> None:
        """With the full fixture, all five panel container ids are emitted."""
        suite, score = _fixture_suite_and_score()
        out = report_html.to_html(suite, score)
        for div_id in ("sway-gauge", "sway-category", "sway-sis", "sway-ablation", "sway-scatter"):
            assert f'id="{div_id}"' in out, f"panel div {div_id!r} missing"

    def test_plotly_js_inlined_once(self) -> None:
        """The ~3 MB Plotly bundle is embedded, not linked externally.

        Guard: no ``<script src="http..."`` tags exist — everything
        loads from the inline bundle so the page works offline.
        Plotly's bundle body *does* carry the string ``cdn.plot.ly`` as
        an internal default for mapbox config; that's data, not a fetch,
        so we only care about ``<script src=...>`` tags.
        """
        suite, score = _fixture_suite_and_score()
        out = report_html.to_html(suite, score)
        # ``[^>]*`` and ``\s*`` must keep their quantifiers: without them the
        # pattern cannot match an ordinary ``<script src="https://...">`` tag
        # and the guard silently passes.
        external_scripts = re.findall(r'<script\s+[^>]*src\s*=\s*["\'](https?:[^"\']+)["\']', out)
        assert external_scripts == [], (
            f"HTML pulls in external scripts (should all be inlined): {external_scripts}"
        )
        # Sanity: output is >1 MB (JS bundle is ~3-5 MB — gives us room
        # if Plotly slims down a bit between releases).
        assert len(out) > 1_000_000, f"HTML output suspiciously small: {len(out)} bytes"

    def test_no_sis_panel_when_probe_absent(self) -> None:
        """A suite without section_internalization skips the SIS panel but
        still renders the other four."""
        suite, score = _fixture_suite_and_score()
        pruned_probes = tuple(p for p in suite.probes if p.kind != "section_internalization")
        suite = SuiteResult(
            spec_path=suite.spec_path,
            started_at=suite.started_at,
            finished_at=suite.finished_at,
            base_model_id=suite.base_model_id,
            adapter_id=suite.adapter_id,
            sway_version=suite.sway_version,
            probes=pruned_probes,
        )
        out = report_html.to_html(suite, score)
        assert 'id="sway-sis"' not in out
        # Assert all four remaining panels so the check matches the docstring.
        for div_id in ("sway-gauge", "sway-category", "sway-ablation", "sway-scatter"):
            assert f'id="{div_id}"' in out, f"panel div {div_id!r} missing"

    def test_zero_probe_suite_still_renders(self) -> None:
        """Empty probes — gauge/category/scatter still emit; no crashes."""
        started = datetime(2026, 1, 1, 12, 0, 0, tzinfo=UTC)
        suite = SuiteResult(
            spec_path="empty.yaml",
            started_at=started,
            finished_at=started,
            base_model_id="base",
            adapter_id="",
            sway_version="0.1.0",
            probes=(),
        )
        score = SwayScore(overall=0.0, components={}, band="noise")
        out = report_html.to_html(suite, score)
        _parse_ok(out)
        assert 'id="sway-gauge"' in out
        assert "no probes ran" in out

    def test_raises_when_plotly_missing(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """Simulated ImportError surfaces the install hint."""
        import builtins

        real_import = builtins.__import__

        def fake_import(name, *args, **kwargs):  # type: ignore[no-untyped-def]
            # Must mirror ``__import__``'s variadic signature so every
            # non-plotly import is forwarded to the real importer unchanged.
            if name.startswith("plotly"):
                raise ImportError("simulated missing plotly")
            return real_import(name, *args, **kwargs)

        monkeypatch.setattr(builtins, "__import__", fake_import)
        suite, score = _fixture_suite_and_score()
        with pytest.raises(RuntimeError, match=r"plotly.*\[viz\]"):
            report_html.to_html(suite, score)
217
218
class TestWrapperSnapshot:
    """Snapshot the Sway-owned wrapper, strip the Plotly bundle JS so the
    snapshot doesn't churn on Plotly point releases.
    """

    #: Matches the single ``<script>...plotly_bundle...</script>`` we emit
    #: in ``<head>``. Plotly's per-figure scripts live in the body and
    #: carry the stable chart data — those we *do* want in the snapshot.
    #: The script body must start with ``/**``; the lazy ``.*?`` under
    #: DOTALL stops at the first closing ``</script>``.
    _HEAD_SCRIPT_RE = re.compile(
        r'<script type="text/javascript">\s*/\*\*.*?</script>',
        re.DOTALL,
    )

    def test_snapshot(self) -> None:
        """Run
        ``SWAY_UPDATE_SNAPSHOTS=1 uv run pytest tests/unit/test_report_html.py``
        to regenerate after an intentional wrapper change. Plotly JS
        bundle bumps should NOT drift this — it's stripped before compare.
        """
        suite, score = _fixture_suite_and_score()
        raw = report_html.to_html(suite, score)

        # Strip the Plotly JS bundle; confirm we actually removed it.
        stripped = self._HEAD_SCRIPT_RE.sub(
            '<script type="text/javascript">/* plotly bundle — stripped for snapshot */</script>',
            raw,
            count=1,
        )
        assert stripped != raw, (
            "failed to strip the Plotly JS bundle from the head — regex didn't match"
        )
        # Further shrink: replace per-figure ``"uid": "..."`` values (Plotly
        # sprinkles them in some payloads) to keep the snapshot stable
        # across minor Plotly versions.
        stripped = re.sub(r'"uid": ?"[^"]*"', '"uid": "<stripped>"', stripped)

        path = SNAPSHOT_DIR / "report.html"
        # Write the snapshot on explicit request or when none exists yet,
        # then skip: a freshly written file cannot verify anything.
        if os.environ.get("SWAY_UPDATE_SNAPSHOTS") == "1" or not path.exists():
            path.parent.mkdir(parents=True, exist_ok=True)
            path.write_text(stripped, encoding="utf-8")
            pytest.skip(
                "snapshot report.html written — re-run without SWAY_UPDATE_SNAPSHOTS to verify"
            )
        expected = path.read_text(encoding="utf-8")
        assert stripped == expected, (
            "report.html drifted from snapshot.\n"
            "To accept the new output intentionally, run:\n"
            "    SWAY_UPDATE_SNAPSHOTS=1 uv run pytest tests/unit/test_report_html.py\n"
            "and commit the updated file.\n"
        )