"""Unit tests for :mod:`dlm_sway.core.golden`. Pins the comparator's tolerance math and the variable-field mask so the cross-platform golden test (S18) has a reliable backbone. No HF or torch dependency — the comparator is pure-Python and runs in the fast lane. """ from __future__ import annotations import math from dlm_sway.core.golden import ( DEFAULT_VARIABLE_FIELDS, Diff, compare_goldens, mask_variable_fields, ) class TestMaskVariableFields: def test_strips_top_level_fields(self) -> None: payload = { "sway_version": "0.1.0", "wall_seconds": 1.23, "probes": [], } masked = mask_variable_fields(payload) assert "sway_version" not in masked assert "wall_seconds" not in masked assert "probes" in masked def test_strips_nested_duration_s(self) -> None: payload = { "probes": [ {"name": "p1", "raw": 0.5, "duration_s": 0.01}, {"name": "p2", "raw": 0.8, "duration_s": 0.02}, ], } masked = mask_variable_fields(payload) for probe in masked["probes"]: assert "duration_s" not in probe assert "raw" in probe def test_strips_started_and_finished(self) -> None: payload = {"started_at": "2026-01-01T00:00:00Z", "finished_at": "2026-01-01T00:00:05Z"} masked = mask_variable_fields(payload) assert masked == {} def test_strips_backend_stats(self) -> None: payload = { "backend_stats": {"cache_hits": 42, "wall_ms": 1230.0}, "overall": 0.8, } masked = mask_variable_fields(payload) assert "backend_stats" not in masked assert masked["overall"] == 0.8 def test_preserves_scalars(self) -> None: assert mask_variable_fields(42) == 42 assert mask_variable_fields("hello") == "hello" assert mask_variable_fields(None) is None def test_default_variable_fields_has_expected_members(self) -> None: """Lock the default mask set — accidentally dropping a field from the mask would make the golden test newly flaky.""" expected_members = { "started_at", "finished_at", "wall_seconds", "duration_s", "sway_version", "backend_stats", # Platform-dependent path identifiers. "adapter_id", "base_model_id", } assert expected_members <= DEFAULT_VARIABLE_FIELDS class TestCompareGoldensIdentical: def test_identical_payload_no_diffs(self) -> None: payload = {"overall": 0.85, "probes": [{"raw": 0.123, "score": 0.9}]} assert compare_goldens(payload, payload) == [] def test_empty_payload_no_diffs(self) -> None: assert compare_goldens({}, {}) == [] class TestCompareGoldensTolerance: def test_floats_within_logprob_tol_pass(self) -> None: actual = {"probes": [{"raw": 0.12345}]} expected = {"probes": [{"raw": 0.12345 + 5e-5}]} # well under 1e-4 assert compare_goldens(actual, expected) == [] def test_floats_just_above_logprob_tol_fail(self) -> None: actual = {"probes": [{"raw": 0.12345}]} expected = {"probes": [{"raw": 0.12345 + 2e-4}]} # double the tol diffs = compare_goldens(actual, expected) assert len(diffs) == 1 assert "raw" in diffs[0].path assert "Δ" in diffs[0].reason def test_scores_match_logprob_tol_default(self) -> None: """Score fields use ``score_tol`` (1e-4) — same as ``logprob_tol`` after S18's first-week tuning. A 5e-5 drift passes on both.""" actual = {"overall": 0.85} expected = {"overall": 0.85 + 5e-5} assert compare_goldens(actual, expected) == [] def test_score_field_drift_above_score_tol_fails(self) -> None: actual = {"overall": 0.85} expected = {"overall": 0.85 + 2e-4} # double the score tol diffs = compare_goldens(actual, expected) assert len(diffs) == 1 assert diffs[0].path == "$.overall" def test_custom_tolerances_respected(self) -> None: """Callers can tighten or loosen both tolerances.""" actual = {"probes": [{"raw": 0.1}]} expected = {"probes": [{"raw": 0.1 + 5e-4}]} # Default tol (1e-4) → fail. assert compare_goldens(actual, expected) != [] # Loosened to 1e-3 → pass. assert compare_goldens(actual, expected, logprob_tol=1e-3) == [] # Tightened to 1e-6 → same fail, but also a regression guard # if we ever tighten the default back. assert compare_goldens(actual, expected, logprob_tol=1e-6) != [] def test_nan_vs_nan_treated_equal(self) -> None: actual = {"z_score": float("nan")} expected = {"z_score": float("nan")} assert compare_goldens(actual, expected) == [] def test_nan_vs_finite_is_drift(self) -> None: actual = {"z_score": float("nan")} expected = {"z_score": 3.0} diffs = compare_goldens(actual, expected) assert len(diffs) == 1 assert diffs[0].path == "$.z_score" def test_inf_comparison(self) -> None: """Same-signed infinities compare equal; opposite signs drift.""" actual = {"raw": float("inf")} expected = {"raw": float("inf")} assert compare_goldens(actual, expected) == [] diffs = compare_goldens({"raw": float("inf")}, {"raw": float("-inf")}) assert diffs # IEEE compares same-sign as equal but opposite as distinct; # the comparator bails on non-finite diffs without a tolerance. def test_int_vs_float_not_type_mismatch(self) -> None: """``raw: 0`` (int) vs ``raw: 0.0`` (float) is not drift.""" assert compare_goldens({"raw": 0}, {"raw": 0.0}) == [] class TestCompareGoldensStructural: def test_missing_key_flagged(self) -> None: actual = {"overall": 0.8} expected = {"overall": 0.8, "band": "healthy"} diffs = compare_goldens(actual, expected) assert any(d.reason == "missing key in actual" for d in diffs) def test_extra_key_flagged(self) -> None: actual = {"overall": 0.8, "new_field": 42} expected = {"overall": 0.8} diffs = compare_goldens(actual, expected) assert any(d.reason == "unexpected key in actual" for d in diffs) def test_list_length_mismatch_flagged(self) -> None: actual = {"probes": [{"raw": 0.1}]} expected = {"probes": [{"raw": 0.1}, {"raw": 0.2}]} diffs = compare_goldens(actual, expected) assert len(diffs) == 1 assert "list length mismatch" in diffs[0].reason def test_type_mismatch_flagged(self) -> None: actual = {"band": "healthy"} expected = {"band": {"name": "healthy", "level": 3}} diffs = compare_goldens(actual, expected) assert any(d.reason == "type mismatch" for d in diffs) def test_string_mismatch_flagged(self) -> None: actual = {"band": "noise"} expected = {"band": "healthy"} diffs = compare_goldens(actual, expected) assert len(diffs) == 1 assert diffs[0].reason == "value mismatch" class TestDiffRepr: def test_str_includes_path_and_reason(self) -> None: d = Diff(path="$.foo", actual=1.0, expected=2.0, reason="drift") s = str(d) assert "$.foo" in s assert "drift" in s assert "1.0" in s assert "2.0" in s class TestRealisticPayload: def test_two_masked_payloads_match(self) -> None: """End-to-end sanity: mask timestamps + duration, compare the rest, drift-free.""" actual = { "schema_version": 1, "sway_version": "0.1.0", "started_at": "2026-04-01T00:00:00Z", "finished_at": "2026-04-01T00:00:05Z", "wall_seconds": 5.123, "overall": 0.82, "probes": [ { "name": "dk", "raw": 0.4561, "score": 0.87, "duration_s": 0.123, }, ], } expected = { "schema_version": 1, "sway_version": "0.0.9", # version bumped "started_at": "2026-03-15T12:00:00Z", "finished_at": "2026-03-15T12:00:03Z", "wall_seconds": 3.456, # different wall "overall": 0.82 + 5e-5, # within score_tol "probes": [ { "name": "dk", "raw": 0.4561 + 5e-5, # within logprob_tol (1e-4) "score": 0.87, "duration_s": 0.789, # different duration }, ], } masked_actual = mask_variable_fields(actual) masked_expected = mask_variable_fields(expected) assert compare_goldens(masked_actual, masked_expected) == [] def test_simulated_silent_algorithm_change_is_caught(self) -> None: """Prove-the-value sanity: a 1e-2 drift on a probe's raw is flagged — well above the 1e-4 default tolerance. Real algorithm changes (e.g. flipping ``top_k=256`` → 128) shift raws by this order of magnitude.""" expected = {"probes": [{"raw": 0.4561}]} actual = {"probes": [{"raw": 0.4561 + 1e-2}]} diffs = compare_goldens(actual, expected) assert len(diffs) == 1 assert "raw" in diffs[0].path assert math.isclose( abs(float(diffs[0].actual) - float(diffs[0].expected)), 1e-2, abs_tol=1e-9 )