tenseleyflow/sway / 464f1ac

Browse files

tests/runner: trace writer ↔ analyzer round-trip regression (F09)

Authored by espadonne
SHA
464f1ac85595c57d62be41419f0459b0031ca0ac
Parents
c8e3bf3
Tree
683c18c

1 changed file

Status File + -
M tests/unit/test_runner_backend_stats.py 54 0
tests/unit/test_runner_backend_stats.py modified
@@ -104,6 +104,60 @@ def test_trace_writer_produces_jsonl(tmp_path: Path) -> None:
104104
     assert any(not line["hit"] for line in lines)
105105
 
106106
 
107
+def test_trace_roundtrip_matches_backend_stats(tmp_path: Path) -> None:
108
+    """F09 regression — trace writer and analyzer share the same schema.
109
+
110
+    The writer lives in ``backends/_instrumentation`` and the analyzer
111
+    lives in ``suite/trace_analysis``. They aren't linked by a shared
112
+    type. A writer-side field removal would parse cleanly in the
113
+    analyzer (everything is ``Optional``) and silently roll up every
114
+    event under ``<unassigned>``. This test runs a real suite with
115
+    ``trace_path=`` set, loads the file back, and asserts the loaded
116
+    events are consistent with the backend_stats counters produced by
117
+    the same run.
118
+    """
119
+    from dlm_sway.suite.trace_analysis import load as load_trace
120
+
121
+    trace_path = tmp_path / "trace.jsonl"
122
+    backend = _programmable_backend()
123
+    result = run_suite(_spec_with_repeated_prompts(), backend, trace_path=trace_path)
124
+
125
+    events = load_trace(trace_path)
126
+    assert events, "trace writer produced no events"
127
+    # Probe-scoped events must tag with their probe name. Pre-probe
128
+    # preflight events legitimately carry ``probe=None``; we split the
129
+    # stream and hold the probe-tagged subset to the stricter invariant.
130
+    probe_tagged = [e for e in events if e.probe is not None]
131
+    assert probe_tagged, "no probe-tagged events — writer dropped the probe field"
132
+    # dk1 + dk2 are the only two probes; both should surface.
133
+    probe_labels = {e.probe for e in probe_tagged}
134
+    assert probe_labels == {"dk1", "dk2"}
135
+    # base + ft are the only two views touched by delta_kl.
136
+    view_ids = {e.view_id for e in probe_tagged}
137
+    assert view_ids == {"base", "ft"}
138
+
139
+    # Trace hit and miss counts must equal the backend_stats
140
+    # counters exactly. ``backend_stats`` aggregates *every* cache
141
+    # access the instrumented backend saw during the run, including
142
+    # the pre-probe preflight call (which carries ``probe=None`` in
143
+    # the trace). Compare against the full event list, not just the
144
+    # probe-tagged subset.
145
+    stats = result.backend_stats
146
+    traced_hits = sum(1 for e in events if e.hit)
147
+    traced_misses = sum(1 for e in events if not e.hit)
148
+    assert traced_hits == stats["cache_hits"], (
149
+        f"trace says {traced_hits} hits; backend_stats says {stats['cache_hits']}. "
150
+        "Writer/analyzer schemas have drifted."
151
+    )
152
+    assert traced_misses == stats["cache_misses"]
153
+    # Every event carries the four fields the analyzer reads; a writer
154
+    # that dropped ``op`` or ``wall_ms`` would parse cleanly (both are
155
+    # required-but-defaulted on the dataclass) and roll up under empty
156
+    # buckets — these assertions catch that silent failure mode.
157
+    assert all(e.op == "next_token_dist" for e in events)
158
+    assert all(e.wall_ms >= 0.0 for e in events)
159
+
160
+
107161
 def test_ci_95_survives_runner_roundtrip() -> None:
108162
     """F01 regression — ``_with_duration`` must forward every field.
109163