@@ -104,6 +104,60 @@ def test_trace_writer_produces_jsonl(tmp_path: Path) -> None: |
| 104 | 104 | assert any(not line["hit"] for line in lines) |
| 105 | 105 | |
| 106 | 106 | |
| 107 | +def test_trace_roundtrip_matches_backend_stats(tmp_path: Path) -> None: |
| 108 | + """F09 regression — trace writer and analyzer share the same schema. |
| 109 | + |
| 110 | + The writer lives in ``backends/_instrumentation`` and the analyzer |
| 111 | + lives in ``suite/trace_analysis``. They aren't linked by a shared |
| 112 | + type. A writer-side field removal would parse cleanly in the |
| 113 | + analyzer (everything is ``Optional``) and silently roll up every |
| 114 | + event under ``<unassigned>``. This test runs a real suite with |
| 115 | + ``trace_path=`` set, loads the file back, and asserts the loaded |
| 116 | + events are consistent with the backend_stats counters produced by |
| 117 | + the same run. |
| 118 | + """ |
| 119 | + from dlm_sway.suite.trace_analysis import load as load_trace |
| 120 | + |
| 121 | + trace_path = tmp_path / "trace.jsonl" |
| 122 | + backend = _programmable_backend() |
| 123 | + result = run_suite(_spec_with_repeated_prompts(), backend, trace_path=trace_path) |
| 124 | + |
| 125 | + events = load_trace(trace_path) |
| 126 | + assert events, "trace writer produced no events" |
| 127 | + # Probe-scoped events must be tagged with their probe name. Pre-probe |
| 128 | + # preflight events legitimately carry ``probe=None``; we split the |
| 129 | + # stream and hold the probe-tagged subset to the stricter invariant. |
| 130 | + probe_tagged = [e for e in events if e.probe is not None] |
| 131 | + assert probe_tagged, "no probe-tagged events — writer dropped the probe field" |
| 132 | + # dk1 + dk2 are the only two probes; both should surface. |
| 133 | + probe_labels = {e.probe for e in probe_tagged} |
| 134 | + assert probe_labels == {"dk1", "dk2"} |
| 135 | + # base + ft are the only two views touched by delta_kl. |
| 136 | + view_ids = {e.view_id for e in probe_tagged} |
| 137 | + assert view_ids == {"base", "ft"} |
| 138 | + |
| 139 | + # Traced hit and miss counts must equal the ``backend_stats`` |
| 140 | + # counters exactly. ``backend_stats`` aggregates *every* cache |
| 141 | + # access the instrumented backend saw during the run, including |
| 142 | + # the pre-probe preflight call (which carries ``probe=None`` in |
| 143 | + # the trace), so compare against the full event list, not just |
| 144 | + # the probe-tagged subset. |
| 145 | + stats = result.backend_stats |
| 146 | + traced_hits = sum(1 for e in events if e.hit) |
| 147 | + traced_misses = sum(1 for e in events if not e.hit) |
| 148 | + assert traced_hits == stats["cache_hits"], ( |
| 149 | + f"trace says {traced_hits} hits; backend_stats says {stats['cache_hits']}. " |
| 150 | + "Writer/analyzer schemas have drifted." |
| 151 | + ) |
| 152 | + assert traced_misses == stats["cache_misses"] |
| 153 | + # ``op`` and ``wall_ms`` must be populated on every event; a writer |
| 154 | + # that dropped either would reportedly still parse (dataclass |
| 155 | + # defaults) and roll up under empty buckets. NOTE(review): if the |
| 156 | + # ``wall_ms`` default is 0.0, ``>= 0.0`` cannot detect it — confirm. |
| 157 | + assert all(e.op == "next_token_dist" for e in events) |
| 158 | + assert all(e.wall_ms >= 0.0 for e in events) |
| 159 | + |
| 160 | + |
| 107 | 161 | def test_ci_95_survives_runner_roundtrip() -> None: |
| 108 | 162 | """F01 regression — ``_with_duration`` must forward every field. |
| 109 | 163 | |