"""End-to-end Sprint 07 checks via the suite runner.

Asserts:

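- ``SuiteResult.backend_stats`` is populated after a run
- duplicate (view, prompt, top_k) lookups hit the cache
- ``--trace`` writes JSONL with the expected per-probe labels
- trace events loaded back through the analyzer agree with the
  ``backend_stats`` hit/miss counters from the same run
- ``ci_95`` on a probe result survives the trip through the runner
- the markdown report includes the ``cache:`` hit-rate line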
"""

from __future__ import annotations

import json
from pathlib import Path

import numpy as np

from dlm_sway.backends.dummy import DummyDifferentialBackend, DummyResponses
from dlm_sway.core.scoring import TokenDist
from dlm_sway.suite.runner import run as run_suite
from dlm_sway.suite.spec import SwaySpec


def _programmable_backend() -> DummyDifferentialBackend:
    """Build a dummy differential backend with canned next-token dists.

    Both ``base`` and ``ft`` answer the prompts ``"hello"`` and
    ``"world"``. Within a side, the two prompts share one distribution;
    the ``base`` and ``ft`` distributions differ from each other.
    """

    def _dist(probs: list[float]) -> TokenDist:
        # Three-token distribution over ids 1..3 in a 100-token vocab.
        return TokenDist(
            token_ids=np.array([1, 2, 3], dtype=np.int64),
            logprobs=np.log(np.array(probs, dtype=np.float32)),
            vocab_size=100,
        )

    prompts = ("hello", "world")
    base_dist = _dist([0.7, 0.2, 0.1])
    ft_dist = _dist([0.2, 0.4, 0.4])
    return DummyDifferentialBackend(
        base=DummyResponses(token_dists=dict.fromkeys(prompts, base_dist)),
        ft=DummyResponses(token_dists=dict.fromkeys(prompts, ft_dist)),
    )


def _spec_with_repeated_prompts() -> SwaySpec:
    """Return a spec whose two ``delta_kl`` probes use identical prompts.

    ``dk1`` and ``dk2`` both ask for ``"hello"`` and ``"world"``, so the
    second probe repeats the lookups of the first — the setup the cache
    tests in this module rely on.
    """
    models = {
        "base": {"base": "b"},
        "ft": {"base": "b", "adapter": "/tmp/a"},
    }
    # One entry per probe name; everything but the name is shared.
    probes = [
        {
            "name": probe_name,
            "kind": "delta_kl",
            "prompts": ["hello", "world"],
            "assert_mean_gte": 0.0,
        }
        for probe_name in ("dk1", "dk2")
    ]
    return SwaySpec.model_validate({"version": 1, "models": models, "suite": probes})


def test_backend_stats_populated_after_run() -> None:
    """A suite run leaves the expected counters in ``backend_stats``."""
    backend = _programmable_backend()
    stats = run_suite(_spec_with_repeated_prompts(), backend).backend_stats
    assert stats, "backend_stats should be non-empty when the backend is instrumented"
    expected_keys = ("cache_hits", "cache_misses", "forward_passes", "hit_rate")
    assert all(key in stats for key in expected_keys)


def test_second_probe_hits_cache_for_duplicate_prompts() -> None:
    """The repeated probe is served from cache.

    dk1 and dk2 each touch 2 prompts × 2 views; dk1's four lookups miss
    and dk2's four lookups hit, so at least four hits are expected.
    """
    backend = _programmable_backend()
    stats = run_suite(_spec_with_repeated_prompts(), backend).backend_stats
    hit_count = stats["cache_hits"]
    assert hit_count >= 4, f"expected ≥ 4 hits, got {stats}"
    # Each miss should correspond to exactly one forward pass.
    assert stats["cache_misses"] == stats["forward_passes"]


def test_trace_writer_produces_jsonl(tmp_path: Path) -> None:
    """Running with ``trace_path=`` writes JSONL records with probe labels.

    The file is parsed line by line (blank lines skipped); the records
    must mention both probes and contain a mix of hits and misses.
    """
    trace_path = tmp_path / "trace.jsonl"
    backend = _programmable_backend()
    run_suite(_spec_with_repeated_prompts(), backend, trace_path=trace_path)

    raw_lines = trace_path.read_text(encoding="utf-8").splitlines()
    records = [json.loads(raw) for raw in raw_lines if raw.strip()]
    assert records

    # The runner's probe names must show up on the trace records.
    seen_probes = {record["probe"] for record in records}
    assert {"dk1", "dk2"} <= seen_probes

    # A repeated-prompt suite should produce both outcomes.
    hit_flags = [bool(record["hit"]) for record in records]
    assert True in hit_flags
    assert False in hit_flags


def test_trace_roundtrip_matches_backend_stats(tmp_path: Path) -> None:
    """F09 regression — the trace writer and analyzer agree on the schema.

    The writer (``backends/_instrumentation``) and the analyzer
    (``suite/trace_analysis``) do not share a type, so a field removed
    on the writer side could still parse on the analyzer side and end
    up silently bucketed under ``<unassigned>``. To guard against that,
    this test runs a suite with ``trace_path=`` set, reads the file back
    through the analyzer's loader, and checks the loaded events against
    the ``backend_stats`` counters from the same run.
    """
    from dlm_sway.suite.trace_analysis import load as load_trace

    trace_path = tmp_path / "trace.jsonl"
    result = run_suite(
        _spec_with_repeated_prompts(),
        _programmable_backend(),
        trace_path=trace_path,
    )

    events = load_trace(trace_path)
    assert events, "trace writer produced no events"

    # Preflight events before any probe may carry ``probe=None``; only
    # the probe-tagged subset is held to the label/view invariants.
    probe_tagged = [event for event in events if event.probe is not None]
    assert probe_tagged, "no probe-tagged events — writer dropped the probe field"
    # The spec defines exactly two probes, and delta_kl touches exactly
    # the base and ft views.
    assert {event.probe for event in probe_tagged} == {"dk1", "dk2"}
    assert {event.view_id for event in probe_tagged} == {"base", "ft"}

    # ``backend_stats`` counts every cache access of the run, preflight
    # included, so the comparison uses the full event list rather than
    # the probe-tagged subset.
    stats = result.backend_stats
    traced_hits = sum(bool(event.hit) for event in events)
    traced_misses = sum(not event.hit for event in events)
    assert traced_hits == stats["cache_hits"], (
        f"trace says {traced_hits} hits; backend_stats says {stats['cache_hits']}. "
        "Writer/analyzer schemas have drifted."
    )
    assert traced_misses == stats["cache_misses"]

    # A writer that stopped emitting ``op`` or ``wall_ms`` would still
    # load (the analyzer falls back to defaults), so check the values
    # themselves on every event.
    for event in events:
        assert event.op == "next_token_dist"
        assert event.wall_ms >= 0.0


def test_ci_95_survives_runner_roundtrip() -> None:
    """F01 regression — ``_with_duration`` must forward every field.

    The original bug: ``_with_duration`` rebuilt the result dataclass
    field by field and left out ``ci_95``, so bootstrap CIs never made
    it into the ``SuiteResult``.
    """

    def _dist(probs: list[float]) -> TokenDist:
        # Three-token distribution over ids 1..3 in a 100-token vocab.
        return TokenDist(
            token_ids=np.array([1, 2, 3], dtype=np.int64),
            logprobs=np.log(np.array(probs, dtype=np.float32)),
            vocab_size=100,
        )

    # delta_kl needs at least 4 samples before it emits a bootstrap CI,
    # so the fixture supplies four prompts with distinct ft dists.
    ft_probs = {
        "p1": [0.2, 0.4, 0.4],
        "p2": [0.3, 0.3, 0.4],
        "p3": [0.1, 0.5, 0.4],
        "p4": [0.25, 0.35, 0.4],
    }
    ft_dists = {prompt: _dist(probs) for prompt, probs in ft_probs.items()}
    base_dist = _dist([0.7, 0.2, 0.1])
    backend = DummyDifferentialBackend(
        base=DummyResponses(token_dists=dict.fromkeys(ft_dists, base_dist)),
        ft=DummyResponses(token_dists=ft_dists),
    )

    models = {
        "base": {"base": "b"},
        "ft": {"base": "b", "adapter": "/tmp/a"},
    }
    dk_probe = {
        "name": "dk",
        "kind": "delta_kl",
        "prompts": list(ft_dists),
        "assert_mean_gte": 0.0,
    }
    spec = SwaySpec.model_validate({"version": 1, "models": models, "suite": [dk_probe]})

    probe = run_suite(spec, backend).probes[0]
    assert probe.kind == "delta_kl"
    assert probe.ci_95 is not None, (
        "delta_kl emits a bootstrap CI at N=4; the runner dropped it. "
        "Check suite/runner.py:_with_duration."
    )
    # The point estimate (0.0 when ``raw`` is falsy) must sit inside the CI.
    low, high = probe.ci_95
    point = probe.raw or 0.0
    assert low <= point <= high


def test_report_footer_includes_cache_hit_rate() -> None:
    """The markdown report shows the ``cache: N/M = X%`` line when stats exist."""
    from dlm_sway.suite import report
    from dlm_sway.suite.score import compute as compute_score

    backend = _programmable_backend()
    result = run_suite(_spec_with_repeated_prompts(), backend)
    rendered = report.to_markdown(result, compute_score(result))
    # Both the label and a percent sign must appear somewhere in the output.
    for needle in ("cache:", "%"):
        assert needle in rendered