"""Probe and suite result types.

Every numeric probe ultimately returns a :class:`ProbeResult`. The suite
runner collects them into a :class:`SuiteResult` and the scorer folds
that into a single :class:`SwayScore` with transparent per-component
weights.

These dataclasses are deliberately plain — no pydantic — because they
cross probe/backend boundaries hundreds of times per run and a free
``model_validate`` on every construction would dominate the runtime of
cheap probes.
"""

from __future__ import annotations

import math
from dataclasses import dataclass, field
from datetime import UTC, datetime
from enum import StrEnum
from typing import Any


class Verdict(StrEnum):
    """Outcome of a single probe against its assertion.

    Being a :class:`~enum.StrEnum`, every member is also a ``str`` equal
    to its lowercase value, so a verdict compares equal to (and formats
    as) the plain string, e.g. ``Verdict.PASS == "pass"``.
    """

    PASS = "pass"
    FAIL = "fail"
    WARN = "warn"
    # ``ProbeResult.score`` is documented as ``None`` for skipped probes.
    SKIP = "skip"
    # Also the verdict ``safe_finalize`` substitutes when a critical
    # numeric field is NaN or ±inf.
    ERROR = "error"


@dataclass(frozen=True, slots=True)
class ProbeResult:
    """The result of running one probe.

    Instances are frozen: fields cannot be rebound after construction
    (the ``evidence`` dict is still a mutable object in its own right).
    Numeric probes are expected to build instances through
    :func:`safe_finalize` rather than calling the constructor directly,
    so that non-finite metrics are caught before they reach a report.

    Attributes
    ----------
    name:
        User-facing name from the spec (unique within a suite).
    kind:
        Probe discriminator (``delta_kl``, ``section_internalization`` …).
    verdict:
        Pass / fail / warn / skip / error.
    score:
        Normalized [0, 1] score. ``sigmoid(z_vs_null / 3)`` for numeric
        probes; 1.0 / 0.0 for binary ones. ``None`` for
        :attr:`Verdict.SKIP`, and also whenever :func:`safe_finalize`
        nulled it: on every :attr:`Verdict.ERROR` result it produces, or
        when the supplied score was itself non-finite.
    raw:
        The raw metric value (e.g. KL=0.083). Probe-specific units.
    z_score:
        Standard deviations above the null-adapter baseline. ``None``
        when no null calibration was run.
    base_value:
        The metric evaluated on the base model, when meaningful.
    ft_value:
        The metric evaluated on the fine-tuned model, when meaningful.
    evidence:
        Small structured payload for the report — prompts, example
        completions, per-section breakdowns. Kept bounded (<10 KB) so
        suite JSON stays under a megabyte. Defaults to a fresh empty
        dict per instance.
    message:
        One-line diagnostic. Surfaces in the terminal report.
    duration_s:
        Wall time to execute.
    ci_95:
        95% percentile-bootstrap confidence interval on :attr:`raw`.
        Populated by aggregating probes (``delta_kl`` over N prompts,
        ``calibration_drift`` over pack items, etc.) via
        :func:`dlm_sway.core.stats.bootstrap_ci`. ``None`` when the
        probe doesn't aggregate (``adapter_ablation``,
        ``style_fingerprint``), when the sample count is too low to
        bootstrap meaningfully, or when the raw value isn't finite
        (:func:`safe_finalize` drops the interval whenever it nulls
        ``raw`` or converts the result to :attr:`Verdict.ERROR`).
        Report surfaces it inline as ``raw [lo–hi]``.
    """

    name: str
    kind: str
    verdict: Verdict
    score: float | None
    raw: float | None = None
    z_score: float | None = None
    base_value: float | None = None
    ft_value: float | None = None
    evidence: dict[str, Any] = field(default_factory=dict)
    message: str = ""
    duration_s: float = 0.0
    ci_95: tuple[float, float] | None = None


@dataclass(frozen=True, slots=True)
class DeterminismReport:
    """Serializable view of what seeding the runner accomplished.

    Mirrors :class:`dlm_sway.core.determinism.DeterminismSummary` but
    lives here so :class:`SuiteResult` doesn't pull ``determinism`` as
    an import-time dependency of ``core.result``.

    Attributes
    ----------
    class_:
        Presumably the label of the determinism regime the suite ran
        under; the trailing underscore avoids the ``class`` keyword.
        NOTE(review): the set of valid labels is defined in
        ``core.determinism`` — confirm there.
    seed:
        Integer seed — presumably the one handed to ``seed_everything``;
        verify against the runner.
    notes:
        Free-form strings accompanying the classification; empty by
        default. A tuple (not a list) so the frozen instance holds no
        mutable state.
    """

    class_: str
    seed: int
    notes: tuple[str, ...] = ()


@dataclass(frozen=True, slots=True)
class SuiteResult:
    """A full run of a sway.yaml suite."""

    spec_path: str
    started_at: datetime
    finished_at: datetime
    base_model_id: str
    adapter_id: str
    sway_version: str
    probes: tuple[ProbeResult, ...] = ()
    null_stats: dict[str, dict[str, float]] = field(default_factory=dict)
    """Per-primitive null-adapter baseline stats (mean, std, runs). Used
    to turn raw metrics into z-scores when rendering the report."""
    determinism: DeterminismReport | None = None
    """Classification of the determinism regime the suite ran under, from
    :func:`dlm_sway.core.determinism.seed_everything`. ``None`` when the
    caller bypassed seeding (e.g., unit tests constructing a
    ``SuiteResult`` directly)."""
    backend_stats: dict[str, float | int] = field(default_factory=dict)
    """Forward-pass + cache counters from the backend
    (:class:`dlm_sway.backends._instrumentation.BackendStats.to_dict`).
    Populated by the runner at suite-end; ``{}`` when the backend
    doesn't expose instrumentation (custom backends, pre-S07 snapshots)."""

    @property
    def wall_seconds(self) -> float:
        return (self.finished_at - self.started_at).total_seconds()


# Component weights for the composite score. Overridable in sway.yaml.
# The four judgment categories sum to 1.0 (0.30 + 0.35 + 0.20 + 0.15).
# ``baseline`` is listed with weight 0.0 so the null-calibration row
# appears in the report for transparency but contributes nothing to the
# composite — it's an informational category, not a judgment one.
# ``SwayScore.weights`` defaults to a *copy* of this mapping, so
# per-score instances never share (or mutate) this module-level dict.
DEFAULT_COMPONENT_WEIGHTS: dict[str, float] = {
    "adherence": 0.30,
    "attribution": 0.35,
    "calibration": 0.20,
    "ablation": 0.15,
    "baseline": 0.0,
}


@dataclass(frozen=True, slots=True)
class SwayScore:
    """Composite score with a transparent per-component breakdown."""

    overall: float
    components: dict[str, float]
    weights: dict[str, float] = field(default_factory=lambda: dict(DEFAULT_COMPONENT_WEIGHTS))
    band: str = ""
    findings: tuple[str, ...] = ()

    @staticmethod
    def band_for(overall: float) -> str:
        """Map a score to a human-readable band.

        Bands (from the plan):
          - <0.3  : indistinguishable from noise
          - 0.3–0.6 : partial fit
          - 0.6–0.85: healthy
          - >0.85 : suspiciously good (possible overfit / memorization)
        """
        if overall < 0.3:
            return "noise"
        if overall < 0.6:
            return "partial"
        if overall <= 0.85:
            return "healthy"
        return "suspicious"


def utcnow() -> datetime:
    """Timezone-aware UTC timestamp (used by the runner)."""
    return datetime.now(UTC)


def _is_non_finite(value: object) -> bool:
    """Return ``True`` when *value* is a ``float`` holding NaN or ±inf.

    ``None``, ``bool`` and ``int`` values are always reported as finite.
    An ``int`` can never be NaN or infinite, so it is not converted to
    ``float`` at all; this also avoids ``OverflowError`` on integers too
    large to convert.
    """
    return isinstance(value, float) and not math.isfinite(value)


def safe_finalize(
    *,
    name: str,
    kind: str,
    verdict: Verdict,
    score: float | None = None,
    raw: float | None = None,
    z_score: float | None = None,
    base_value: float | None = None,
    ft_value: float | None = None,
    evidence: dict[str, Any] | None = None,
    message: str = "",
    duration_s: float = 0.0,
    ci_95: tuple[float, float] | None = None,
    critical_fields: tuple[str, ...] = ("raw",),
) -> ProbeResult:
    """Build a :class:`ProbeResult` with defense against non-finite metrics.

    Probes hand their candidate result kwargs here instead of constructing
    a :class:`ProbeResult` directly. The helper inspects every numeric
    field and classifies it:

    - **Critical field non-finite** (any field named in ``critical_fields``
      whose value is ``NaN`` or ``±inf``): the whole probe result is
      converted to :attr:`Verdict.ERROR` with all scalar fields nulled out
      and ``ci_95`` dropped, every non-finite scalar (critical or not) is
      preserved under ``evidence["non_finite_inputs"]``, and the message
      names the critical field(s) that were non-finite. The caller's
      ``message`` is replaced.
    - **Non-critical field non-finite**: nulled out silently (set to
      ``None``), and the field name appended to
      ``evidence["defensively_nulled"]`` so a report reader can see what
      happened. The caller's ``verdict`` and ``message`` are kept.
    - **Everything finite**: passthrough, no change.

    ``ci_95`` is dropped (set to ``None``) when ``raw`` was nulled, and
    also when either of its own bounds is non-finite; in the latter case
    ``"ci_95"`` is recorded in ``evidence["defensively_nulled"]``.

    The default ``critical_fields = ("raw",)`` reflects the design stance:
    ``raw`` is the probe's ground-truth metric; a non-finite ``raw`` means
    the probe cannot make a meaningful statement. Probes that care about
    other fields (e.g., probes whose ``z_score`` is load-bearing) pass a
    broader tuple.

    This helper is the single shared guardrail sprint 01 installs against
    the +11639σ class of bug, where NaN logprobs flowed silently through
    to a PASS verdict. Every numeric probe is expected to finalize through
    this function.

    Parameters
    ----------
    name, kind, verdict, message, duration_s:
        Forwarded to :class:`ProbeResult` unchanged (except that
        ``verdict`` and ``message`` are overridden on the ERROR path).
    score, raw, z_score, base_value, ft_value:
        Candidate scalar metrics; each is checked for NaN / ±inf.
    evidence:
        Optional evidence payload. It is shallow-copied and neither the
        dict nor a pre-existing ``"defensively_nulled"`` list inside it
        is mutated.
    ci_95:
        Optional ``(lo, hi)`` confidence interval on ``raw``.
    critical_fields:
        Names of scalar fields whose non-finiteness invalidates the whole
        result. Matched only against the five scalar fields above; other
        names (including ``"ci_95"``) have no effect.

    Returns
    -------
    ProbeResult
        The sanitized result, as described above.
    """
    numeric_kwargs: dict[str, float | None] = {
        "score": score,
        "raw": raw,
        "z_score": z_score,
        "base_value": base_value,
        "ft_value": ft_value,
    }

    # Offending values are stored as plain ``float`` for the evidence payload.
    non_finite: dict[str, float] = {
        fname: float(v)
        for fname, v in numeric_kwargs.items()
        if v is not None and _is_non_finite(v)
    }

    # Shallow copy: top-level keys written below never leak into the
    # caller's dict.
    ev: dict[str, Any] = dict(evidence) if evidence is not None else {}

    critical_non_finite = {k: v for k, v in non_finite.items() if k in critical_fields}
    if critical_non_finite:
        ev["non_finite_inputs"] = non_finite
        # ``ci_95`` is deliberately not forwarded: an ERROR result has no
        # point estimate for an interval to bracket.
        return ProbeResult(
            name=name,
            kind=kind,
            verdict=Verdict.ERROR,
            score=None,
            raw=None,
            z_score=None,
            base_value=None,
            ft_value=None,
            evidence=ev,
            message=(
                f"non-finite critical field(s): {', '.join(sorted(critical_non_finite))} "
                f"— probe cannot produce a meaningful result"
            ),
            duration_s=duration_s,
        )

    # A NaN / ±inf bound makes the interval meaningless, so it is treated
    # like any other non-critical numeric field.
    ci_is_non_finite = ci_95 is not None and any(_is_non_finite(bound) for bound in ci_95)

    nulled_names = sorted(non_finite)
    if ci_is_non_finite:
        nulled_names = sorted([*nulled_names, "ci_95"])

    if nulled_names:
        # Build a *new* list instead of ``setdefault(...).extend(...)``:
        # the shallow copy above still shares any existing list with the
        # caller's evidence dict, and extending it in place would mutate
        # the caller's data.
        ev["defensively_nulled"] = [*ev.get("defensively_nulled", ()), *nulled_names]
        for fname in non_finite:
            numeric_kwargs[fname] = None

    # ``ci_95`` is only attached when ``raw`` survived the
    # defensive-null sweep — a CI bracketing a nulled-out point
    # estimate would mislead more than it informs — and when the
    # interval's own bounds are finite.
    keep_ci = numeric_kwargs["raw"] is not None and not ci_is_non_finite
    final_ci_95 = ci_95 if keep_ci else None

    return ProbeResult(
        name=name,
        kind=kind,
        verdict=verdict,
        score=numeric_kwargs["score"],
        raw=numeric_kwargs["raw"],
        z_score=numeric_kwargs["z_score"],
        base_value=numeric_kwargs["base_value"],
        ft_value=numeric_kwargs["ft_value"],
        evidence=ev,
        message=message,
        duration_s=duration_s,
        ci_95=final_ci_95,
    )