"""`.dlm/training.yaml` per-codebase config.

Distinct from the `.dlm` frontmatter schema — different file, different
shape, different namespace. A codebase drops this alongside its source
to declare what `dlm train` should ingest when a directive descends
into the tree. The nearest-ancestor `.dlm/training.yaml` wins for each
file under its subtree, matching `.gitignore`'s resolution semantics.

See `dlm.directives.merge` for the full precedence table and
`docs/format/dlm-training-yaml.md` for the user-facing reference.
"""

from __future__ import annotations

from collections.abc import Mapping
from typing import Literal

from pydantic import BaseModel, ConfigDict, Field, field_validator


class DlmTrainingConfig(BaseModel):
    """Per-subtree training config discovered at `.dlm/training.yaml`.

    All fields optional — an empty config (just `dlm_training_version: 1`)
    is legal and means "no refinement at this level", which is useful
    as a placeholder marker while drafting.

    `exclude_defaults` controls whether `dlm.directives.defaults.DEFAULT_EXCLUDES`
    applies at this subtree. Most users want it True (secrets, VCS,
    build artifacts skipped automatically); trees that legitimately
    train on e.g. generated code can set it False to opt out.

    `metadata` flows onto every `Section` synthesized from this
    subtree via `Section.tags`. Tags do NOT affect `section_id`, so
    adjusting metadata doesn't invalidate the replay corpus.

    `weights` scales per-row training exposure by `(tag_key, tag_value)`.
    Resolution is multiplicative across tag keys and merges shallow-to-
    deep like `metadata` — deeper configs override shallower keys. A
    row with tags `{domain: auth, generated: true}` under a tree where
    the root sets `weights.domain.auth = 2.0` and a subtree sets
    `weights.generated.true = 0.5` ends up at effective weight 1.0.
    Rows with no matching tag/value get weight 1.0.

    Every weight must be ≥ 0. Negative values and NaN are rejected when
    the model is validated (see `_validate_weights`).
    """

    # extra="forbid": unknown keys in the YAML are a validation error
    # rather than being silently dropped. frozen=True: fields cannot be
    # reassigned on a constructed instance.
    model_config = ConfigDict(extra="forbid", frozen=True)

    # Schema version marker; only the literal value 1 validates.
    dlm_training_version: Literal[1] = 1
    # Include / exclude entries for this subtree. How they are matched is
    # decided by the code that consumes this config, not by this model.
    include: tuple[str, ...] = ()
    exclude: tuple[str, ...] = ()
    # Whether the project-wide default excludes apply at this subtree.
    exclude_defaults: bool = True
    # Tag key -> tag value, attached to sections from this subtree.
    metadata: Mapping[str, str] = Field(default_factory=dict)
    # Tag key -> tag value -> non-negative scale factor.
    weights: Mapping[str, Mapping[str, float]] = Field(default_factory=dict)

    @field_validator("weights")
    @classmethod
    def _validate_weights(
        cls, value: Mapping[str, Mapping[str, float]]
    ) -> Mapping[str, Mapping[str, float]]:
        """Reject any weight that is not a number ≥ 0.

        Args:
            value: The `weights` mapping, `tag_key -> tag_value -> scale`.

        Returns:
            `value` unchanged when every scale passes the check.

        Raises:
            ValueError: If any scale is negative or NaN. The message names
                the offending `weights[tag_key][tag_value]` entry.
        """
        for tag_key, inner in value.items():
            for tag_value, scale in inner.items():
                # Deliberately `not scale >= 0` instead of `scale < 0`:
                # every ordered comparison with NaN is False, so the
                # `< 0` form would wave a NaN weight straight through.
                if not scale >= 0:
                    raise ValueError(
                        f"weights[{tag_key!r}][{tag_value!r}] must be ≥ 0, "
                        f"got {scale}. Negative weights don't have a "
                        f"well-defined meaning under row-repetition expansion."
                    )
        return value