"""Typed errors for the dataset-assembly pipeline.

Every error carries enough context to point the user back at a specific
section (and line offset within the section) of the source `.dlm`. The
section grammar errors (`InstructionParseError`, `PreferenceParseError`)
include the section id + the line inside the section's body where the
problem was detected; callers compose the message with the section's
`start_line` from `doc.sections.Section` to recover a file:line location.
"""

from __future__ import annotations


class DataError(Exception):
    """Root of the `dlm.data` error hierarchy.

    Every `dlm.data` error derives from this class.
    """


class DataFormatError(DataError):
    """A row's shape matches none of the forms recognized for SFT / CPT / DPO routing."""


class TokenizerBringupError(DataError):
    """Loading or fixing up the tokenizer failed.

    Typical causes: a missing chat_template, pad == EOS, etc.
    """


class SectionParseError(DataError):
    """Base for section-body grammar errors.

    `section_id` is the 16-char content-hash of the source section;
    `section_line` is 1-indexed and measured from the first line *after*
    the opening fence so users can skim to the offending line inside
    their editor.

    Instances can be pickled and copied: `__reduce__` re-supplies the
    keyword-only constructor arguments that the default exception
    reduction would drop.

    Args:
        message: Human-readable description of the grammar violation.
        section_id: Identifier of the section the error was found in.
        section_line: Line offset of the problem inside the section body.
    """

    def __init__(self, message: str, *, section_id: str, section_line: int) -> None:
        super().__init__(message)
        self.section_id = section_id
        self.section_line = section_line

    def __reduce__(self) -> tuple[object, ...]:
        """Return a reduction that rebuilds the error with its keyword arguments.

        The default `BaseException.__reduce__` reconstructs an exception as
        `cls(*self.args)`, which raises `TypeError` here because `section_id`
        and `section_line` are required keyword-only parameters.
        """
        # Local import keeps the block self-contained; the module has no
        # other use for functools.
        import functools

        rebuild = functools.partial(
            type(self),
            section_id=self.section_id,
            section_line=self.section_line,
        )
        # The instance dict travels as state so attributes set after
        # construction are restored as well, matching the default
        # exception reduction.
        return (rebuild, self.args, self.__dict__)


class InstructionParseError(SectionParseError):
    """The body of an `::instruction::` fence violates the `### Q` / `### A` grammar."""


class PreferenceParseError(SectionParseError):
    """A section body violates the `### Prompt` / `### Chosen` / `### Rejected` grammar."""