"""Typed errors for the dataset-assembly pipeline.

Every error carries enough context to point the user back at a specific
section (and line offset within the section) of the source `.dlm`. The
section grammar errors (`InstructionParseError`, `PreferenceParseError`)
include the section id and the line inside the section's body where the
problem was detected; callers compose the message with the section's
`start_line` from `doc.sections.Section` to recover a file:line location.
"""
| 10 |
|
| 11 |
from __future__ import annotations |
| 12 |
|
| 13 |
|
| 14 |
class DataError(Exception):
    """Root of the `dlm.data` exception hierarchy.

    Catch this type to handle any error raised by `dlm.data` in one place.
    """
| 16 |
|
| 17 |
|
| 18 |
class DataFormatError(DataError):
    """A row's shape is not recognized for SFT / CPT / DPO routing."""
| 20 |
|
| 21 |
|
| 22 |
class TokenizerBringupError(DataError):
    """Loading or fixing up the tokenizer failed.

    Typical causes: a missing `chat_template`, a pad token equal to EOS, etc.
    """
| 24 |
|
| 25 |
|
| 26 |
class SectionParseError(DataError):
    """Base for section-body grammar errors.

    `section_id` is the 16-char content-hash of the source section;
    `section_line` is 1-indexed and measured from the first line *after*
    the opening fence so users can skim to the offending line inside
    their editor.

    Instances survive `pickle` / `copy` round-trips (see `__reduce__`), so
    they can be passed between processes without losing the section context.
    """

    def __init__(self, message: str, *, section_id: str, section_line: int) -> None:
        """Store the message plus the section coordinates of the failure.

        Args:
            message: Human-readable description of the grammar violation.
            section_id: Content-hash id of the section that failed to parse.
            section_line: 1-indexed line inside the section body.
        """
        super().__init__(message)
        self.section_id = section_id
        self.section_line = section_line

    def __reduce__(self) -> tuple[object, ...]:
        """Return a pickle/copy recipe that does not re-run `__init__`.

        The default `BaseException.__reduce__` rebuilds the object with
        `cls(*self.args)`. That raises `TypeError` here because
        `section_id` and `section_line` are required keyword-only
        arguments, so the instance is rebuilt from its args and attribute
        dict instead.
        """
        return (type(self)._rebuild, (self.args, dict(vars(self))))

    @classmethod
    def _rebuild(
        cls, args: tuple[object, ...], state: dict[str, object]
    ) -> "SectionParseError":
        """Recreate an instance from the data captured by `__reduce__`."""
        # BaseException.__new__ stores `args` itself; __init__ is skipped on
        # purpose because the keyword-only fields are restored from `state`.
        exc = cls.__new__(cls, *args)
        vars(exc).update(state)
        return exc
| 39 |
|
| 40 |
|
| 41 |
class InstructionParseError(SectionParseError):
    """The body of an `::instruction::` fence violates the `### Q` / `### A` grammar."""
| 43 |
|
| 44 |
|
| 45 |
class PreferenceParseError(SectionParseError):
    """A section body violates the `### Prompt` / `### Chosen` / `### Rejected` grammar."""