Python · 1601 bytes Raw Blame History
1 """Typed errors for the dataset-assembly pipeline.
2
Errors are meant to point the user back at a specific section (and line
offset within that section) of the source `.dlm`. The section grammar
errors (`InstructionParseError`, `PreferenceParseError`) carry the section
id plus the line inside the section's body where the problem was detected;
callers combine that line with the section's `start_line` from
`doc.sections.Section` to recover a file:line location.
9 """
10
11 from __future__ import annotations
12
13
class DataError(Exception):
    """Root of the `dlm.data` exception hierarchy.

    Every error defined for `dlm.data` derives from this class.
    """
16
17
class DataFormatError(DataError):
    """A row's shape is not one recognized for SFT / CPT / DPO routing."""
20
21
class TokenizerBringupError(DataError):
    """Loading or fixing up the tokenizer failed.

    Examples: missing chat_template, pad == EOS, etc.
    """
24
25
class SectionParseError(DataError):
    """Base for section-body grammar errors.

    `section_id` is the 16-char content-hash of the source section;
    `section_line` is 1-indexed and measured from the first line *after*
    the opening fence so users can skim to the offending line inside
    their editor.

    Instances survive `pickle` / `copy` round-trips (see `__reduce__`), so
    they can be propagated across process boundaries with their section
    context intact.
    """

    def __init__(self, message: str, *, section_id: str, section_line: int) -> None:
        """Create the error.

        Args:
            message: Human-readable description; becomes `args[0]` / `str(err)`.
            section_id: Identifier of the section the error was found in.
            section_line: Line inside the section body where it was detected.
        """
        super().__init__(message)
        self.section_id = section_id
        self.section_line = section_line

    def __reduce__(self) -> tuple[object, ...]:
        """Describe how to rebuild this error for `pickle` and `copy`.

        The inherited `BaseException.__reduce__` reconstructs with
        `cls(*self.args)`, which raises `TypeError` here because
        `section_id` and `section_line` are required keyword-only
        arguments. Route reconstruction through a helper that passes them
        explicitly. The instance `__dict__` is forwarded as state so any
        extra attributes set on the instance are restored as well.
        """
        rebuild_args = (type(self), self.args, self.section_id, self.section_line)
        return (_rebuild_section_parse_error, rebuild_args, dict(vars(self)))


def _rebuild_section_parse_error(
    cls: type[SectionParseError],
    args: tuple[object, ...],
    section_id: str,
    section_line: int,
) -> SectionParseError:
    """Re-create a `SectionParseError` (or subclass) during unpickling/copying."""
    return cls(*args, section_id=section_id, section_line=section_line)
39
40
class InstructionParseError(SectionParseError):
    """The `### Q` / `### A` grammar was violated within an `::instruction::` fence."""
43
44
class PreferenceParseError(SectionParseError):
    """The `### Prompt` / `### Chosen` / `### Rejected` grammar was violated."""