documentlanguagemodel Public

Watch 0 Fork 0 Star 0

Python · 1601 bytes Raw Blame History

  
        """Typed errors for the dataset-assembly pipeline.

        Every error carries enough context to point the user back at a specific
        section (and line offset within the section) of the source `.dlm`. The
        section grammar errors (`InstructionParseError`, `PreferenceParseError`)
        include the section id + the line inside the section's body where the
        problem was detected; callers compose the message with the section's
        `start_line` from `doc.sections.Section` to recover a file:line location.
        """
      
        10
        
        11
        from __future__ import annotations
      
        12
        
        13
        
        14
        class DataError(Exception):
            """Root of the `dlm.data` exception hierarchy.

            Catch this type to handle any error defined in this module. It adds
            no state or behavior of its own on top of `Exception`.
            """
      
        16
        
        17
        
        18
        class DataFormatError(DataError):
            """A row's shape is not one recognized for SFT / CPT / DPO routing."""
      
        20
        
        21
        
        22
        class TokenizerBringupError(DataError):
            """Tokenizer load / fixup failed (missing chat_template, pad == EOS, etc)."""
      
        24
        
        25
        
        26
        class SectionParseError(DataError):
            """Base for section-body grammar errors.

            Instances expose two attributes describing where the problem is:

            `section_id` is the 16-char content-hash of the source section;
            `section_line` is 1-indexed and measured from the first line *after*
            the opening fence so users can skim to the offending line inside
            their editor.
            """

            def __init__(self, message: str, *, section_id: str, section_line: int) -> None:
                """Record the message and the location of the grammar violation.

                Args:
                    message: Description of the problem; becomes `str(exc)`.
                    section_id: Identifier of the section whose body failed to parse.
                    section_line: 1-indexed line offset inside the section body.
                """
                # Only the message is forwarded to `Exception`; the location is
                # kept as plain attributes so callers can read it separately.
                super().__init__(message)
                self.section_id = section_id
                self.section_line = section_line
      
        39
        
        40
        
        41
        class InstructionParseError(SectionParseError):
            """`### Q` / `### A` grammar violation inside an `::instruction::` fence.

            Carries the `section_id` / `section_line` location attributes
            inherited from `SectionParseError`.
            """
      
        43
        
        44
        
        45
        class PreferenceParseError(SectionParseError):
            """`### Prompt` / `### Chosen` / `### Rejected` grammar violation.

            Carries the `section_id` / `section_line` location attributes
            inherited from `SectionParseError`.
            """

"""Typed errors for the dataset-assembly pipeline.

Every error carries enough context to point the user back at a specific
section (and line offset within the section) of the source `.dlm`. The
section grammar errors (`InstructionParseError`, `PreferenceParseError`)
include the section id + the line inside the section's body where the
problem was detected; callers compose the message with the section's
`start_line` from `doc.sections.Section` to recover a file:line location.
"""
10
11	from __future__ import annotations
12
13
class DataError(Exception):
    """Root of the `dlm.data` exception hierarchy.

    Catch this type to handle any error defined in this module. It adds
    no state or behavior of its own on top of `Exception`.
    """
16
17
class DataFormatError(DataError):
    """A row's shape is not one recognized for SFT / CPT / DPO routing."""
20
21
class TokenizerBringupError(DataError):
    """Tokenizer load / fixup failed (missing chat_template, pad == EOS, etc)."""
24
25
class SectionParseError(DataError):
    """Base for section-body grammar errors.

    Instances expose two attributes describing where the problem is:

    `section_id` is the 16-char content-hash of the source section;
    `section_line` is 1-indexed and measured from the first line *after*
    the opening fence so users can skim to the offending line inside
    their editor.
    """

    def __init__(self, message: str, *, section_id: str, section_line: int) -> None:
        """Record the message and the location of the grammar violation.

        Args:
            message: Description of the problem; becomes `str(exc)`.
            section_id: Identifier of the section whose body failed to parse.
            section_line: 1-indexed line offset inside the section body.
        """
        # Only the message is forwarded to `Exception`; the location is
        # kept as plain attributes so callers can read it separately.
        super().__init__(message)
        self.section_id = section_id
        self.section_line = section_line
39
40
class InstructionParseError(SectionParseError):
    """`### Q` / `### A` grammar violation inside an `::instruction::` fence.

    Carries the `section_id` / `section_line` location attributes
    inherited from `SectionParseError`.
    """
43
44
class PreferenceParseError(SectionParseError):
    """`### Prompt` / `### Chosen` / `### Rejected` grammar violation.

    Carries the `section_id` / `section_line` location attributes
    inherited from `SectionParseError`.
    """