1 """Dataset assembly — turn parsed `.dlm` sections into a ready-to-train dataset.
2
3 Heavy imports (`datasets`, `transformers`, `trl`, `peft`) are deferred
4 to the call sites that actually use them, so `import dlm.data` stays
5 cheap even when the training stack isn't installed.
6 """
7
8 from __future__ import annotations
9
10 from dlm.data.dataset_builder import build_dataset
11 from dlm.data.errors import (
12 DataError,
13 DataFormatError,
14 InstructionParseError,
15 PreferenceParseError,
16 SectionParseError,
17 TokenizerBringupError,
18 )
19 from dlm.data.formatter import FormattingFunc, make_formatting_func
20 from dlm.data.instruction_parser import QAPair, parse_instruction_body
21 from dlm.data.preference_parser import PreferenceTriple, parse_preference_body
22 from dlm.data.sections_to_rows import sections_to_rows
23 from dlm.data.splitter import split
24 from dlm.data.tokenizer_bringup import TokenizerBringup, prepare_tokenizer
25
# Public API of ``dlm.data``. Kept in one-to-one correspondence with the
# re-export imports above (17 names) and sorted lexicographically
# (ASCII order: uppercase class/exception names before lowercase functions).
# When adding a new re-export, add it both to the imports and to this list.
__all__ = [
    "DataError",
    "DataFormatError",
    "FormattingFunc",
    "InstructionParseError",
    "PreferenceParseError",
    "PreferenceTriple",
    "QAPair",
    "SectionParseError",
    "TokenizerBringup",
    "TokenizerBringupError",
    "build_dataset",
    "make_formatting_func",
    "parse_instruction_body",
    "parse_preference_body",
    "prepare_tokenizer",
    "sections_to_rows",
    "split",
]
44 ]