[project]
name = "document-language-model"
version = "0.10.0"
description = "Directive-driven local LLM training, retraining, and export from .dlm documents, codebases, and multimodal sources."
readme = "README.md"
requires-python = ">=3.11"
license = { text = "MIT" }
authors = [{ name = "espadonne", email = "mfwolffe@outlook.com" }]
keywords = ["llm", "lora", "fine-tuning", "ollama", "local-ai"]
classifiers = [
    "Development Status :: 2 - Pre-Alpha",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
dependencies = [
    # CLI / doc / lightweight plumbing.
    "typer>=0.12",
    "rich>=13.7",
    "prompt-toolkit>=3.0", # dlm repl interactive prompt (Sprint 24)
    "watchfiles>=0.24", # dlm train --watch save-to-train loop (Sprint 25)
    "python-ulid>=3.0", # dlm_id generation (Sprint 13)
    "pydantic>=2.9", # schema validation (Sprint 03+)
    "pyyaml>=6.0", # .dlm frontmatter (Sprint 03+)
    "psutil>=6.0", # hardware doctor (Sprint 05)
    "zstandard>=0.23", # replay corpus framing (Sprint 08)
    "cbor2>=5.6", # replay snapshot encoding (Sprint 08)
    "packaging>=24.0", # lock policy semver parsing (Sprint 15)

    # ML runtime. `dlm train`, `dlm prompt`, `dlm export` all import these;
    # a `pip install dlm` that omits them would ImportError on first call
    # (audit-05 B1).
    "torch>=2.4",
    "transformers>=4.45",
    "peft>=0.13",
    "trl>=0.12",
    "datasets>=3.0",
    "huggingface-hub>=0.25",
    "accelerate>=1.0", # dlm train --gpus launcher (Sprint 23; audit-08 M2)
    "safetensors>=0.4", # GGUF conversion + adapter I/O (Sprint 11)
    "sentencepiece>=0.2", # vendored llama.cpp convert_lora_to_gguf imports it (audit-08 P3)
]

[project.optional-dependencies]
# CUDA-only. `bitsandbytes` won't install on macOS / CPU-only boxes; gate
# behind an extra so `pip install dlm` stays portable and `pip install
# dlm[cuda]` unlocks QLoRA.
cuda = [
    "bitsandbytes>=0.43",
]
# Apple Silicon only (Sprint 21). `mlx` + `mlx-lm` wheels are darwin-arm64
# exclusives; env markers keep `uv sync --extra mlx` a no-op on non-Apple
# hosts so wheel resolution doesn't fail for Linux/CUDA contributors.
mlx = [
    "mlx>=0.18; sys_platform == 'darwin' and platform_machine == 'arm64'",
    "mlx-lm>=0.19; sys_platform == 'darwin' and platform_machine == 'arm64'",
]
# Sprint 26 observability sinks. TensorBoard + W&B are optional — core
# metrics (SQLite) work without them. `uv sync --extra observability`
# unlocks `dlm train --tensorboard` and `dlm train --wandb <project>`.
observability = [
    "tensorboard>=2.15",
    "wandb>=0.17",
]
# Sprint 35.2 audio-language bases. `soundfile` is the only dep needed
# to decode .wav / .flac / .ogg; MP3 support would require libsndfile
# ≥1.1 and is deferred. Keep optional so text-only users don't pull
# the libsndfile C library. `soxr` is the polyphase resampler used by
# `training.audio.auto_resample=True`; without it (or scipy as a
# fallback) the resample path raises `AudioResampleUnavailable`
# rather than training on the wrong sample rate.
audio = [
    "soundfile>=0.12",
    "soxr>=0.3",
]
# Sprint 43 synth teachers. API clients are optional — `dlm synth`
# works with `self` and `hf:` teachers without these. Install via
# `pip install dlm[openai]` or `pip install dlm[anthropic]`.
openai = [
    "openai>=1.0",
]
anthropic = [
    "anthropic>=0.30",
]
# Sprint 26 (X1) cross-repo bridge: `dlm export --emit-sway-json` calls
# into ``dlm_sway.integrations.dlm.autogen`` to write a ready-to-run
# sway.yaml alongside the GGUF. NB: pulls plain `dlm-sway`, NOT
# `dlm-sway[dlm]` — `dlm-sway[dlm]` would round-trip back to this
# package and create a pip resolver cycle. Plain `dlm-sway` is enough
# because `build_spec_dict` operates on data structures the dlm
# bridge already exposes from sway's side.
sway = [
    "dlm-sway>=0.1.0",
]

[project.scripts]
dlm = "dlm.cli.app:main"

[project.urls]
Homepage = "https://github.com/tenseleyFlow/DocumentLanguageModel"
Issues = "https://github.com/tenseleyFlow/DocumentLanguageModel/issues"

[dependency-groups]
dev = [
    # Test + lint tooling only. ML runtime moved to [project].dependencies
    # (audit-05 B1) so `pip install dlm` gives users a working CLI.
    "pytest>=8.0",
    "pytest-cov>=5.0",
    "mypy>=1.11",
    "ruff>=0.6",
    "types-pyyaml>=6.0",
    "types-psutil>=6.0",
    "hypothesis>=6.152.1",
]
docs = [
    # Sprint 16: MkDocs Material site. Separated from `dev` so contributors
    # working on code don't pay the docs dep surface.
    "mkdocs>=1.6",
    "mkdocs-material>=9.5",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/dlm"]

# -------- ruff --------
[tool.ruff]
line-length = 100
target-version = "py311"
src = ["src", "tests"]
# Vendored third-party code (llama.cpp) — don't lint or format it.
extend-exclude = ["vendor/llama.cpp"]

[tool.ruff.lint]
select = [
    "E",   # pycodestyle errors
    "F",   # pyflakes
    "W",   # pycodestyle warnings
    "I",   # isort
    "UP",  # pyupgrade
    "B",   # bugbear
    "N",   # pep8-naming
    "C4",  # comprehensions
    "SIM", # simplify
    "PT",  # pytest
    "RET", # return
    "ARG", # unused args
    "PTH", # use pathlib
    "TID", # tidy imports
]
ignore = [
    "E501", # handled by formatter
]

[tool.ruff.lint.per-file-ignores]
# SIM117 (combinable `with`) is noisy in tests that stream tar+zstd
# into a sequence of readers; the nested form is clearer than the
# comma-separated one.
"tests/**/*.py" = ["ARG", "PT011", "SIM117"]
# Typer stub subcommands accept every CLI arg the real implementation
# will take so `--help` reflects the shipping surface — even though the
# stub body discards them.
"src/dlm/cli/commands.py" = ["ARG001"]
# HuggingFace Trainer callbacks MUST accept `args`/`state`/`control`
# positionally even when the implementation only reads some of them —
# HF dispatches them by position. ARG002 for these wrappers is noise.
"src/dlm/train/cpt/embed_warmup.py" = ["ARG002"]
# Modality dispatch uses a polymorphic interface — each subclass uses
# a different subset of the keyword args (text.dispatch_export reads
# none, VL reads gguf_emission_context, audio ignores it). ARG002
# flags the unused ones in each branch; the shared signature is the
# point of the abstraction.
"src/dlm/modality/*.py" = ["ARG002"]

[tool.ruff.format]
quote-style = "double"
indent-style = "space"

# -------- mypy --------
[tool.mypy]
strict = true
python_version = "3.11"
packages = ["dlm"]
mypy_path = "src"
warn_return_any = true
warn_unused_ignores = true
warn_redundant_casts = true
no_implicit_optional = true
disallow_untyped_decorators = true
plugins = ["pydantic.mypy"]

[tool.pydantic-mypy]
init_forbid_extra = true
init_typed = true
warn_required_dynamic_aliases = true

# HF ecosystem packages (Sprint 06/07/09) ship without stubs or py.typed markers.
# Sprint 07 modules touch them through narrow boundaries; annotations are explicit
# at our boundary so the rest of the codebase remains strict.
[[tool.mypy.overrides]]
module = [
    "datasets.*",
    "transformers.*",
    "peft.*",
    "trl.*",
    "huggingface_hub.*",
    "bitsandbytes.*",
]
ignore_missing_imports = true
# Treat calls into these stubless ecosystems as Any-typed; their
# actual return types aren't reflected in stubs.
disable_error_code = ["no-untyped-call"]

# Optional runtime deps used behind feature gates may be absent or
# ship without type metadata on some CI runners. Keep those imports
# narrow and local in code, and suppress missing-stub noise here
# instead of scattering environment-specific inline ignores.
[[tool.mypy.overrides]]
module = [
    "soxr",
    "scipy",
    "scipy.*",
    "mlx_lm",
    "mlx_lm.*",
]
ignore_missing_imports = true
disable_error_code = ["no-untyped-call"]

# -------- pytest --------
[tool.pytest.ini_options]
testpaths = ["tests"]
addopts = [
    "-ra",
    "-m", "not slow and not gpu and not online and not vl and not audio and not ollama",
]
markers = [
    "slow: expensive; deselected by default",
    "gpu: requires CUDA; skipped on CPU/MPS runners",
    "online: touches the network; skipped in offline CI",
    "vl: vision-language; requires a GPU + VL HF weights; deselected by default",
    "audio: audio-language; requires a GPU + audio HF weights; deselected by default",
    "ollama: requires a local Ollama install (0.4+); deselected by default",
]