Python · 2510 bytes Raw Blame History
1 """Curated default-exclude patterns applied unless opted out.
2
3 Everyone training on a codebase tree wants the same starter set gone:
4 VCS metadata, secrets, build artifacts, lockfiles, binaries. Shipping
5 these as defaults removes the "why is my adapter memorizing
6 package-lock.json" foot-gun from the common path.
7
8 `training.yaml` can opt out per-subtree via `exclude_defaults: false`
9 when a tree legitimately wants to train on (e.g.) generated code or
10 the `.git` dir itself. Defaults are a *starting point*, not a security
11 boundary — users with real secrets still need their own excludes.
12 """
13
14 from __future__ import annotations
15
16 from typing import Final
17
DEFAULT_EXCLUDES: Final[tuple[str, ...]] = (
    # Glob patterns dropped from a training tree by default. Each group
    # below is unpacked in place, so the resulting tuple is flat and keeps
    # exactly the order in which the patterns are listed here.
    #
    # NOTE(review): several directory patterns (`.git/**`, `.venv/**`,
    # `node_modules/**`, `target/**`, `build/**`, ...) have no leading
    # `**/`. Whether they also catch nested copies depends on the matcher
    # that consumes this tuple — confirm against the walker.
    #
    # -- VCS bookkeeping directories --
    *(".git/**", ".hg/**", ".svn/**"),
    # -- Credentials and machine-local settings --
    # Best-effort only: this list is not a security boundary, and a tree
    # holding real secrets still needs its own explicit excludes.
    *(
        ".env",
        ".env.*",
        "**/.env",
        "**/.env.*",
        "**/id_rsa",
        "**/id_ed25519",
        "**/*.pem",
        "**/*.key",
        "**/secrets.*",
    ),
    # -- Python bytecode caches and virtualenv / tox directories --
    *(
        "**/__pycache__/**",
        "**/*.pyc",
        "**/*.pyo",
        ".venv/**",
        "venv/**",
        ".tox/**",
    ),
    # -- Node / JS: vendored deps, minified bundles, source maps --
    *("node_modules/**", "**/*.min.js", "**/*.min.css", "**/*.map"),
    # -- Compiled output from Rust / Go / Java / C / C++ toolchains --
    *(
        "target/**",
        "**/*.rlib",
        "**/*.class",
        "**/*.jar",
        "**/*.o",
        "**/*.so",
        "**/*.dylib",
        "**/*.dll",
    ),
    # -- Build, dist and generated-code trees --
    *("build/**", "dist/**", "__generated__/**", "generated/**"),
    # -- Lockfiles: long, low training signal, noisy to diff --
    *(
        "**/package-lock.json",
        "**/yarn.lock",
        "**/pnpm-lock.yaml",
        "**/Cargo.lock",
        "**/uv.lock",
        "**/poetry.lock",
        "**/Pipfile.lock",
    ),
    # -- Archives and non-image binaries --
    # png/jpg/jpeg/gif/webp/bmp/tiff are deliberately absent: schema v10
    # made images first-class training material via `SectionType.IMAGE`,
    # and `expand_sources` refuses to ingest them unless the caller
    # supplies a `BlobStore`. Text-only workflows that pass no blob store
    # still see zero images, because the walker tallies them under
    # `skipped_image_no_store` instead of emitting Sections.
    *(
        "**/*.ico",
        "**/*.pdf",
        "**/*.zip",
        "**/*.tar",
        "**/*.gz",
        "**/*.xz",
        "**/*.bz2",
        "**/*.7z",
        "**/*.wasm",
    ),
    # -- dlm's own config: don't train on the training config --
    *(".dlm/**", "**/.dlm/**"),
)