Python · 5931 bytes Raw Blame History
1 """`.dlm/ignore` — gitignore-subset parser.
2
3 Drive-by exclusions for users who don't want full `.dlm/training.yaml`
4 config. Drop a three-line `.dlm/ignore`, done. Grammar is a strict
5 subset of `.gitignore`:
6
7 Supported:
8 - `#` starts a comment; blank lines skipped
9 - `**` globstar: matches zero-or-more path components
10 - `!pattern` negates — re-includes an otherwise-excluded file
11 - Trailing `/` matches directories only
12 - Leading `/` anchors to the `.dlm/`'s parent (not any ancestor)
13
14 NOT supported (document explicitly in docs/format/dlm-ignore.md):
15 - Backslash escapes
16 - Character classes `[abc]`
17 - Whitespace-escape with backslash
18
19 Malformed lines log one WARN and are dropped — ignore files are a
20 drive-by UX, a typo shouldn't kill the training run.
21 """
22
23 from __future__ import annotations
24
25 import logging
26 import re
27 from dataclasses import dataclass
28
29 _LOG = logging.getLogger(__name__)
30
31
@dataclass(frozen=True)
class IgnoreRule:
    """A single parsed `.dlm/ignore` entry.

    The syntactic markers of the source line are recorded as boolean
    flags, leaving only the glob text itself in `pattern`. Instances
    are immutable.
    """

    # Glob text with the leading `!`, leading `/` and trailing `/` removed.
    pattern: str
    # Source line began with `/`: match only from the anchor root.
    anchored: bool
    # Source line ended with `/`: match directories only.
    directory_only: bool
    # Source line began with `!`: re-include a previously-excluded path.
    negate: bool
46
47
def parse_ignore_file(text: str) -> tuple[IgnoreRule, ...]:
    """Turn the text of a `.dlm/ignore` file into an ordered tuple of rules.

    Blank lines and `#` comments produce nothing. A line that has no
    glob left once its `!` / `/` markers are peeled off is reported with
    a single warning and skipped rather than raised, so one typo in a
    low-ceremony ignore file cannot abort the walk that uses the rules.
    """
    parsed: list[IgnoreRule] = []
    for number, source_line in enumerate(text.splitlines(), start=1):
        glob = source_line.rstrip()
        # Comment detection tolerates leading whitespace before `#`.
        if glob == "" or glob.lstrip()[:1] == "#":
            continue

        # Markers are peeled in a fixed order: `!`, then a leading `/`,
        # then a trailing `/`. Any peel may leave nothing behind.
        is_negated = glob[0] == "!"
        if is_negated:
            glob = glob[1:]
            if glob == "":
                _LOG.warning("dlm/ignore:%d: bare '!' with no pattern; skipping", number)
                continue

        is_anchored = glob[0] == "/"
        if is_anchored:
            glob = glob[1:]
            if glob == "":
                _LOG.warning("dlm/ignore:%d: bare '/' with no pattern; skipping", number)
                continue

        dirs_only = glob[-1] == "/"
        if dirs_only:
            glob = glob[:-1]
            if glob == "":
                _LOG.warning("dlm/ignore:%d: pattern reduced to empty; skipping", number)
                continue

        parsed.append(
            IgnoreRule(
                pattern=glob,
                anchored=is_anchored,
                directory_only=dirs_only,
                negate=is_negated,
            )
        )
    return tuple(parsed)
91
92
def matches(rule: IgnoreRule, relpath: str, *, is_dir: bool) -> bool:
    """Return True if `relpath` matches `rule`.

    `relpath` is the POSIX-form path relative to the anchor directory
    (the `.dlm/`'s parent). `is_dir` is used to honor `directory_only`.

    Semantics follow `.gitignore`:
    - `anchored=True` matches only when the pattern matches the path
      from position 0 (`src/foo.py` matches `/src/**`, not `vendor/src/foo.py`).
    - `anchored=False` matches if any "from this component onward"
      suffix matches (`node_modules/**` matches `a/b/node_modules/c.js`).
    - `directory_only=True` counts a match on `relpath` itself only when
      `is_dir` is True. In addition, a path is covered when any of its
      ancestor *directories* matches the pattern under the same
      anchoring rule, so `build/` flags files under any directory named
      `build`, `/build/` flags files under the root-level `build`, and
      `docs/generated/` flags files under any `docs/generated`.
    """
    regex = _compile_ignore_pattern(rule.pattern)
    parts = relpath.split("/")

    def _pattern_hits(components: list[str]) -> bool:
        """Match the pattern against one path, honoring `rule.anchored`."""
        if rule.anchored:
            return regex.fullmatch("/".join(components)) is not None
        # Unanchored: the pattern may start at any component boundary.
        return any(
            regex.fullmatch("/".join(components[start:])) is not None
            for start in range(len(components))
        )

    # A match on the path itself counts unless the rule is directory-only
    # and the path is a file: a file literally named `build` must not
    # match `build/`.
    if _pattern_hits(parts) and (is_dir or not rule.directory_only):
        return True

    if rule.directory_only:
        # Every proper prefix of the path is an ancestor directory. Test
        # whole ancestor paths (not isolated components) so that anchored
        # and multi-component patterns cover the files beneath them too.
        return any(_pattern_hits(parts[:end]) for end in range(1, len(parts)))

    return False
142
143
144 def _compile_ignore_pattern(pattern: str) -> re.Pattern[str]:
145 """Translate the ignore-file glob grammar to a regex.
146
147 Shares the `**` / `*` / `?` semantics with
148 `dlm.directives.safety._compile_glob`, which the directive
149 include/exclude filters already use. Keeping the logic in sync
150 matters — a user who writes `tests/**` in `training.yaml.exclude`
151 should get the same match set as writing it in `.dlm/ignore`.
152 """
153 i = 0
154 n = len(pattern)
155 out: list[str] = ["^"]
156 while i < n:
157 c = pattern[i]
158 if c == "*":
159 if i + 1 < n and pattern[i + 1] == "*":
160 out.append(".*")
161 i += 2
162 if i < n and pattern[i] == "/":
163 i += 1
164 else:
165 out.append("[^/]*")
166 i += 1
167 elif c == "?":
168 out.append("[^/]")
169 i += 1
170 else:
171 out.append(re.escape(c))
172 i += 1
173 out.append("$")
174 return re.compile("".join(out))