| 1 |
"""`.dlm/ignore` — gitignore-subset parser. |
| 2 |
|
| 3 |
Drive-by exclusions for users who don't want full `.dlm/training.yaml` |
| 4 |
config. Drop a three-line `.dlm/ignore`, done. Grammar is a strict |
| 5 |
subset of `.gitignore`: |
| 6 |
|
| 7 |
Supported: |
| 8 |
- `#` starts a comment; blank lines skipped |
| 9 |
- `**` globstar: matches zero-or-more path components |
| 10 |
- `!pattern` negates — re-includes an otherwise-excluded file |
| 11 |
- Trailing `/` matches directories only |
| 12 |
- Leading `/` anchors to the `.dlm/`'s parent (not any ancestor) |
| 13 |
|
| 14 |
NOT supported (document explicitly in docs/format/dlm-ignore.md): |
| 15 |
- Backslash escapes |
| 16 |
- Character classes `[abc]` |
| 17 |
- Whitespace-escape with backslash |
| 18 |
|
| 19 |
Malformed lines log one WARN and are dropped — ignore files are a |
| 20 |
drive-by UX, a typo shouldn't kill the training run. |
| 21 |
""" |
| 22 |
|
| 23 |
from __future__ import annotations |
| 24 |
|
| 25 |
import functools
import logging
import re
from dataclasses import dataclass
| 28 |
|
| 29 |
_LOG = logging.getLogger(__name__) |
| 30 |
|
| 31 |
|
| 32 |
@dataclass(frozen=True)
class IgnoreRule:
    """A single parsed `.dlm/ignore` line.

    The markers a line may carry (`!` prefix, leading `/`, trailing `/`)
    are peeled off by the parser and recorded as flags, so `pattern`
    holds only the bare glob. Instances are immutable and hashable.
    """

    # Bare glob text, without the leading `!` / `/` or the trailing `/`.
    pattern: str
    # Source line started with `/`: match only from the anchor root.
    anchored: bool
    # Source line ended with `/`: match directories only.
    directory_only: bool
    # Source line started with `!`: re-include a previously-excluded path.
    negate: bool
| 46 |
|
| 47 |
|
| 48 |
def parse_ignore_file(text: str) -> tuple[IgnoreRule, ...]:
    """Parse a `.dlm/ignore` body into an ordered tuple of rules.

    Every line is stripped of surrounding whitespace first; blank lines
    and lines starting with `#` are then skipped. From what remains, the
    markers are peeled off in a fixed order — leading `!` (negate),
    leading `/` (anchored), trailing `/` (directory-only) — and the rest
    becomes the rule's `pattern`. Rules are returned in file order.

    Malformed lines (a marker with no pattern left behind it) log one
    WARN carrying the 1-based line number and are dropped. This function
    never raises on bad input: `.dlm/ignore` is meant to be
    low-ceremony, so a typo in one line must not fail the walk.
    """
    rules: list[IgnoreRule] = []
    for lineno, raw in enumerate(text.splitlines(), start=1):
        # Strip BOTH ends. Indented comments were already tolerated; without
        # the leading strip an indented `!pattern` or `/pattern` lost its
        # marker and was stored as a literal with leading spaces, which
        # silently never matched any path.
        body = raw.strip()
        if not body or body.startswith("#"):
            continue

        negate = body.startswith("!")
        if negate:
            body = body[1:]
            if not body:
                _LOG.warning("dlm/ignore:%d: bare '!' with no pattern; skipping", lineno)
                continue

        anchored = body.startswith("/")
        if anchored:
            body = body[1:]
            if not body:
                _LOG.warning("dlm/ignore:%d: bare '/' with no pattern; skipping", lineno)
                continue

        directory_only = body.endswith("/")
        if directory_only:
            body = body[:-1]
            if not body:
                # e.g. `//`: anchor and directory markers with nothing between.
                _LOG.warning("dlm/ignore:%d: pattern reduced to empty; skipping", lineno)
                continue

        rules.append(
            IgnoreRule(
                pattern=body,
                anchored=anchored,
                directory_only=directory_only,
                negate=negate,
            )
        )
    return tuple(rules)
| 91 |
|
| 92 |
|
| 93 |
def matches(rule: IgnoreRule, relpath: str, *, is_dir: bool) -> bool:
    """Return True if `relpath` is covered by `rule`.

    `relpath` is the POSIX-form path relative to the anchor directory
    (the `.dlm/`'s parent). `is_dir` says whether `relpath` itself names
    a directory; it only matters for `directory_only` rules.
    `rule.negate` is not consulted here.

    Semantics follow `.gitignore`:
    - `anchored=True` matches only when the pattern matches the path
      from position 0 (`src/foo.py` matches `/src/**`, not
      `vendor/src/foo.py`).
    - `anchored=False` matches if any "from this component onward"
      suffix matches (`node_modules/**` matches `a/b/node_modules/c.js`).
    - `directory_only=True` requires the pattern to match a *directory*:
      either `relpath` itself (when `is_dir`) or any ancestor directory
      of it. So `build/` covers `build/x.o` and `a/build/x.o` but not a
      file literally named `build`; `/build/` covers `build/x.o` but not
      `vendor/build/x.o`; `src/gen/` covers `src/gen/a.py`.
    """
    regex = _compile_ignore_pattern(rule.pattern)
    parts = relpath.split("/")

    def _prefix_matches(end: int) -> bool:
        """True if the pattern matches the path truncated to `parts[:end]`."""
        # Anchored rules may only start at the root; unanchored rules may
        # start at any component boundary.
        starts = (0,) if rule.anchored else range(end)
        return any(
            regex.fullmatch("/".join(parts[start:end])) is not None
            for start in starts
        )

    if not rule.directory_only:
        return _prefix_matches(len(parts))

    # Directory-only: the pattern must land on a directory. For a file,
    # the final component is not a directory, so only its ancestors count;
    # for a directory the full path counts too. Checking whole prefixes
    # (rather than single components) is what lets anchored rules and
    # multi-component patterns cover the contents of the directory.
    dir_depth = len(parts) if is_dir else len(parts) - 1
    return any(_prefix_matches(end) for end in range(1, dir_depth + 1))
| 142 |
|
| 143 |
|
| 144 |
def _compile_ignore_pattern(pattern: str) -> re.Pattern[str]: |
| 145 |
"""Translate the ignore-file glob grammar to a regex. |
| 146 |
|
| 147 |
Shares the `**` / `*` / `?` semantics with |
| 148 |
`dlm.directives.safety._compile_glob`, which the directive |
| 149 |
include/exclude filters already use. Keeping the logic in sync |
| 150 |
matters — a user who writes `tests/**` in `training.yaml.exclude` |
| 151 |
should get the same match set as writing it in `.dlm/ignore`. |
| 152 |
""" |
| 153 |
i = 0 |
| 154 |
n = len(pattern) |
| 155 |
out: list[str] = ["^"] |
| 156 |
while i < n: |
| 157 |
c = pattern[i] |
| 158 |
if c == "*": |
| 159 |
if i + 1 < n and pattern[i + 1] == "*": |
| 160 |
out.append(".*") |
| 161 |
i += 2 |
| 162 |
if i < n and pattern[i] == "/": |
| 163 |
i += 1 |
| 164 |
else: |
| 165 |
out.append("[^/]*") |
| 166 |
i += 1 |
| 167 |
elif c == "?": |
| 168 |
out.append("[^/]") |
| 169 |
i += 1 |
| 170 |
else: |
| 171 |
out.append(re.escape(c)) |
| 172 |
i += 1 |
| 173 |
out.append("$") |
| 174 |
return re.compile("".join(out)) |