documentlanguagemodel Public

Watch 0 Fork 0 Star 0

Python · 5931 bytes Raw Blame History

  
        1
        """`.dlm/ignore` — gitignore-subset parser.
      
        2
        
        3
        Drive-by exclusions for users who don't want full `.dlm/training.yaml`
      
        4
        config. Drop a three-line `.dlm/ignore`, done. Grammar is a strict
      
        5
        subset of `.gitignore`:
      
        6
        
        7
        Supported:
      
        8
        - `#` starts a comment; blank lines skipped
      
        9
        - `**` globstar: matches zero-or-more path components
      
        10
        - `!pattern` negates — re-includes an otherwise-excluded file
      
        11
        - Trailing `/` matches directories only
      
        12
        - Leading `/` anchors to the `.dlm/`'s parent (not any ancestor)
      
        13
        
        14
        NOT supported (document explicitly in docs/format/dlm-ignore.md):
      
        15
        - Backslash escapes
      
        16
        - Character classes `[abc]`
      
        17
        - Whitespace-escape with backslash
      
        18
        
        19
        Malformed lines log one WARN and are dropped — ignore files are a
      
        20
        drive-by UX, a typo shouldn't kill the training run.
      
        21
        """
      
        22
        
        23
        from __future__ import annotations
      
        24
        
        25
        import logging
      
        26
        import re
      
        27
        from dataclasses import dataclass
      
        28
        
        29
        _LOG = logging.getLogger(__name__)
      
        30
        
        31
        
        32
        @dataclass(frozen=True)
        class IgnoreRule:
            """A single parsed line of a `.dlm/ignore` file.

            Immutable record. The marker characters of the source line
            (`!`, leading `/`, trailing `/`) are not part of `pattern`;
            they are carried by the three boolean flags instead.
            """

            # Glob text with the leading `!` / `/` and the trailing `/` removed.
            pattern: str
            # Source line began with `/`: match only at the anchor root.
            anchored: bool
            # Source line ended with `/`: match only directories.
            directory_only: bool
            # Source line began with `!`: re-include a previously-excluded path.
            negate: bool
      
        46
        
        47
        
        48
        def parse_ignore_file(text: str) -> tuple[IgnoreRule, ...]:
            """Turn the body of a `.dlm/ignore` file into a tuple of rules.

            Blank lines and `#` comments (optionally indented) are skipped.
            A line that has nothing left once its `!` / leading `/` /
            trailing `/` markers are removed is reported with one warning
            and dropped; this function never raises on bad input, so a
            single typo cannot abort the directory walk.
            """
            parsed: list[IgnoreRule] = []
            for number, source_line in enumerate(text.splitlines(), 1):
                # Trailing whitespace is insignificant; leading whitespace is kept.
                body = source_line.rstrip()
                if body == "" or body.lstrip()[:1] == "#":
                    continue

                # Markers are peeled off in a fixed order: `!`, then leading
                # `/`, then trailing `/`. `body` is non-empty before each peel,
                # so it can only become empty right after a marker is removed.
                is_negated = body[0] == "!"
                if is_negated:
                    body = body[1:]
                    if body == "":
                        _LOG.warning("dlm/ignore:%d: bare '!' with no pattern; skipping", number)
                        continue

                is_anchored = body[0] == "/"
                if is_anchored:
                    body = body[1:]
                    if body == "":
                        _LOG.warning("dlm/ignore:%d: bare '/' with no pattern; skipping", number)
                        continue

                dirs_only = body[-1] == "/"
                if dirs_only:
                    body = body[:-1]
                    if body == "":
                        _LOG.warning("dlm/ignore:%d: pattern reduced to empty; skipping", number)
                        continue

                parsed.append(IgnoreRule(body, is_anchored, dirs_only, is_negated))
            return tuple(parsed)
      
        91
        
        92
        
        93
        def matches(rule: IgnoreRule, relpath: str, *, is_dir: bool) -> bool:
            """Return True if `relpath` is covered by `rule`.

            `relpath` is the POSIX-form path relative to the anchor directory
            (the `.dlm/`'s parent). `is_dir` says whether `relpath` itself is
            a directory; it is used to honor `directory_only`.

            Semantics follow `.gitignore`:
            - `anchored=True`: the pattern must match starting at the first
              path component (`/src/**` matches `src/foo.py`, not
              `vendor/src/foo.py`).
            - `anchored=False`: the pattern may start at any component
              (`node_modules/**` matches `a/b/node_modules/c.js`).
            - `directory_only=False`: the pattern must match the whole of
              `relpath`, subject to the anchoring rule above.
            - `directory_only=True`: the pattern must match a *directory*,
              either `relpath` itself when `is_dir` is True, or any ancestor
              directory of `relpath`. Ancestors are checked for anchored and
              multi-component patterns as well, so `/build/` and
              `docs/build/` cover the files beneath them just like `build/`
              does. A file literally named `build` is not matched by
              `build/` on its own path.
            """
            regex = _compile_ignore_pattern(rule.pattern)
            parts = relpath.split("/")

            def prefix_matches(end: int) -> bool:
                # Does the pattern match the path made of the first `end` components?
                if rule.anchored:
                    return regex.fullmatch("/".join(parts[:end])) is not None
                # Unanchored: the match may begin at any component boundary.
                return any(
                    regex.fullmatch("/".join(parts[start:end])) is not None
                    for start in range(end)
                )

            if not rule.directory_only:
                return prefix_matches(len(parts))

            # Directory-only: only directories are candidates. For a file that
            # means its ancestors (the final component is skipped); for a
            # directory the full path is a candidate too.
            deepest_dir = len(parts) if is_dir else len(parts) - 1
            return any(prefix_matches(end) for end in range(1, deepest_dir + 1))
      
        142
        
        143
        
        144
        def _compile_ignore_pattern(pattern: str) -> re.Pattern[str]:
            """Translate the ignore-file glob grammar to an anchored regex.

            Translation rules:

            - `**/` at the start of the pattern or directly after a `/`
              becomes `(?:.*/)?`: zero or more *whole* path components, so
              `**/foo` matches `foo` and `a/b/foo` but not `barfoo`, and
              `a/**/b` matches `a/b` and `a/x/y/b` but not `a/xb`.
            - Any other `**` becomes `.*` (may cross `/`); a `/` directly
              after it is consumed.
            - `*` becomes `[^/]*` and `?` becomes `[^/]`; neither crosses `/`.
            - Every other character is matched literally via `re.escape`.

            The translated pattern is wrapped in `^...$`.

            NOTE(review): the `**` / `*` / `?` semantics are meant to stay in
            sync with `dlm.directives.safety._compile_glob`, so that
            `tests/**` in `training.yaml.exclude` selects the same files as
            in `.dlm/ignore`. That function is not visible from here; confirm
            it applies the same component-boundary rule for `**/`.
            """
            out: list[str] = ["^"]
            i = 0
            n = len(pattern)
            while i < n:
                c = pattern[i]
                if c == "*":
                    if pattern.startswith("**", i):
                        # A globstar only stands for whole components when it
                        # is itself a whole component (`**/` on a boundary).
                        on_boundary = i == 0 or pattern[i - 1] == "/"
                        i += 2
                        slash_follows = i < n and pattern[i] == "/"
                        if slash_follows:
                            i += 1
                        if on_boundary and slash_follows:
                            # The optional group keeps the `/` separator, so
                            # the next literal cannot glue onto a longer name.
                            out.append("(?:.*/)?")
                        else:
                            out.append(".*")
                    else:
                        out.append("[^/]*")
                        i += 1
                elif c == "?":
                    out.append("[^/]")
                    i += 1
                else:
                    out.append(re.escape(c))
                    i += 1
            out.append("$")
            return re.compile("".join(out))

1	"""`.dlm/ignore` — gitignore-subset parser.
2
3	Drive-by exclusions for users who don't want full `.dlm/training.yaml`
4	config. Drop a three-line `.dlm/ignore`, done. Grammar is a strict
5	subset of `.gitignore`:
6
7	Supported:
8	- `#` starts a comment; blank lines skipped
9	- `**` globstar: matches zero-or-more path components
10	- `!pattern` negates — re-includes an otherwise-excluded file
11	- Trailing `/` matches directories only
12	- Leading `/` anchors to the `.dlm/`'s parent (not any ancestor)
13
14	NOT supported (document explicitly in docs/format/dlm-ignore.md):
15	- Backslash escapes
16	- Character classes `[abc]`
17	- Whitespace-escape with backslash
18
19	Malformed lines log one WARN and are dropped — ignore files are a
20	drive-by UX, a typo shouldn't kill the training run.
21	"""
22
23	from __future__ import annotations
24
25	import logging
26	import re
27	from dataclasses import dataclass
28
29	_LOG = logging.getLogger(__name__)
30
31
@dataclass(frozen=True)
class IgnoreRule:
    """A single parsed line of a `.dlm/ignore` file.

    Immutable record. The marker characters of the source line
    (`!`, leading `/`, trailing `/`) are not part of `pattern`;
    they are carried by the three boolean flags instead.
    """

    # Glob text with the leading `!` / `/` and the trailing `/` removed.
    pattern: str
    # Source line began with `/`: match only at the anchor root.
    anchored: bool
    # Source line ended with `/`: match only directories.
    directory_only: bool
    # Source line began with `!`: re-include a previously-excluded path.
    negate: bool
46
47
def parse_ignore_file(text: str) -> tuple[IgnoreRule, ...]:
    """Turn the body of a `.dlm/ignore` file into a tuple of rules.

    Blank lines and `#` comments (optionally indented) are skipped.
    A line that has nothing left once its `!` / leading `/` /
    trailing `/` markers are removed is reported with one warning
    and dropped; this function never raises on bad input, so a
    single typo cannot abort the directory walk.
    """
    parsed: list[IgnoreRule] = []
    for number, source_line in enumerate(text.splitlines(), 1):
        # Trailing whitespace is insignificant; leading whitespace is kept.
        body = source_line.rstrip()
        if body == "" or body.lstrip()[:1] == "#":
            continue

        # Markers are peeled off in a fixed order: `!`, then leading
        # `/`, then trailing `/`. `body` is non-empty before each peel,
        # so it can only become empty right after a marker is removed.
        is_negated = body[0] == "!"
        if is_negated:
            body = body[1:]
            if body == "":
                _LOG.warning("dlm/ignore:%d: bare '!' with no pattern; skipping", number)
                continue

        is_anchored = body[0] == "/"
        if is_anchored:
            body = body[1:]
            if body == "":
                _LOG.warning("dlm/ignore:%d: bare '/' with no pattern; skipping", number)
                continue

        dirs_only = body[-1] == "/"
        if dirs_only:
            body = body[:-1]
            if body == "":
                _LOG.warning("dlm/ignore:%d: pattern reduced to empty; skipping", number)
                continue

        parsed.append(IgnoreRule(body, is_anchored, dirs_only, is_negated))
    return tuple(parsed)
91
92
def matches(rule: IgnoreRule, relpath: str, *, is_dir: bool) -> bool:
    """Return True if `relpath` is covered by `rule`.

    `relpath` is the POSIX-form path relative to the anchor directory
    (the `.dlm/`'s parent). `is_dir` says whether `relpath` itself is
    a directory; it is used to honor `directory_only`.

    Semantics follow `.gitignore`:
    - `anchored=True`: the pattern must match starting at the first
      path component (`/src/**` matches `src/foo.py`, not
      `vendor/src/foo.py`).
    - `anchored=False`: the pattern may start at any component
      (`node_modules/**` matches `a/b/node_modules/c.js`).
    - `directory_only=False`: the pattern must match the whole of
      `relpath`, subject to the anchoring rule above.
    - `directory_only=True`: the pattern must match a *directory*,
      either `relpath` itself when `is_dir` is True, or any ancestor
      directory of `relpath`. Ancestors are checked for anchored and
      multi-component patterns as well, so `/build/` and
      `docs/build/` cover the files beneath them just like `build/`
      does. A file literally named `build` is not matched by
      `build/` on its own path.
    """
    regex = _compile_ignore_pattern(rule.pattern)
    parts = relpath.split("/")

    def prefix_matches(end: int) -> bool:
        # Does the pattern match the path made of the first `end` components?
        if rule.anchored:
            return regex.fullmatch("/".join(parts[:end])) is not None
        # Unanchored: the match may begin at any component boundary.
        return any(
            regex.fullmatch("/".join(parts[start:end])) is not None
            for start in range(end)
        )

    if not rule.directory_only:
        return prefix_matches(len(parts))

    # Directory-only: only directories are candidates. For a file that
    # means its ancestors (the final component is skipped); for a
    # directory the full path is a candidate too.
    deepest_dir = len(parts) if is_dir else len(parts) - 1
    return any(prefix_matches(end) for end in range(1, deepest_dir + 1))
142
143
def _compile_ignore_pattern(pattern: str) -> re.Pattern[str]:
    """Translate the ignore-file glob grammar to an anchored regex.

    Translation rules:

    - `**/` at the start of the pattern or directly after a `/`
      becomes `(?:.*/)?`: zero or more *whole* path components, so
      `**/foo` matches `foo` and `a/b/foo` but not `barfoo`, and
      `a/**/b` matches `a/b` and `a/x/y/b` but not `a/xb`.
    - Any other `**` becomes `.*` (may cross `/`); a `/` directly
      after it is consumed.
    - `*` becomes `[^/]*` and `?` becomes `[^/]`; neither crosses `/`.
    - Every other character is matched literally via `re.escape`.

    The translated pattern is wrapped in `^...$`.

    NOTE(review): the `**` / `*` / `?` semantics are meant to stay in
    sync with `dlm.directives.safety._compile_glob`, so that
    `tests/**` in `training.yaml.exclude` selects the same files as
    in `.dlm/ignore`. That function is not visible from here; confirm
    it applies the same component-boundary rule for `**/`.
    """
    out: list[str] = ["^"]
    i = 0
    n = len(pattern)
    while i < n:
        c = pattern[i]
        if c == "*":
            if pattern.startswith("**", i):
                # A globstar only stands for whole components when it
                # is itself a whole component (`**/` on a boundary).
                on_boundary = i == 0 or pattern[i - 1] == "/"
                i += 2
                slash_follows = i < n and pattern[i] == "/"
                if slash_follows:
                    i += 1
                if on_boundary and slash_follows:
                    # The optional group keeps the `/` separator, so
                    # the next literal cannot glue onto a longer name.
                    out.append("(?:.*/)?")
                else:
                    out.append(".*")
            else:
                out.append("[^/]*")
                i += 1
        elif c == "?":
            out.append("[^/]")
            i += 1
        else:
            out.append(re.escape(c))
            i += 1
    out.append("$")
    return re.compile("".join(out))