"""`.dlm/ignore` — gitignore-subset parser.

Drive-by exclusions for users who don't want full `.dlm/training.yaml`
config. Drop a three-line `.dlm/ignore`, done. Grammar is a strict
subset of `.gitignore`:

Supported:
- `#` starts a comment; blank lines skipped
- `**` globstar: matches zero-or-more path components
- `!pattern` negates — re-includes an otherwise-excluded file
- Trailing `/` matches directories only
- Leading `/` anchors to the `.dlm/`'s parent (not any ancestor)

NOT supported (document explicitly in docs/format/dlm-ignore.md):
- Backslash escapes
- Character classes `[abc]`
- Whitespace-escape with backslash

Malformed lines log one WARN and are dropped — ignore files are a
drive-by UX, a typo shouldn't kill the training run.
"""

from __future__ import annotations

import logging
import re
from dataclasses import dataclass

_LOG = logging.getLogger(__name__)


@dataclass(frozen=True)
class IgnoreRule:
    """One line from a `.dlm/ignore` file, pre-parsed.

    `pattern` is the raw glob (minus leading `!` and trailing `/`).
    `anchored` = leading `/` in source → match only at anchor root.
    `directory_only` = trailing `/` → match only directories.
    `negate` = leading `!` → re-include a previously-excluded path.
    """

    pattern: str
    anchored: bool
    directory_only: bool
    negate: bool


def parse_ignore_file(text: str) -> tuple[IgnoreRule, ...]:
    """Parse a `.dlm/ignore` body into rule tuples.

    Skips blanks and comments. Malformed lines log + drop (never raise)
    — the whole point of `.dlm/ignore` is low-ceremony, so a syntax
    error in one line shouldn't fail the walk.
    """
    rules: list[IgnoreRule] = []
    for lineno, raw in enumerate(text.splitlines(), start=1):
        line = raw.rstrip()
        if not line or line.lstrip().startswith("#"):
            continue

        negate = line.startswith("!")
        if negate:
            line = line[1:]
        if not line:
            _LOG.warning("dlm/ignore:%d: bare '!' with no pattern; skipping", lineno)
            continue

        anchored = line.startswith("/")
        if anchored:
            line = line[1:]
        if not line:
            _LOG.warning("dlm/ignore:%d: bare '/' with no pattern; skipping", lineno)
            continue

        directory_only = line.endswith("/")
        if directory_only:
            line = line[:-1]
        if not line:
            _LOG.warning("dlm/ignore:%d: pattern reduced to empty; skipping", lineno)
            continue

        rules.append(
            IgnoreRule(
                pattern=line,
                anchored=anchored,
                directory_only=directory_only,
                negate=negate,
            )
        )
    return tuple(rules)


def matches(rule: IgnoreRule, relpath: str, *, is_dir: bool) -> bool:
    """Return True if `relpath` matches `rule`.

    `relpath` is the POSIX-form path relative to the anchor directory
    (the `.dlm/`'s parent). `is_dir` is used to honor `directory_only`.

    Semantics follow `.gitignore`:
    - `anchored=True` matches only when the pattern matches the path
      from position 0 (`src/foo.py` matches `/src/**`, not `vendor/src/foo.py`).
    - `anchored=False` matches if any directory-anchored suffix matches
      (`node_modules/**` matches `a/b/node_modules/c.js`).
    - `directory_only=True` only counts full-path matches when `is_dir`
      is True. For files, the rule still matches if any ancestor path
      component matches the pattern (so `build/` flags files under
      any directory named `build`).
    """
    regex = _compile_ignore_pattern(rule.pattern)

    if rule.anchored:
        candidate_paths = [relpath]
    else:
        # Unanchored: try every "from this component onward" suffix.
        parts = relpath.split("/")
        candidate_paths = ["/".join(parts[i:]) for i in range(len(parts))]

    full_path_match = any(regex.fullmatch(c) is not None for c in candidate_paths)

    if full_path_match:
        # Directory-only: a full match counts only if relpath IS a
        # directory. A file literally named `build` (relpath="build",
        # is_dir=False) does NOT match `build/` on its own path;
        # it only matches via the ancestor-component check below.
        if rule.directory_only and not is_dir:
            pass
        else:
            return True

    # Directory-only + unanchored: a file is also covered when any
    # ancestor directory in its path matches the pattern.
    if rule.directory_only and not rule.anchored:
        parts = relpath.split("/")
        # For files: only check ancestors (skip the final component).
        # For dirs: check all components including the last.
        last_index = len(parts) - (0 if is_dir else 1)
        for i in range(last_index):
            if regex.fullmatch(parts[i]) is not None:
                return True

    return False


def _compile_ignore_pattern(pattern: str) -> re.Pattern[str]:
    """Translate the ignore-file glob grammar to a regex.

    Shares the `**` / `*` / `?` semantics with
    `dlm.directives.safety._compile_glob`, which the directive
    include/exclude filters already use. Keeping the logic in sync
    matters — a user who writes `tests/**` in `training.yaml.exclude`
    should get the same match set as writing it in `.dlm/ignore`.
    """
    i = 0
    n = len(pattern)
    out: list[str] = ["^"]
    while i < n:
        c = pattern[i]
        if c == "*":
            if i + 1 < n and pattern[i + 1] == "*":
                out.append(".*")
                i += 2
                if i < n and pattern[i] == "/":
                    i += 1
            else:
                out.append("[^/]*")
                i += 1
        elif c == "?":
            out.append("[^/]")
            i += 1
        else:
            out.append(re.escape(c))
            i += 1
    out.append("$")
    return re.compile("".join(out))