tenseleyflow/documentlanguagemodel / d2d152a

Browse files

feat(io): add utf-8/BOM/CRLF text helper (audit F15)

Authored by espadonne
SHA
d2d152a9c9cb8b42da745affdf7e960ef1baf77e
Parents
e84a101
Tree
2b3db59

2 changed files

StatusFile+-
A src/dlm/io/__init__.py 1 0
A src/dlm/io/text.py 113 0
src/dlm/io/__init__.pyadded
@@ -0,0 +1,1 @@
1
+"""I/O helpers shared across the package."""
src/dlm/io/text.pyadded
@@ -0,0 +1,113 @@
1
+"""UTF-8 text I/O with BOM and CRLF hygiene (audit F15).
2
+
3
+Every sprint that reads or writes a `.dlm` file or any plain-text store
4
+artifact must route through these helpers. The contract:
5
+
6
+- UTF-8 strict: invalid bytes raise `DlmEncodingError` with the offending
7
+  byte offset.
8
+- A leading UTF-8 BOM is stripped and a warning is emitted (some Windows
9
+  editors add it; YAML parsers choke on it).
10
+- CRLF is normalized to LF before hashing or parsing, so section IDs are
11
+  stable across Windows and Unix edits of the same content.
12
+
13
+Binary I/O uses the normal `open(..., "rb")` path; this module is only for
14
+text.
15
+"""
16
+
17
+from __future__ import annotations
18
+
19
+import logging
20
+from pathlib import Path
21
+from typing import Final
22
+
23
# Logger for this module, named after the module's import path.
_LOG: Final = logging.getLogger(__name__)

# The byte-order mark as it appears in decoded text (U+FEFF) ...
_UTF8_BOM: Final = "\ufeff"
# ... and the same mark as its three-byte UTF-8 encoding, derived from the
# text form so the two constants cannot drift apart.
_UTF8_BOM_BYTES: Final = _UTF8_BOM.encode("utf-8")
27
+
28
+
29
+class DlmEncodingError(ValueError):
30
+    """Raised when a text file is not valid UTF-8.
31
+
32
+    `byte_offset` is the zero-based position of the first offending byte,
33
+    as reported by the codec.
34
+    """
35
+
36
+    def __init__(self, path: Path | None, byte_offset: int, reason: str) -> None:
37
+        self.path = path
38
+        self.byte_offset = byte_offset
39
+        self.reason = reason
40
+        where = str(path) if path is not None else "<text>"
41
+        super().__init__(f"{where}: invalid UTF-8 at byte {byte_offset}: {reason}")
42
+
43
+
44
def read_text(path: Path) -> str:
    """Load `path` as strict UTF-8 text with BOM and line-ending hygiene.

    The file is read as bytes and handed to `_decode`, which strips a
    leading BOM (logging a warning on this module's logger) and rewrites
    CRLF and lone CR as LF.

    Raises:
        DlmEncodingError: if the file's bytes are not valid UTF-8.
    """
    return _decode(path.read_bytes(), path=path)
53
+
54
+
55
def read_text_str(raw: bytes, *, source: str = "<bytes>") -> str:
    """Decode in-memory bytes the same way `read_text` decodes a file.

    `source` is a label for the input; it is used only to prefix the
    reason of a raised `DlmEncodingError`.

    Raises:
        DlmEncodingError: if `raw` is not valid UTF-8. The error carries
            `path=None` and a reason of the form ``"<source>: <reason>"``.
    """
    try:
        decoded = _decode(raw, path=None)
    except DlmEncodingError as err:
        # Re-raise with the caller's label folded into the reason text.
        labelled_reason = f"{source}: {err.reason}"
        raise DlmEncodingError(None, err.byte_offset, labelled_reason) from err
    return decoded
63
+
64
+
65
def write_text(path: Path, content: str) -> None:
    """Write `content` as UTF-8 with LF line endings, no BOM.

    A leading U+FEFF in `content` is dropped so the "no BOM" guarantee
    holds even when the caller's string still carries one. CRLF and lone
    CR are rewritten as LF.

    Writes atomically: the bytes go to a temp sibling file which is then
    renamed over `path` with `Path.replace`. If the write or the rename
    fails, the temp file is removed before the error propagates.

    Raises:
        UnicodeEncodeError: if `content` cannot be encoded as UTF-8; this
            happens before any file is created.
        OSError: if the temp file cannot be written or renamed.
    """
    import os

    if content.startswith("\ufeff"):
        content = content[1:]
    # Normalize line endings on the way out too, belt-and-braces.
    normalized = content.replace("\r\n", "\n").replace("\r", "\n")
    # Encode up front so an encoding failure never leaves a temp file.
    payload = normalized.encode("utf-8")

    tmp = path.with_suffix(path.suffix + f".tmp.{os.getpid()}")
    try:
        tmp.write_bytes(payload)
        tmp.replace(path)
    except BaseException:
        # Best-effort cleanup; never mask the original failure.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise
75
+
76
+
77
def normalize_for_hashing(content: str) -> str:
    """Return the canonical form of `content` used when hashing text.

    Steps, in order:

    - drop one leading BOM character, if present
    - CRLF → LF
    - remaining lone CR → LF

    These are the same transformations `_decode` applies after decoding,
    so already-decoded text and freshly read text hash identically.
    """
    body = content[len(_UTF8_BOM):] if content.startswith(_UTF8_BOM) else content
    # CRLF pairs must be collapsed before lone CRs are rewritten.
    lf_only = body.replace("\r\n", "\n")
    return lf_only.replace("\r", "\n")
90
+
91
+
92
+# --- internals ---
93
+
94
+
95
def _decode(raw: bytes, *, path: Path | None) -> str:
    """Decode `raw` as strict UTF-8, strip a leading BOM, normalize to LF.

    `path` is used only for diagnostics: it is attached to the raised
    error and named in the BOM warning (``"<text>"`` when it is ``None``).

    Raises:
        DlmEncodingError: if `raw` is not valid UTF-8; carries the codec's
            start offset and reason.
    """
    try:
        decoded = raw.decode("utf-8")
    except UnicodeDecodeError as err:
        raise DlmEncodingError(path, err.start, err.reason) from err

    has_bom = decoded.startswith(_UTF8_BOM)
    if has_bom:
        label = "<text>" if path is None else str(path)
        _LOG.warning("%s: UTF-8 BOM present; stripped", label)
        decoded = decoded[len(_UTF8_BOM):]

    # Collapse CRLF pairs first; only then rewrite any CR left on its own.
    unified = decoded.replace("\r\n", "\n")
    return unified.replace("\r", "\n")
108
+
109
+
110
def _pid() -> int:
    """Return the id of the current process.

    The `os` import is kept function-local rather than at module level.
    """
    from os import getpid

    return getpid()