Python · 3396 bytes Raw Blame History
1 """Audit F15: UTF-8 strict, BOM strip, CRLF normalization."""
2
3 from __future__ import annotations
4
5 import logging
6 from pathlib import Path
7
8 import pytest
9
10 from dlm.io.text import (
11 DlmEncodingError,
12 normalize_for_hashing,
13 read_text,
14 read_text_str,
15 write_text,
16 )
17
18
class TestReadText:
    """File-based `read_text` checks: UTF-8 decoding, BOM handling, newlines."""

    def test_utf8_roundtrip(self, tmp_path: Path) -> None:
        """Plain UTF-8 bytes with LF endings are returned as the same text."""
        target = tmp_path / "sample.txt"
        target.write_bytes(b"hello, world\n")
        decoded = read_text(target)
        assert decoded == "hello, world\n"

    def test_bom_is_stripped_and_warned(
        self, tmp_path: Path, caplog: pytest.LogCaptureFixture
    ) -> None:
        """A leading UTF-8 BOM is absent from the result and a warning names it."""
        target = tmp_path / "bom.txt"
        target.write_bytes(b"\xef\xbb\xbfhello\n")
        # Capture WARNING-level records emitted on the "dlm.io.text" logger.
        with caplog.at_level(logging.WARNING, logger="dlm.io.text"):
            decoded = read_text(target)
        assert decoded == "hello\n"
        bom_records = [rec for rec in caplog.records if "BOM" in rec.message]
        assert bom_records

    def test_crlf_is_normalized(self, tmp_path: Path) -> None:
        """CRLF line endings are read back as LF."""
        target = tmp_path / "windows.txt"
        target.write_bytes(b"line1\r\nline2\r\n")
        decoded = read_text(target)
        assert decoded == "line1\nline2\n"

    def test_lone_cr_is_normalized(self, tmp_path: Path) -> None:
        """Bare CR line endings are read back as LF."""
        target = tmp_path / "classic_mac.txt"
        target.write_bytes(b"line1\rline2\r")
        decoded = read_text(target)
        assert decoded == "line1\nline2\n"

    def test_mixed_endings_normalized(self, tmp_path: Path) -> None:
        """A file mixing CRLF, LF and CR endings is read back with LF only."""
        target = tmp_path / "mixed.txt"
        target.write_bytes(b"a\r\nb\nc\r")
        decoded = read_text(target)
        assert decoded == "a\nb\nc\n"

    def test_invalid_utf8_raises_with_offset(self, tmp_path: Path) -> None:
        """Undecodable input raises `DlmEncodingError` carrying offset and path."""
        target = tmp_path / "bad.txt"
        # 0xFF can never start a UTF-8 sequence; here it sits at byte index 2.
        target.write_bytes(b"ok\xffbad")
        with pytest.raises(DlmEncodingError) as caught:
            read_text(target)
        error = caught.value
        assert error.byte_offset == 2
        assert error.path == target
58
59
class TestReadTextStr:
    """Checks for `read_text_str`, which is handed raw bytes instead of a path."""

    def test_bytes_roundtrip(self) -> None:
        """Valid UTF-8 bytes decode to the matching string."""
        payload = b"hello\n"
        decoded = read_text_str(payload)
        assert decoded == "hello\n"

    def test_invalid_raises_with_source(self) -> None:
        """The `source` label passed by the caller appears in the error text."""
        with pytest.raises(DlmEncodingError) as caught:
            read_text_str(b"\xff", source="fixture:broken")
        rendered = str(caught.value)
        assert "fixture:broken" in rendered
68
69
class TestWriteText:
    """Checks on the bytes and directory state left behind by `write_text`."""

    def test_writes_utf8_lf_no_bom(self, tmp_path: Path) -> None:
        """CRLF input lands on disk as LF-only bytes with no BOM prefix."""
        destination = tmp_path / "out.txt"
        write_text(destination, "line1\r\nline2\r\n")
        on_disk = destination.read_bytes()
        assert on_disk == b"line1\nline2\n"

    def test_write_is_atomic_leaves_no_tmp(self, tmp_path: Path) -> None:
        """After a write, the target is the only entry in its directory."""
        destination = tmp_path / "out.txt"
        write_text(destination, "content\n")
        entries = list(tmp_path.iterdir())
        assert entries == [destination], "write_text must clean up temp files"
82
83
class TestNormalizeForHashing:
    """Checks for `normalize_for_hashing` on in-memory strings."""

    def test_bom_stripped(self) -> None:
        """A leading U+FEFF is removed from the normalized text."""
        normalized = normalize_for_hashing("\ufeffhello")
        assert normalized == "hello"

    def test_crlf_normalized(self) -> None:
        """Both CRLF and bare CR become LF."""
        normalized = normalize_for_hashing("a\r\nb\rc")
        assert normalized == "a\nb\nc"

    def test_hash_identity_across_platforms(self) -> None:
        """Identical content with CRLF or LF endings normalizes to the same text.

        Core Sprint 03 / audit F15 contract: section IDs stay stable when only
        the line endings of a document are edited.
        """
        lf_text = "header\n\nbody line 1\nbody line 2\n"
        crlf_text = "header\r\n\r\nbody line 1\r\nbody line 2\r\n"
        from_lf = normalize_for_hashing(lf_text)
        from_crlf = normalize_for_hashing(crlf_text)
        assert from_lf == from_crlf