| 1 |
"""Audit F15: UTF-8 strict, BOM strip, CRLF normalization.""" |
| 2 |
|
| 3 |
from __future__ import annotations |
| 4 |
|
| 5 |
import logging |
| 6 |
from pathlib import Path |
| 7 |
|
| 8 |
import pytest |
| 9 |
|
| 10 |
from dlm.io.text import ( |
| 11 |
DlmEncodingError, |
| 12 |
normalize_for_hashing, |
| 13 |
read_text, |
| 14 |
read_text_str, |
| 15 |
write_text, |
| 16 |
) |
| 17 |
|
| 18 |
|
| 19 |
class TestReadText:
    """read_text contract: strict UTF-8, BOM stripped (with warning), all line endings -> LF."""

    def test_utf8_roundtrip(self, tmp_path: Path) -> None:
        target = tmp_path / "sample.txt"
        target.write_bytes(b"hello, world\n")
        assert read_text(target) == "hello, world\n"

    def test_bom_is_stripped_and_warned(
        self, tmp_path: Path, caplog: pytest.LogCaptureFixture
    ) -> None:
        target = tmp_path / "bom.txt"
        target.write_bytes(b"\xef\xbb\xbfhello\n")
        with caplog.at_level(logging.WARNING, logger="dlm.io.text"):
            decoded = read_text(target)
        assert decoded == "hello\n"
        # The BOM must not pass silently: a warning mentioning it is required.
        bom_warnings = [rec for rec in caplog.records if "BOM" in rec.message]
        assert bom_warnings

    def test_crlf_is_normalized(self, tmp_path: Path) -> None:
        target = tmp_path / "windows.txt"
        target.write_bytes(b"line1\r\nline2\r\n")
        assert read_text(target) == "line1\nline2\n"

    def test_lone_cr_is_normalized(self, tmp_path: Path) -> None:
        target = tmp_path / "classic_mac.txt"
        target.write_bytes(b"line1\rline2\r")
        assert read_text(target) == "line1\nline2\n"

    def test_mixed_endings_normalized(self, tmp_path: Path) -> None:
        target = tmp_path / "mixed.txt"
        target.write_bytes(b"a\r\nb\nc\r")
        assert read_text(target) == "a\nb\nc\n"

    def test_invalid_utf8_raises_with_offset(self, tmp_path: Path) -> None:
        target = tmp_path / "bad.txt"
        # 0xFF can never start a UTF-8 sequence, so decoding must fail at byte 2.
        target.write_bytes(b"ok\xffbad")
        with pytest.raises(DlmEncodingError) as exc_info:
            read_text(target)
        err = exc_info.value
        assert err.byte_offset == 2
        assert err.path == target
| 58 |
|
| 59 |
|
| 60 |
class TestReadTextStr:
    """read_text_str: same strict UTF-8 decoding contract, applied to raw bytes."""

    def test_bytes_roundtrip(self) -> None:
        assert read_text_str(b"hello\n") == "hello\n"

    def test_invalid_raises_with_source(self) -> None:
        # The caller-supplied source label must surface in the error message.
        with pytest.raises(DlmEncodingError) as excinfo:
            read_text_str(b"\xff", source="fixture:broken")
        message = str(excinfo.value)
        assert "fixture:broken" in message
| 68 |
|
| 69 |
|
| 70 |
class TestWriteText:
    """write_text: emits UTF-8, LF-only, BOM-free bytes via an atomic replace."""

    def test_writes_utf8_lf_no_bom(self, tmp_path: Path) -> None:
        target = tmp_path / "out.txt"
        write_text(target, "line1\r\nline2\r\n")
        # CRLF input must land on disk as plain LF, with no BOM prefix.
        assert target.read_bytes() == b"line1\nline2\n"

    def test_write_is_atomic_leaves_no_tmp(self, tmp_path: Path) -> None:
        target = tmp_path / "out.txt"
        write_text(target, "content\n")
        leftovers = list(tmp_path.iterdir())
        assert leftovers == [target], "write_text must clean up temp files"
| 82 |
|
| 83 |
|
| 84 |
class TestNormalizeForHashing:
    """normalize_for_hashing: canonical form (no BOM, LF-only) used for section IDs."""

    def test_bom_stripped(self) -> None:
        assert normalize_for_hashing("\ufeffhello") == "hello"

    def test_crlf_normalized(self) -> None:
        assert normalize_for_hashing("a\r\nb\rc") == "a\nb\nc"

    def test_hash_identity_across_platforms(self) -> None:
        """Same content with CRLF vs LF endings must normalize identically.

        This is the core Sprint 03 / audit F15 contract: section IDs are stable
        under line-ending edits.
        """
        lf_variant = "header\n\nbody line 1\nbody line 2\n"
        crlf_variant = "header\r\n\r\nbody line 1\r\nbody line 2\r\n"
        assert normalize_for_hashing(lf_variant) == normalize_for_hashing(crlf_variant)