| 1 |
"""Audit F15: UTF-8 strict, BOM strip, CRLF normalization.""" |
| 2 |
|
| 3 |
from __future__ import annotations |
| 4 |
|
| 5 |
import logging |
| 6 |
from pathlib import Path |
| 7 |
|
| 8 |
import pytest |
| 9 |
|
| 10 |
from dlm.io.text import ( |
| 11 |
DlmEncodingError, |
| 12 |
normalize_for_hashing, |
| 13 |
read_text, |
| 14 |
read_text_str, |
| 15 |
write_text, |
| 16 |
) |
| 17 |
|
| 18 |
|
| 19 |
class TestReadText:
    """read_text contract: strict UTF-8, BOM stripped (with warning), all line endings -> LF."""

    def test_utf8_roundtrip(self, tmp_path: Path) -> None:
        target = tmp_path / "sample.txt"
        target.write_bytes(b"hello, world\n")
        assert read_text(target) == "hello, world\n"

    def test_bom_is_stripped_and_warned(
        self, tmp_path: Path, caplog: pytest.LogCaptureFixture
    ) -> None:
        target = tmp_path / "bom.txt"
        target.write_bytes(b"\xef\xbb\xbfhello\n")
        with caplog.at_level(logging.WARNING, logger="dlm.io.text"):
            decoded = read_text(target)
        assert decoded == "hello\n"
        # The BOM must not pass silently: a warning mentioning it is required.
        bom_warnings = [rec for rec in caplog.records if "BOM" in rec.message]
        assert bom_warnings

    def test_crlf_is_normalized(self, tmp_path: Path) -> None:
        target = tmp_path / "windows.txt"
        target.write_bytes(b"line1\r\nline2\r\n")
        assert read_text(target) == "line1\nline2\n"

    def test_lone_cr_is_normalized(self, tmp_path: Path) -> None:
        target = tmp_path / "classic_mac.txt"
        target.write_bytes(b"line1\rline2\r")
        assert read_text(target) == "line1\nline2\n"

    def test_mixed_endings_normalized(self, tmp_path: Path) -> None:
        target = tmp_path / "mixed.txt"
        target.write_bytes(b"a\r\nb\nc\r")
        assert read_text(target) == "a\nb\nc\n"

    def test_invalid_utf8_raises_with_offset(self, tmp_path: Path) -> None:
        target = tmp_path / "bad.txt"
        # 0xFF can never start a UTF-8 sequence, so decoding must fail at byte 2.
        target.write_bytes(b"ok\xffbad")
        with pytest.raises(DlmEncodingError) as exc_info:
            read_text(target)
        err = exc_info.value
        assert err.byte_offset == 2
        assert err.path == target
| 58 |
|
| 59 |
|
| 60 |
class TestReadTextStr:
    """read_text_str: same strict UTF-8 decoding contract, applied to raw bytes."""

    def test_bytes_roundtrip(self) -> None:
        assert read_text_str(b"hello\n") == "hello\n"

    def test_invalid_raises_with_source(self) -> None:
        # The caller-supplied source label must surface in the error message.
        with pytest.raises(DlmEncodingError) as excinfo:
            read_text_str(b"\xff", source="fixture:broken")
        message = str(excinfo.value)
        assert "fixture:broken" in message
| 68 |
|
| 69 |
|
| 70 |
class TestWriteText:
    """write_text: emits UTF-8, LF-only, BOM-free bytes via an atomic replace."""

    def test_writes_utf8_lf_no_bom(self, tmp_path: Path) -> None:
        target = tmp_path / "out.txt"
        write_text(target, "line1\r\nline2\r\n")
        # CRLF input must land on disk as plain LF, with no BOM prefix.
        assert target.read_bytes() == b"line1\nline2\n"

    def test_write_is_atomic_leaves_no_tmp(self, tmp_path: Path) -> None:
        target = tmp_path / "out.txt"
        write_text(target, "content\n")
        leftovers = list(tmp_path.iterdir())
        assert leftovers == [target], "write_text must clean up temp files"
| 82 |
|
| 83 |
|
| 84 |
class TestNormalizeForHashing:
    """normalize_for_hashing: canonical form (no BOM, LF-only) used for section IDs."""

    def test_bom_stripped(self) -> None:
        assert normalize_for_hashing("\ufeffhello") == "hello"

    def test_crlf_normalized(self) -> None:
        assert normalize_for_hashing("a\r\nb\rc") == "a\nb\nc"

    def test_hash_identity_across_platforms(self) -> None:
        """Same content with CRLF vs LF endings must normalize identically.

        This is the core Sprint 03 / audit F15 contract: section IDs are stable
        under line-ending edits.
        """
        lf_variant = "header\n\nbody line 1\nbody line 2\n"
        crlf_variant = "header\r\n\r\nbody line 1\r\nbody line 2\r\n"
        assert normalize_for_hashing(lf_variant) == normalize_for_hashing(crlf_variant)