documentlanguagemodel Public

Watch 0 Fork 0 Star 0

Python · 6093 bytes Raw Blame History

  
        1
        """Hypothesis property tests for invariants across dlm.* (audit-04 T4).
      
        2
        
        3
        These aren't replacements for example-based tests — they're guardrails
      
        4
        for properties that hold for *all* inputs. Keep them cheap: the suite
      
        5
        runs inside the default pytest invocation, so each property's shrinking
      
        6
        budget is tuned small.
      
        7
        """
      
        8
        
        9
        from __future__ import annotations
      
        10
        
        11
        import string
      
        12
        from pathlib import Path
      
        13
        
        14
        import pytest
      
        15
        from hypothesis import HealthCheck, given, settings
      
        16
        from hypothesis import strategies as st
      
        17
        
        18
        from dlm.export.errors import PreflightError, UnsafeMergeError
      
        19
        from dlm.export.merge import check_merge_safety
      
        20
        from dlm.export.plan import ExportPlan
      
        21
        from dlm.export.tokenizer_sync import read_gguf_vocab_size
      
        22
        from dlm.io.ulid import mint_ulid
      
        23
        from dlm.pack.integrity import rollup_sha256
      
        24
        
        25
        # --- ULID ---------------------------------------------------------------------
      
        26
        
        27
        
        28
        class TestUlidMonotonicity:
            """Structural checks on the strings returned by `mint_ulid()`.

            Three properties are covered: every ULID is 26 characters long, a batch
            minted back-to-back contains no repeats, and only Crockford base32
            characters appear. NOTE: despite the class name, nothing in this class
            asserts time-ordering between successive ULIDs.
            """

            @given(n=st.integers(min_value=2, max_value=32))
            def test_each_ulid_is_26_chars(self, n: int) -> None:
                # Mint `n` ULIDs and require each one to be exactly 26 characters.
                lengths = [len(mint_ulid()) for _ in range(n)]
                assert all(length == 26 for length in lengths)

            @given(n=st.integers(min_value=2, max_value=16))
            def test_two_ulids_minted_in_sequence_are_distinct(self, n: int) -> None:
                # A set collapses duplicates, so any collision among the `n` mints
                # (even ones minted within the same millisecond) shows up as a
                # shortfall in the set's size.
                minted = [mint_ulid() for _ in range(n)]
                assert len(set(minted)) == n

            def test_alphabet_is_crockford(self) -> None:
                # Crockford base32 is the ten digits plus the uppercase letters
                # with I, L, O and U removed. 32 ULIDs are sampled so that a stray
                # character has many chances to surface.
                crockford = frozenset(string.digits + "ABCDEFGHJKMNPQRSTVWXYZ")
                for u in (mint_ulid() for _ in range(32)):
                    assert crockford.issuperset(u), f"ULID {u} has non-Crockford char"
      
        51
        
        52
        
        53
        # --- rollup_sha256 ------------------------------------------------------------
      
        54
        
        55
        
        56
        # Strategy: exactly 64 lowercase hex characters (the length of a SHA-256
        # hex digest).
        _hex64 = st.text(alphabet="0123456789abcdef", min_size=64, max_size=64)

        # Strategy: short relative-path-like strings (1-20 chars of lowercase
        # letters, digits and "/"). Candidates containing "..", or starting or
        # ending with "/", are discarded.
        _relpath = st.text(
            alphabet=string.ascii_lowercase + string.digits + "/",
            min_size=1,
            max_size=20,
        ).filter(lambda s: not (".." in s or s.startswith("/") or s.endswith("/")))
      
        62
        
        63
        
        64
        class TestRollupDeterminism:
            """`rollup_sha256` must be a pure function of the mapping's contents.

            The properties checked: repeated calls agree, key insertion order is
            irrelevant, and adding an entry changes the digest.
            """

            @given(st.dictionaries(_relpath, _hex64, min_size=1, max_size=12))
            def test_rollup_is_deterministic(self, d: dict[str, str]) -> None:
                # Two calls on the very same mapping must return equal digests.
                first = rollup_sha256(d)
                second = rollup_sha256(d)
                assert first == second

            @given(st.dictionaries(_relpath, _hex64, min_size=2, max_size=12))
            def test_rollup_is_order_independent(self, d: dict[str, str]) -> None:
                # Rebuild the mapping with its insertion order flipped; the digest
                # must not move (sorting is the contract).
                flipped = dict(list(d.items())[::-1])
                assert rollup_sha256(d) == rollup_sha256(flipped)

            @given(st.dictionaries(_relpath, _hex64, min_size=1, max_size=8))
            def test_rollup_changes_with_content(self, d: dict[str, str]) -> None:
                before = rollup_sha256(d)
                # "extra_path" contains an underscore, which the `_relpath` alphabet
                # lacks, so this always adds a new entry instead of overwriting one.
                extended = dict(d)
                extended["extra_path"] = "0" * 64
                assert rollup_sha256(extended) != before
      
        82
        
        83
        
        84
        # --- merge-safety truth table -------------------------------------------------
      
        85
        
        86
        
        87
        class TestMergeSafetyTruthTable:
            """Sweep every `(merged, dequantize, was_qlora)` flag combination.

            Expected outcome per the contract (CLAUDE.md pitfall #3):
            - `merged=False`: always safe, regardless of other flags.
            - `merged=True, was_qlora=False`: safe (plain LoRA merge).
            - `merged=True, was_qlora=True, dequantize=False`: REFUSE.
            - `merged=True, was_qlora=True, dequantize=True`: safe (user confirmed).
            """

            @pytest.mark.parametrize("merged", [False, True])
            @pytest.mark.parametrize("dequantize", [False, True])
            @pytest.mark.parametrize("was_qlora", [False, True])
            def test_truth_table(self, merged: bool, dequantize: bool, was_qlora: bool) -> None:
                # Confirming dequantization only makes sense for a merged export;
                # `ExportPlan.__post_init__` rejects that combination, so the plan
                # cannot even be built for it. Skip rather than fail.
                if not merged and dequantize:
                    pytest.skip("invalid flag combination: --dequantize without --merged")

                plan = ExportPlan(merged=merged, dequantize_confirmed=dequantize)

                # The only refused cell: merging a QLoRA adapter without the
                # explicit dequantize confirmation.
                if not (merged and was_qlora and not dequantize):
                    check_merge_safety(plan, was_qlora=was_qlora)  # no raise
                    return
                with pytest.raises(UnsafeMergeError):
                    check_merge_safety(plan, was_qlora=was_qlora)
      
        112
        
        113
        
        114
        # --- GGUF parser fuzz ---------------------------------------------------------
      
        115
        
        116
        
        117
        class TestGgufParserFuzz:
            """Throw random bytes at `read_gguf_vocab_size`.

            The parser must answer with a typed `PreflightError` and never leak
            `struct.error` / `MemoryError` / a raw decoded string.
            Audit-04 T6 / B7 defense-in-depth.

            The function-scoped `tmp_path` is shared across hypothesis iterations
            on purpose: the file is overwritten on every example, and creating a
            fresh directory per sample would dominate the test runtime.
            """

            @staticmethod
            def _expect_preflight_error(directory: Path, contents: bytes) -> None:
                # Write `contents` to the shared fuzz file and require a PreflightError.
                target = directory / "fuzz.gguf"
                target.write_bytes(contents)
                with pytest.raises(PreflightError):
                    read_gguf_vocab_size(target)

            @settings(suppress_health_check=[HealthCheck.function_scoped_fixture], max_examples=50)
            @given(
                payload=st.binary(min_size=0, max_size=256).filter(
                    # Skip valid headers — fuzzing valid packets isn't the point.
                    lambda b: b[:4] != b"GGUF"
                )
            )
            def test_random_bytes_raise_preflight_error(self, payload: bytes, tmp_path: Path) -> None:
                self._expect_preflight_error(tmp_path, payload)

            @settings(suppress_health_check=[HealthCheck.function_scoped_fixture], max_examples=50)
            @given(body=st.binary(min_size=0, max_size=256))
            def test_random_body_after_magic_doesnt_crash(self, body: bytes, tmp_path: Path) -> None:
                """Even with valid magic, garbage body → typed error, no crash."""
                self._expect_preflight_error(tmp_path, b"GGUF" + body)

1	"""Hypothesis property tests for invariants across dlm.* (audit-04 T4).
2
3	These aren't replacements for example-based tests — they're guardrails
4	for properties that hold for *all* inputs. Keep them cheap: the suite
5	runs inside the default pytest invocation, so each property's shrinking
6	budget is tuned small.
7	"""
8
9	from __future__ import annotations
10
11	import string
12	from pathlib import Path
13
14	import pytest
15	from hypothesis import HealthCheck, given, settings
16	from hypothesis import strategies as st
17
18	from dlm.export.errors import PreflightError, UnsafeMergeError
19	from dlm.export.merge import check_merge_safety
20	from dlm.export.plan import ExportPlan
21	from dlm.export.tokenizer_sync import read_gguf_vocab_size
22	from dlm.io.ulid import mint_ulid
23	from dlm.pack.integrity import rollup_sha256
24
25	# --- ULID ---------------------------------------------------------------------
26
27
class TestUlidMonotonicity:
    """Structural checks on the strings returned by `mint_ulid()`.

    Three properties are covered: every ULID is 26 characters long, a batch
    minted back-to-back contains no repeats, and only Crockford base32
    characters appear. NOTE: despite the class name, nothing in this class
    asserts time-ordering between successive ULIDs.
    """

    @given(n=st.integers(min_value=2, max_value=32))
    def test_each_ulid_is_26_chars(self, n: int) -> None:
        # Mint `n` ULIDs and require each one to be exactly 26 characters.
        lengths = [len(mint_ulid()) for _ in range(n)]
        assert all(length == 26 for length in lengths)

    @given(n=st.integers(min_value=2, max_value=16))
    def test_two_ulids_minted_in_sequence_are_distinct(self, n: int) -> None:
        # A set collapses duplicates, so any collision among the `n` mints
        # (even ones minted within the same millisecond) shows up as a
        # shortfall in the set's size.
        minted = [mint_ulid() for _ in range(n)]
        assert len(set(minted)) == n

    def test_alphabet_is_crockford(self) -> None:
        # Crockford base32 is the ten digits plus the uppercase letters
        # with I, L, O and U removed. 32 ULIDs are sampled so that a stray
        # character has many chances to surface.
        crockford = frozenset(string.digits + "ABCDEFGHJKMNPQRSTVWXYZ")
        for u in (mint_ulid() for _ in range(32)):
            assert crockford.issuperset(u), f"ULID {u} has non-Crockford char"
51
52
53	# --- rollup_sha256 ------------------------------------------------------------
54
55
# Strategy: exactly 64 lowercase hex characters (the length of a SHA-256
# hex digest).
_hex64 = st.text(alphabet="0123456789abcdef", min_size=64, max_size=64)

# Strategy: short relative-path-like strings (1-20 chars of lowercase
# letters, digits and "/"). Candidates containing "..", or starting or
# ending with "/", are discarded.
_relpath = st.text(
    alphabet=string.ascii_lowercase + string.digits + "/",
    min_size=1,
    max_size=20,
).filter(lambda s: not (".." in s or s.startswith("/") or s.endswith("/")))
62
63
class TestRollupDeterminism:
    """`rollup_sha256` must be a pure function of the mapping's contents.

    The properties checked: repeated calls agree, key insertion order is
    irrelevant, and adding an entry changes the digest.
    """

    @given(st.dictionaries(_relpath, _hex64, min_size=1, max_size=12))
    def test_rollup_is_deterministic(self, d: dict[str, str]) -> None:
        # Two calls on the very same mapping must return equal digests.
        first = rollup_sha256(d)
        second = rollup_sha256(d)
        assert first == second

    @given(st.dictionaries(_relpath, _hex64, min_size=2, max_size=12))
    def test_rollup_is_order_independent(self, d: dict[str, str]) -> None:
        # Rebuild the mapping with its insertion order flipped; the digest
        # must not move (sorting is the contract).
        flipped = dict(list(d.items())[::-1])
        assert rollup_sha256(d) == rollup_sha256(flipped)

    @given(st.dictionaries(_relpath, _hex64, min_size=1, max_size=8))
    def test_rollup_changes_with_content(self, d: dict[str, str]) -> None:
        before = rollup_sha256(d)
        # Copy the mapping and add one entry whose value is 64 zeros. The
        # key "extra_path" contains an underscore, which the `_relpath`
        # alphabet lacks, so this always adds an entry instead of
        # overwriting one.
        extended = dict(d)
        extended["extra_path"] = "0" * 64
        assert rollup_sha256(extended) != before
82
83
84	# --- merge-safety truth table -------------------------------------------------
85
86
class TestMergeSafetyTruthTable:
    """Sweep every `(merged, dequantize, was_qlora)` flag combination.

    Expected outcome per the contract (CLAUDE.md pitfall #3):
    - `merged=False`: always safe, regardless of other flags.
    - `merged=True, was_qlora=False`: safe (plain LoRA merge).
    - `merged=True, was_qlora=True, dequantize=False`: REFUSE.
    - `merged=True, was_qlora=True, dequantize=True`: safe (user confirmed).
    """

    @pytest.mark.parametrize("merged", [False, True])
    @pytest.mark.parametrize("dequantize", [False, True])
    @pytest.mark.parametrize("was_qlora", [False, True])
    def test_truth_table(self, merged: bool, dequantize: bool, was_qlora: bool) -> None:
        # Confirming dequantization only makes sense for a merged export;
        # `ExportPlan.__post_init__` rejects that combination, so the plan
        # cannot even be built for it. Skip rather than fail.
        if not merged and dequantize:
            pytest.skip("invalid flag combination: --dequantize without --merged")

        plan = ExportPlan(merged=merged, dequantize_confirmed=dequantize)

        # The only refused cell: merging a QLoRA adapter without the
        # explicit dequantize confirmation.
        if not (merged and was_qlora and not dequantize):
            check_merge_safety(plan, was_qlora=was_qlora)  # no raise
            return
        with pytest.raises(UnsafeMergeError):
            check_merge_safety(plan, was_qlora=was_qlora)
112
113
114	# --- GGUF parser fuzz ---------------------------------------------------------
115
116
class TestGgufParserFuzz:
    """Throw random bytes at `read_gguf_vocab_size`.

    The parser must answer with a typed `PreflightError` and never leak
    `struct.error` / `MemoryError` / a raw decoded string.
    Audit-04 T6 / B7 defense-in-depth.

    The function-scoped `tmp_path` is shared across hypothesis iterations
    on purpose: the file is overwritten on every example, and creating a
    fresh directory per sample would dominate the test runtime.
    """

    @staticmethod
    def _expect_preflight_error(directory: Path, contents: bytes) -> None:
        # Write `contents` to the shared fuzz file and require a PreflightError.
        target = directory / "fuzz.gguf"
        target.write_bytes(contents)
        with pytest.raises(PreflightError):
            read_gguf_vocab_size(target)

    @settings(suppress_health_check=[HealthCheck.function_scoped_fixture], max_examples=50)
    @given(
        payload=st.binary(min_size=0, max_size=256).filter(
            # Skip valid headers — fuzzing valid packets isn't the point.
            lambda b: b[:4] != b"GGUF"
        )
    )
    def test_random_bytes_raise_preflight_error(self, payload: bytes, tmp_path: Path) -> None:
        self._expect_preflight_error(tmp_path, payload)

    @settings(suppress_health_check=[HealthCheck.function_scoped_fixture], max_examples=50)
    @given(body=st.binary(min_size=0, max_size=256))
    def test_random_body_after_magic_doesnt_crash(self, body: bytes, tmp_path: Path) -> None:
        """Even with valid magic, garbage body → typed error, no crash."""
        self._expect_preflight_error(tmp_path, b"GGUF" + body)