tenseleyflow/documentlanguagemodel / 52f7877

Browse files

test: add smoke, telemetry-gate, and io.text suites

Authored by espadonne
SHA
52f787783c654b405ab3da6b435a717b0a2042a4
Parents
d2d152a
Tree
ddc2dab

7 changed files

StatusFile+-
A tests/__init__.py 0 0
A tests/e2e/__init__.py 0 0
A tests/integration/__init__.py 0 0
A tests/test_smoke.py 65 0
A tests/unit/__init__.py 0 0
A tests/unit/test_cli_telemetry.py 60 0
A tests/unit/test_io_text.py 99 0
tests/__init__.pyadded
tests/e2e/__init__.pyadded
tests/integration/__init__.pyadded
tests/test_smoke.pyadded
@@ -0,0 +1,65 @@
1
+"""Smoke tests: the package imports and the CLI boots."""
2
+
3
+from __future__ import annotations
4
+
5
+import subprocess
6
+import sys
7
+
8
+from typer.testing import CliRunner
9
+
10
+import dlm
11
+from dlm.cli.app import app
12
+
13
+
14
+def test_package_version_is_set() -> None:
15
+    assert dlm.__version__
16
+    assert dlm.__version__ != "0.0.0+unknown", (
17
+        "package must be installed via uv sync / pip install for version lookup"
18
+    )
19
+
20
+
21
+def test_cli_version_flag() -> None:
22
+    runner = CliRunner()
23
+    result = runner.invoke(app, ["--version"])
24
+    assert result.exit_code == 0, result.output
25
+    assert result.output.strip() == f"dlm {dlm.__version__}"
26
+
27
+
28
+def test_cli_help_lists_all_v1_subcommands() -> None:
29
+    runner = CliRunner()
30
+    result = runner.invoke(app, ["--help"])
31
+    assert result.exit_code == 0
32
+    expected = {
33
+        "init",
34
+        "train",
35
+        "prompt",
36
+        "export",
37
+        "pack",
38
+        "unpack",
39
+        "doctor",
40
+        "show",
41
+        "migrate",
42
+    }
43
+    for name in expected:
44
+        assert name in result.output, f"`dlm --help` missing subcommand {name!r}"
45
+
46
+
47
+def test_cli_subcommand_stub_raises_notimplementederror() -> None:
48
+    runner = CliRunner()
49
+    # With catch_exceptions=True the runner captures the error on result.exception.
50
+    result = runner.invoke(app, ["train", "nonexistent.dlm"], catch_exceptions=True)
51
+    assert result.exit_code != 0
52
+    assert isinstance(result.exception, NotImplementedError)
53
+    assert "Sprint 09" in str(result.exception)
54
+
55
+
56
+def test_python_module_entrypoint_runs() -> None:
57
+    """`python -m dlm --version` works (packaging sanity)."""
58
+    result = subprocess.run(
59
+        [sys.executable, "-m", "dlm", "--version"],
60
+        check=True,
61
+        capture_output=True,
62
+        text=True,
63
+        timeout=10,
64
+    )
65
+    assert result.stdout.strip() == f"dlm {dlm.__version__}"
tests/unit/__init__.pyadded
tests/unit/test_cli_telemetry.pyadded
@@ -0,0 +1,60 @@
1
+"""Audit F13: the CLI entry point must set telemetry-off env vars before any
2
+downstream imports. We test this by spawning a fresh subprocess (so we get a
3
+clean env) and asserting the vars are set after `import dlm.cli.app` runs.
4
+"""
5
+
6
+from __future__ import annotations
7
+
8
+import subprocess
9
+import sys
10
+import textwrap
11
+
12
+
13
+def test_cli_entry_forces_telemetry_off_env_vars() -> None:
14
+    probe = textwrap.dedent(
15
+        """\
16
+        import os
17
+        # Intentionally unset — we want to see if `import dlm.cli.app` sets them.
18
+        for v in ("HF_HUB_DISABLE_TELEMETRY", "DO_NOT_TRACK",
19
+                  "TRANSFORMERS_NO_ADVISORY_WARNINGS"):
20
+            os.environ.pop(v, None)
21
+        import dlm.cli.app  # noqa: F401
22
+        assert os.environ["HF_HUB_DISABLE_TELEMETRY"] == "1"
23
+        assert os.environ["DO_NOT_TRACK"] == "1"
24
+        assert os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] == "1"
25
+        print("ok")
26
+        """
27
+    )
28
+    result = subprocess.run(
29
+        [sys.executable, "-c", probe],
30
+        check=False,
31
+        capture_output=True,
32
+        text=True,
33
+        timeout=10,
34
+    )
35
+    assert result.returncode == 0, result.stderr
36
+    assert result.stdout.strip() == "ok"
37
+
38
+
39
+def test_user_preset_telemetry_vars_are_respected() -> None:
40
+    """If a user has explicitly set one of these vars to "0", we must NOT
41
+    overwrite it — `setdefault` semantics.
42
+    """
43
+    probe = textwrap.dedent(
44
+        """\
45
+        import os
46
+        os.environ["DO_NOT_TRACK"] = "0"
47
+        import dlm.cli.app  # noqa: F401
48
+        assert os.environ["DO_NOT_TRACK"] == "0"
49
+        print("ok")
50
+        """
51
+    )
52
+    result = subprocess.run(
53
+        [sys.executable, "-c", probe],
54
+        check=False,
55
+        capture_output=True,
56
+        text=True,
57
+        timeout=10,
58
+    )
59
+    assert result.returncode == 0, result.stderr
60
+    assert result.stdout.strip() == "ok"
tests/unit/test_io_text.pyadded
@@ -0,0 +1,99 @@
1
+"""Audit F15: UTF-8 strict, BOM strip, CRLF normalization."""
2
+
3
+from __future__ import annotations
4
+
5
+import logging
6
+from pathlib import Path
7
+
8
+import pytest
9
+
10
+from dlm.io.text import (
11
+    DlmEncodingError,
12
+    normalize_for_hashing,
13
+    read_text,
14
+    read_text_str,
15
+    write_text,
16
+)
17
+
18
+
19
+class TestReadText:
20
+    def test_utf8_roundtrip(self, tmp_path: Path) -> None:
21
+        p = tmp_path / "sample.txt"
22
+        p.write_bytes(b"hello, world\n")
23
+        assert read_text(p) == "hello, world\n"
24
+
25
+    def test_bom_is_stripped_and_warned(
26
+        self, tmp_path: Path, caplog: pytest.LogCaptureFixture
27
+    ) -> None:
28
+        p = tmp_path / "bom.txt"
29
+        p.write_bytes(b"\xef\xbb\xbfhello\n")
30
+        with caplog.at_level(logging.WARNING, logger="dlm.io.text"):
31
+            text = read_text(p)
32
+        assert text == "hello\n"
33
+        assert any("BOM" in rec.message for rec in caplog.records)
34
+
35
+    def test_crlf_is_normalized(self, tmp_path: Path) -> None:
36
+        p = tmp_path / "windows.txt"
37
+        p.write_bytes(b"line1\r\nline2\r\n")
38
+        assert read_text(p) == "line1\nline2\n"
39
+
40
+    def test_lone_cr_is_normalized(self, tmp_path: Path) -> None:
41
+        p = tmp_path / "classic_mac.txt"
42
+        p.write_bytes(b"line1\rline2\r")
43
+        assert read_text(p) == "line1\nline2\n"
44
+
45
+    def test_mixed_endings_normalized(self, tmp_path: Path) -> None:
46
+        p = tmp_path / "mixed.txt"
47
+        p.write_bytes(b"a\r\nb\nc\r")
48
+        assert read_text(p) == "a\nb\nc\n"
49
+
50
+    def test_invalid_utf8_raises_with_offset(self, tmp_path: Path) -> None:
51
+        p = tmp_path / "bad.txt"
52
+        # \xff is never a valid byte anywhere in UTF-8, so decoding fails at offset 2
53
+        p.write_bytes(b"ok\xffbad")
54
+        with pytest.raises(DlmEncodingError) as exc_info:
55
+            read_text(p)
56
+        assert exc_info.value.byte_offset == 2
57
+        assert exc_info.value.path == p
58
+
59
+
60
+class TestReadTextStr:
61
+    def test_bytes_roundtrip(self) -> None:
62
+        assert read_text_str(b"hello\n") == "hello\n"
63
+
64
+    def test_invalid_raises_with_source(self) -> None:
65
+        with pytest.raises(DlmEncodingError) as exc_info:
66
+            read_text_str(b"\xff", source="fixture:broken")
67
+        assert "fixture:broken" in str(exc_info.value)
68
+
69
+
70
+class TestWriteText:
71
+    def test_writes_utf8_lf_no_bom(self, tmp_path: Path) -> None:
72
+        p = tmp_path / "out.txt"
73
+        write_text(p, "line1\r\nline2\r\n")
74
+        raw = p.read_bytes()
75
+        assert raw == b"line1\nline2\n"
76
+
77
+    def test_write_is_atomic_leaves_no_tmp(self, tmp_path: Path) -> None:
78
+        p = tmp_path / "out.txt"
79
+        write_text(p, "content\n")
80
+        siblings = list(tmp_path.iterdir())
81
+        assert siblings == [p], "write_text must clean up temp files"
82
+
83
+
84
+class TestNormalizeForHashing:
85
+    def test_bom_stripped(self) -> None:
86
+        assert normalize_for_hashing("\ufeffhello") == "hello"
87
+
88
+    def test_crlf_normalized(self) -> None:
89
+        assert normalize_for_hashing("a\r\nb\rc") == "a\nb\nc"
90
+
91
+    def test_hash_identity_across_platforms(self) -> None:
92
+        """Windows-style CRLF and Unix-style LF with same content must hash identically.
93
+
94
+        This is the core Sprint 03 / audit F15 contract: section IDs are stable
95
+        under line-ending edits.
96
+        """
97
+        unix = "header\n\nbody line 1\nbody line 2\n"
98
+        windows = "header\r\n\r\nbody line 1\r\nbody line 2\r\n"
99
+        assert normalize_for_hashing(unix) == normalize_for_hashing(windows)