| 1 | """Packaged public-domain corpora for the ``external_perplexity`` probe (S09). |
| 2 | |
| 3 | Each corpus ships as a ``.txt`` file under ``_corpora/`` alongside this |
| 4 | module. The files carry inline ``#``-comment provenance headers that |
| 5 | name the original works and their public-domain basis; the loader |
| 6 | strips those headers at read time so probes see only the raw prose. |
| 7 | |
| 8 | Callers use :func:`load_corpus` to get a deterministic list of prose |
| 9 | chunks for a given corpus name. Adding a new corpus is three steps: |
| 10 | |
| 11 | 1. Drop the ``.txt`` file under ``_corpora/`` with provenance comments. |
| 12 | 2. Add the name → filename mapping to :data:`_CORPORA`. |
| 13 | 3. Extend the ``CorpusName`` literal in ``external_perplexity.py``. |
| 14 | """ |
| 15 | |
| 16 | from __future__ import annotations |
| 17 | |
| 18 | from pathlib import Path |
| 19 | from typing import Final |
| 20 | |
# Directory bundled alongside this module that holds the packaged ``.txt``
# corpus files (shipped in the wheel under ``_corpora/``).
_CORPUS_DIR: Final = Path(__file__).parent / "_corpora"

# Registry mapping public corpus name -> filename under ``_corpora/``.
# Both :func:`available_corpora` and :func:`load_corpus` read this table;
# adding a corpus means adding one entry here (see module docstring).
_CORPORA: Final[dict[str, str]] = {
    "public_domain_en": "public_domain_en.txt",
}
| 26 | |
| 27 | |
def available_corpora() -> tuple[str, ...]:
    """Return the registered corpus names as a sorted tuple."""
    names = sorted(_CORPORA)
    return tuple(names)
| 31 | |
| 32 | |
def load_corpus(name: str) -> str:
    """Read the corpus as one string, with ``#``-comment lines stripped.

    Raises :class:`KeyError` when ``name`` isn't registered. The raw
    file stays UTF-8 on disk; the in-memory string is also UTF-8. No
    tokenization happens here — the probe is responsible for chunking
    the returned text.
    """
    try:
        filename = _CORPORA[name]
    except KeyError:
        # Re-raise with an actionable message listing what IS available.
        raise KeyError(
            f"unknown external-perplexity corpus {name!r}; available: {sorted(_CORPORA)!r}"
        ) from None
    raw = (_CORPUS_DIR / filename).read_text(encoding="utf-8")
    # Drop provenance comments and blank lines; everything else is kept
    # verbatim. The external-perplexity probe slices the joined result
    # into fixed-width windows at measurement time.
    kept: list[str] = []
    for text_line in raw.splitlines():
        if not text_line.strip():
            continue
        if text_line.lstrip().startswith("#"):
            continue
        kept.append(text_line)
    return "\n\n".join(kept)
| 54 | |
| 55 | |
def chunk_corpus(
    text: str,
    *,
    chunk_chars: int,
    max_chunks: int,
    min_chunk_chars: int = 64,
) -> list[str]:
    """Split a corpus string into up to ``max_chunks`` chunks of
    ``chunk_chars`` characters each.

    Chunks are sliced from the start of ``text`` at fixed character
    offsets — deterministic across runs, trivially re-playable. Any
    chunk shorter than ``min_chunk_chars`` is dropped; with fixed-width
    slicing only the final chunk can be short, and a partial tail
    contributes noise to rolling-logprob aggregation without adding
    signal.

    Args:
        text: Corpus text to slice (typically from :func:`load_corpus`).
        chunk_chars: Width of each chunk in characters; must be positive.
        max_chunks: Upper bound on the number of chunks returned; must
            be positive.
        min_chunk_chars: Minimum length a chunk must have to be kept.
            Defaults to 64, matching the historical cutoff.

    Returns:
        At most ``max_chunks`` strings of up to ``chunk_chars`` chars.

    Raises:
        ValueError: If ``chunk_chars`` or ``max_chunks`` is not positive.
    """
    if chunk_chars <= 0:
        raise ValueError(f"chunk_chars must be positive; got {chunk_chars}")
    if max_chunks <= 0:
        raise ValueError(f"max_chunks must be positive; got {max_chunks}")
    chunks: list[str] = []
    for start in range(0, len(text), chunk_chars):
        if len(chunks) >= max_chunks:
            break
        piece = text[start : start + chunk_chars]
        # Keep only chunks that clear the minimum-length cutoff.
        if len(piece) >= min_chunk_chars:
            chunks.append(piece)
    return chunks