1 """Packaged public-domain corpora for the ``external_perplexity`` probe (S09).
2
3 Each corpus ships as a ``.txt`` file under ``_corpora/`` alongside this
4 module. The files carry inline ``#``-comment provenance headers that
5 name the original works and their public-domain basis; the loader
6 strips those headers at read time so probes see only the raw prose.
7
8 Callers use :func:`load_corpus` to get a deterministic list of prose
9 chunks for a given corpus name. Adding a new corpus is three steps:
10
11 1. Drop the ``.txt`` file under ``_corpora/`` with provenance comments.
12 2. Add the name → filename mapping to :data:`_CORPORA`.
13 3. Extend the ``CorpusName`` literal in ``external_perplexity.py``.
14 """
15
16 from __future__ import annotations
17
18 from pathlib import Path
19 from typing import Final
20
# Directory holding the packaged corpus files; ships alongside this module.
_CORPUS_DIR: Final = Path(__file__).parent / "_corpora"

# Registry of public corpus name -> filename under ``_corpora/``. Extend this
# mapping (step 2 in the module docstring) when adding a new corpus.
_CORPORA: Final[dict[str, str]] = {
    "public_domain_en": "public_domain_en.txt",
}
26
27
def available_corpora() -> tuple[str, ...]:
    """Return the registered corpus names, sorted, as an immutable tuple."""
    names = sorted(_CORPORA.keys())
    return tuple(names)
31
32
def load_corpus(name: str) -> str:
    """Read the corpus as one string, with ``#``-comment lines stripped.

    Raises :class:`KeyError` when ``name`` isn't registered. The file
    is decoded from UTF-8 at read time. No tokenization happens here —
    the probe is responsible for chunking the returned text.
    """
    try:
        filename = _CORPORA[name]
    except KeyError:
        raise KeyError(
            f"unknown external-perplexity corpus {name!r}; available: {sorted(_CORPORA)!r}"
        ) from None
    raw = (_CORPUS_DIR / filename).read_text(encoding="utf-8")
    # Strip provenance comments and blank lines. What remains is
    # a flat prose block — the external-perplexity probe chunks it
    # into fixed-width windows at measurement time.
    kept: list[str] = []
    for line in raw.splitlines():
        content = line.strip()
        if not content:
            continue
        if content.startswith("#"):
            continue
        kept.append(line)
    return "\n\n".join(kept)
54
55
def chunk_corpus(text: str, *, chunk_chars: int, max_chunks: int) -> list[str]:
    """Split a corpus string into up to ``max_chunks`` chunks of
    ``chunk_chars`` characters each.

    Chunks are sliced from the start of ``text`` at fixed character
    offsets — deterministic across runs, trivially re-playable. A
    *partial* final chunk shorter than 64 characters is dropped (a
    short tail contributes noise to rolling-logprob aggregation
    without adding signal); full-width chunks are always kept, even
    when ``chunk_chars`` itself is below 64.

    Raises :class:`ValueError` when ``chunk_chars`` or ``max_chunks``
    is not positive.
    """
    if chunk_chars <= 0:
        raise ValueError(f"chunk_chars must be positive; got {chunk_chars}")
    if max_chunks <= 0:
        raise ValueError(f"max_chunks must be positive; got {max_chunks}")
    # Only the first max_chunks windows can ever be used, so bound the
    # iteration instead of break-checking on every pass.
    limit = min(len(text), max_chunks * chunk_chars)
    chunks: list[str] = []
    for start in range(0, limit, chunk_chars):
        piece = text[start : start + chunk_chars]
        # Keep every full-width chunk; drop only a short partial tail.
        # (The previous bare `len(piece) >= 64` test discarded *all*
        # chunks whenever chunk_chars < 64, contradicting the contract.)
        if len(piece) == chunk_chars or len(piece) >= 64:
            chunks.append(piece)
    return chunks