| 1 | """Packaged public-domain corpora for the ``external_perplexity`` probe (S09). |
| 2 | |
| 3 | Each corpus ships as a ``.txt`` file under ``_corpora/`` alongside this |
| 4 | module. The files carry inline ``#``-comment provenance headers that |
| 5 | name the original works and their public-domain basis; the loader |
| 6 | strips those headers at read time so probes see only the raw prose. |
| 7 | |
| 8 | Callers use :func:`load_corpus` to get a deterministic list of prose |
| 9 | chunks for a given corpus name. Adding a new corpus is three steps: |
| 10 | |
| 11 | 1. Drop the ``.txt`` file under ``_corpora/`` with provenance comments. |
| 12 | 2. Add the name → filename mapping to :data:`_CORPORA`. |
| 13 | 3. Extend the ``CorpusName`` literal in ``external_perplexity.py``. |
| 14 | """ |
| 15 | |
| 16 | from __future__ import annotations |
| 17 | |
| 18 | from pathlib import Path |
| 19 | from typing import Final |
| 20 | |
# Directory bundled alongside this module that holds the packaged ``.txt``
# corpus files (shipped in the wheel under ``_corpora/``).
_CORPUS_DIR: Final = Path(__file__).parent / "_corpora"

# Registry mapping public corpus name -> filename under ``_corpora/``.
# Both :func:`available_corpora` and :func:`load_corpus` read this table;
# adding a corpus means adding one entry here (see module docstring).
_CORPORA: Final[dict[str, str]] = {
    "public_domain_en": "public_domain_en.txt",
}
| 26 | |
| 27 | |
def available_corpora() -> tuple[str, ...]:
    """Return the registered corpus names as a sorted tuple."""
    names = sorted(_CORPORA)
    return tuple(names)
| 31 | |
| 32 | |
def load_corpus(name: str) -> str:
    """Read the corpus as one string, with ``#``-comment lines stripped.

    Raises :class:`KeyError` when ``name`` isn't registered. The raw
    file stays UTF-8 on disk; the in-memory string is also UTF-8. No
    tokenization happens here — the probe is responsible for chunking
    the returned text.
    """
    try:
        filename = _CORPORA[name]
    except KeyError:
        # Re-raise with an actionable message listing what IS available.
        raise KeyError(
            f"unknown external-perplexity corpus {name!r}; available: {sorted(_CORPORA)!r}"
        ) from None
    raw = (_CORPUS_DIR / filename).read_text(encoding="utf-8")
    # Drop provenance comments and blank lines; everything else is kept
    # verbatim. The external-perplexity probe slices the joined result
    # into fixed-width windows at measurement time.
    kept: list[str] = []
    for text_line in raw.splitlines():
        if not text_line.strip():
            continue
        if text_line.lstrip().startswith("#"):
            continue
        kept.append(text_line)
    return "\n\n".join(kept)
| 54 | |
| 55 | |
def chunk_corpus(
    text: str,
    *,
    chunk_chars: int,
    max_chunks: int,
    min_chunk_chars: int = 64,
) -> list[str]:
    """Split a corpus string into up to ``max_chunks`` chunks of
    ``chunk_chars`` characters each.

    Chunks are sliced from the start of ``text`` at fixed character
    offsets — deterministic across runs, trivially re-playable. Any
    chunk shorter than ``min_chunk_chars`` is dropped; with fixed-width
    slicing only the final chunk can be short, and a partial tail
    contributes noise to rolling-logprob aggregation without adding
    signal.

    Args:
        text: Corpus text to slice (typically from :func:`load_corpus`).
        chunk_chars: Width of each chunk in characters; must be positive.
        max_chunks: Upper bound on the number of chunks returned; must
            be positive.
        min_chunk_chars: Minimum length a chunk must have to be kept.
            Defaults to 64, matching the historical cutoff.

    Returns:
        At most ``max_chunks`` strings of up to ``chunk_chars`` chars.

    Raises:
        ValueError: If ``chunk_chars`` or ``max_chunks`` is not positive.
    """
    if chunk_chars <= 0:
        raise ValueError(f"chunk_chars must be positive; got {chunk_chars}")
    if max_chunks <= 0:
        raise ValueError(f"max_chunks must be positive; got {max_chunks}")
    chunks: list[str] = []
    for start in range(0, len(text), chunk_chars):
        if len(chunks) >= max_chunks:
            break
        piece = text[start : start + chunk_chars]
        # Keep only chunks that clear the minimum-length cutoff.
        if len(piece) >= min_chunk_chars:
            chunks.append(piece)
    return chunks