"""Packaged public-domain corpora for the ``external_perplexity`` probe (S09). Each corpus ships as a ``.txt`` file under ``_corpora/`` alongside this module. The files carry inline ``#``-comment provenance headers that name the original works and their public-domain basis; the loader strips those headers at read time so probes see only the raw prose. Callers use :func:`load_corpus` to get a deterministic list of prose chunks for a given corpus name. Adding a new corpus is three steps: 1. Drop the ``.txt`` file under ``_corpora/`` with provenance comments. 2. Add the name → filename mapping to :data:`_CORPORA`. 3. Extend the ``CorpusName`` literal in ``external_perplexity.py``. """ from __future__ import annotations from pathlib import Path from typing import Final _CORPUS_DIR: Final = Path(__file__).parent / "_corpora" _CORPORA: Final[dict[str, str]] = { "public_domain_en": "public_domain_en.txt", } def available_corpora() -> tuple[str, ...]: """Return the names every installed wheel ships corpora for.""" return tuple(sorted(_CORPORA)) def load_corpus(name: str) -> str: """Read the corpus as one string, with ``#``-comment lines stripped. Raises :class:`KeyError` when ``name`` isn't registered. The raw file stays UTF-8 on disk; the in-memory string is also UTF-8. No tokenization happens here — the probe is responsible for chunking the returned text. """ if name not in _CORPORA: raise KeyError( f"unknown external-perplexity corpus {name!r}; available: {sorted(_CORPORA)!r}" ) path = _CORPUS_DIR / _CORPORA[name] raw = path.read_text(encoding="utf-8") # Strip provenance comments and blank lines. What remains is # a flat prose block — the external-perplexity probe chunks it # into fixed-width windows at measurement time. body_lines = [ line for line in raw.splitlines() if line.strip() and not line.lstrip().startswith("#") ] return "\n\n".join(body_lines) def chunk_corpus(text: str, *, chunk_chars: int, max_chunks: int) -> list[str]: """Split a corpus string into up to ``max_chunks`` chunks of ``chunk_chars`` characters each. Chunks are sliced from the start of ``text`` at fixed character offsets — deterministic across runs, trivially re-playable. Any final chunk shorter than 64 characters is dropped (a partial tail contributes noise to rolling-logprob aggregation without adding signal). """ if chunk_chars <= 0: raise ValueError(f"chunk_chars must be positive; got {chunk_chars}") if max_chunks <= 0: raise ValueError(f"max_chunks must be positive; got {max_chunks}") chunks: list[str] = [] for start in range(0, len(text), chunk_chars): if len(chunks) >= max_chunks: break piece = text[start : start + chunk_chars] if len(piece) >= 64: chunks.append(piece) return chunks