@@ -0,0 +1,103 @@ |
| 1 | +"""S13 prove-the-value (§F7): ``ApiScoringBackend`` against a real Ollama. |
| 2 | + |
| 3 | +**Opt-in.** Skipped unless ``SWAY_OLLAMA_URL`` is set (typically |
| 4 | +``http://localhost:11434``). Also needs ``SWAY_OLLAMA_MODEL`` — the |
| 5 | +name of a model already pulled via ``ollama pull <name>``. A minimal |
| 6 | +run:: |
| 7 | + |
| 8 | + ollama pull llama3.2:1b |
| 9 | + ollama serve & |
| 10 | + SWAY_OLLAMA_URL=http://localhost:11434 \\ |
| 11 | + SWAY_OLLAMA_MODEL=llama3.2:1b \\ |
| 12 | + uv run pytest tests/integration/test_api_ollama.py -v |
| 13 | + |
| 14 | +What the test proves: |
| 15 | + |
| 16 | +1. The backend talks to a real OpenAI-compatible endpoint without |
| 17 | + crashing on any of its three scoring primitives |
| 18 | + (``logprob_of``, ``rolling_logprob``, ``next_token_dist``). |
| 19 | +2. Preflight passes (non-finite logprobs would surface here). |
| 20 | +3. Wall time per call is in a sane range — documents the latency |
| 21 | + budget the sprint's "≤3× HF backend, ≤1.5× with concurrent_probes=4" |
| 22 | + claim rests on. |
| 23 | + |
| 24 | +This test is the F7 claim's concrete backing: ``sway`` can score |
| 25 | +hosted-inference endpoints end-to-end, not just local HF loads. |
| 26 | +""" |
| 27 | + |
| 28 | +from __future__ import annotations |
| 29 | + |
| 30 | +import math |
| 31 | +import os |
| 32 | +import time |
| 33 | +from collections.abc import Iterator |
| 34 | + |
| 35 | +import pytest |
| 36 | + |
# Endpoint configuration is read once at import time; both values are
# required for any test in this module to run.
_ollama_url = os.getenv("SWAY_OLLAMA_URL")
_ollama_model = os.getenv("SWAY_OLLAMA_MODEL")

# Module-wide marks: every test here is slow, needs a live endpoint, and is
# skipped when either environment variable is unset or empty.
pytestmark = [
    pytest.mark.slow,
    pytest.mark.online,
    pytest.mark.skipif(
        not (_ollama_url and _ollama_model),
        reason="set SWAY_OLLAMA_URL + SWAY_OLLAMA_MODEL to run this test",
    ),
]

# Skip the whole module if these packages are not importable (presumably
# they are optional dependencies of the API backend -- confirm in the
# package metadata). This has to happen before the backend import below,
# which is why that import sits after executable code (hence E402).
pytest.importorskip("httpx")
pytest.importorskip("tenacity")

from dlm_sway.backends.api import ApiScoringBackend  # noqa: E402
| 53 | + |
| 54 | + |
@pytest.fixture(scope="module")
def backend() -> Iterator[ApiScoringBackend]:
    """Yield one ``ApiScoringBackend`` shared by all tests in this module.

    The backend is built from ``SWAY_OLLAMA_URL`` / ``SWAY_OLLAMA_MODEL``
    and closed once the module's tests have finished.
    """
    url, model = _ollama_url, _ollama_model
    # The module-level ``skipif`` mark already keeps tests from running when
    # either value is missing; these asserts only narrow ``str | None`` to
    # ``str`` for the type-checker.
    assert url is not None
    assert model is not None

    client = ApiScoringBackend(
        base_url=url,
        model_name=model,
        api_key=None,  # Ollama doesn't require auth by default
        max_retries=1,
        timeout_s=60.0,
    )
    yield client
    # Teardown: release the backend after the last test in the module.
    client.close()
| 68 | + |
| 69 | + |
def test_preflight_passes(backend: ApiScoringBackend) -> None:
    """The backend's finite-logprob preflight succeeds against the live endpoint."""
    passed, why = backend.preflight_finite_check()
    # Surface the backend's own explanation as the failure message.
    assert passed, why
| 73 | + |
| 74 | + |
def test_logprob_of_finite(backend: ApiScoringBackend) -> None:
    """``logprob_of`` returns a finite, strictly negative score.

    The wall time of the call is printed for reference only; it is not
    asserted on.
    """
    started = time.perf_counter()
    score = backend.logprob_of(
        prompt="The capital of France is",
        completion=" Paris.",
    )
    elapsed = time.perf_counter() - started
    print(f"\n logprob_of wall: {elapsed:.2f}s")

    assert math.isfinite(score)
    assert score < 0.0, "logprobs of any non-empty text are negative"
| 85 | + |
| 86 | + |
def test_rolling_logprob_shape(backend: ApiScoringBackend) -> None:
    """``rolling_logprob`` returns consistently shaped, finite results."""
    text = "Hello world. This is a sentence."
    rolled = backend.rolling_logprob(text)

    assert rolled.num_tokens >= 2
    # One logprob fewer than tokens -- presumably because the first token
    # has no preceding context to be scored against (confirm against the
    # backend's contract).
    assert rolled.logprobs.size == rolled.num_tokens - 1
    assert math.isfinite(rolled.total_logprob)
    assert math.isfinite(rolled.perplexity)
    assert rolled.perplexity > 1.0
| 94 | + |
| 95 | + |
def test_next_token_dist_shape(backend: ApiScoringBackend) -> None:
    """``next_token_dist`` returns a non-empty, finite, descending top-k.

    Checks that the distribution holds between 1 and ``top_k`` entries,
    that every logprob is finite, and that the entries are ordered from
    most to least probable.
    """
    # Local import: numpy is only needed by this test.
    import numpy as np

    top_k = 8
    dist = backend.next_token_dist("The quick brown fox jumps over the", top_k=top_k)

    # Without a lower bound, an empty distribution would satisfy every
    # check below vacuously (``np.all`` of an empty array is True).
    assert dist.logprobs.size >= 1, "next_token_dist returned an empty distribution"
    assert dist.logprobs.size <= top_k
    assert np.all(np.isfinite(dist.logprobs))
    # Descending by probability (1e-6 slack for float noise between ties).
    assert np.all(np.diff(dist.logprobs) <= 1e-6)