@@ -25,6 +25,48 @@ def _offline_mode() -> bool: |
| 25 | return os.environ.get("SWAY_OFFLINE", "0") == "1" | 25 | return os.environ.get("SWAY_OFFLINE", "0") == "1" |
| 26 | | 26 | |
| 27 | | 27 | |
def _snapshot_download_with_retry(**kwargs: object) -> str:
    """Call ``snapshot_download`` with a bounded retry on transient failures.

    F03 (Audit 03) observed an integration-lane macOS run that hung
    20+ minutes inside ``snapshot_download``'s cache-resolution path
    after HF Hub connectivity briefly dropped. A silent stall is the
    worst UX: the job times out with zero test output and no
    actionable error. This wrapper turns a transient failure into a
    short, bounded back-off followed by a clean, re-raised error.

    Behaviour:

    * ``etag_timeout`` defaults to 10 s so the per-file head/etag probe
      cannot stall for long. A caller-supplied ``etag_timeout`` takes
      precedence instead of colliding with the default.
    * At most 3 attempts are made. The waits between them are 5 s and
      then 10 s (plain exponential backoff, no jitter), i.e. roughly
      15 s of sleeping in the worst case.
    * Only ``OSError`` and ``RuntimeError`` trigger a retry; the last
      such exception is re-raised unchanged once attempts run out. Any
      other exception propagates immediately.
    * When ``local_files_only`` is truthy the call is made exactly once:
      no network is involved, so a cache miss is deterministic and
      retrying would only delay the error.

    Note that only the etag probe is time-bounded here. No overall
    per-attempt deadline is enforced, so a stall that never raises is
    not interrupted by this wrapper.

    Args:
        **kwargs: Forwarded verbatim to
            ``huggingface_hub.snapshot_download`` (e.g. ``repo_id``,
            ``revision``, ``local_files_only``).

    Returns:
        The value returned by ``snapshot_download`` (the local snapshot
        directory path).

    Raises:
        OSError, RuntimeError: The last failure after all attempts are
            exhausted (or the first failure in ``local_files_only`` mode).
    """
    from huggingface_hub import snapshot_download
    from tenacity import (
        Retrying,
        retry_if_exception_type,
        stop_after_attempt,
        wait_exponential,
    )

    # Merge rather than pass ``etag_timeout=10`` next to ``**kwargs``: the
    # latter raises ``TypeError`` (duplicate keyword) as soon as a caller
    # forwards its own ``etag_timeout``. Caller-supplied values win.
    # NOTE(review): an earlier comment cited a 120 s library default for
    # ``etag_timeout``; recent huggingface_hub releases appear to document
    # 10 s. Confirm against the pinned version.
    call_kwargs: dict[str, object] = {"etag_timeout": 10, **kwargs}

    if call_kwargs.get("local_files_only"):
        # Offline / cache-only lookup: nothing transient to wait out.
        offline_path: str = snapshot_download(**call_kwargs)  # type: ignore[arg-type]
        return offline_path

    # NOTE(review): ``OSError`` is broad and presumably also matches
    # non-transient Hub errors (e.g. HTTP 4xx surfaced through
    # OSError-derived exception classes). Confirm whether those should
    # be excluded from the retry set.
    retry_types: tuple[type[BaseException], ...] = (OSError, RuntimeError)
    for attempt in Retrying(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=5, min=5, max=30),
        retry=retry_if_exception_type(retry_types),
        reraise=True,
    ):
        with attempt:
            result: str = snapshot_download(**call_kwargs)  # type: ignore[arg-type]
            return result
    # ``reraise=True`` means the Retrying loop always either returns
    # (above) or propagates the last exception, so this line is
    # unreachable; it exists so type checkers see every path end in a
    # return or a raise.
    raise RuntimeError("snapshot_download retry loop exhausted without a return")
| | 68 | + |
| | 69 | + |
| 28 | @pytest.fixture(scope="session") | 70 | @pytest.fixture(scope="session") |
| 29 | def tiny_model_dir(tmp_path_factory: pytest.TempPathFactory) -> Iterator[Path]: | 71 | def tiny_model_dir(tmp_path_factory: pytest.TempPathFactory) -> Iterator[Path]: |
| 30 | """Download (or reuse) the tiny model; yield the cached directory. | 72 | """Download (or reuse) the tiny model; yield the cached directory. |
@@ -33,15 +75,13 @@ def tiny_model_dir(tmp_path_factory: pytest.TempPathFactory) -> Iterator[Path]: |
| 33 | env vars are cleared inside this fixture so ``snapshot_download`` | 75 | env vars are cleared inside this fixture so ``snapshot_download`` |
| 34 | actually fetches. | 76 | actually fetches. |
| 35 | """ | 77 | """ |
| 36 | - from huggingface_hub import snapshot_download | | |
| 37 | - | | |
| 38 | # Clear offline env guards (set by the unit-test autouse fixture). | 78 | # Clear offline env guards (set by the unit-test autouse fixture). |
| 39 | prior = { | 79 | prior = { |
| 40 | k: os.environ.pop(k, None) | 80 | k: os.environ.pop(k, None) |
| 41 | for k in ("HF_HUB_OFFLINE", "TRANSFORMERS_OFFLINE", "HF_DATASETS_OFFLINE") | 81 | for k in ("HF_HUB_OFFLINE", "TRANSFORMERS_OFFLINE", "HF_DATASETS_OFFLINE") |
| 42 | } | 82 | } |
| 43 | try: | 83 | try: |
| 44 | - path = snapshot_download( | 84 | + path = _snapshot_download_with_retry( |
| 45 | repo_id=TINY_MODEL_HF_ID, | 85 | repo_id=TINY_MODEL_HF_ID, |
| 46 | revision=TINY_MODEL_REVISION, | 86 | revision=TINY_MODEL_REVISION, |
| 47 | local_files_only=_offline_mode(), | 87 | local_files_only=_offline_mode(), |