@@ -25,6 +25,48 @@ def _offline_mode() -> bool: |
| 25 | 25 | return os.environ.get("SWAY_OFFLINE", "0") == "1" |
| 26 | 26 | |
| 27 | 27 | |
def _snapshot_download_with_retry(**kwargs: object) -> str:
    """``snapshot_download`` wrapped with bounded exponential-backoff retry.

    F03 (Audit 03) observed an integration-lane macOS run that hung
    20+ minutes inside ``snapshot_download``'s cache-resolution path
    after HF Hub connectivity briefly dropped. A silent stall is the
    worst UX: the job times out with zero test output and no
    actionable error. The retry wrapper turns a transient network
    blip into a short, logged back-off and a final failure that
    surfaces cleanly.

    Retry policy:

    * At most 3 attempts, retrying only on ``OSError`` / ``RuntimeError``.
    * ``wait_exponential(multiplier=5, min=5, max=30)`` with 3 attempts
      means two sleeps: 5 s after the first failure, 10 s after the
      second. There is no jitter.
    * Every retry is logged at WARNING level before sleeping.
    * When ``local_files_only`` is truthy only a single attempt is made:
      no network is involved, so a failure is not transient and
      sleeping would only delay the error.

    ``etag_timeout`` defaults to 10 s here; a value passed by the caller
    takes precedence. NOTE(review): ``etag_timeout`` bounds only the
    metadata probe -- this wrapper does not enforce an overall
    per-attempt wall-clock timeout; confirm whether one is needed.

    Args:
        **kwargs: Forwarded verbatim to
            ``huggingface_hub.snapshot_download``.

    Returns:
        The local snapshot directory path returned by
        ``snapshot_download``.

    Raises:
        Exception: The last exception raised by ``snapshot_download``
            is re-raised unchanged (``reraise=True``) once the attempts
            are exhausted, or immediately if it is not a retryable type.
    """
    import logging

    from huggingface_hub import snapshot_download
    from tenacity import (
        Retrying,
        before_sleep_log,
        retry_if_exception_type,
        stop_after_attempt,
        wait_exponential,
    )

    # Treat ``etag_timeout`` as an overridable default. Passing it as a
    # literal keyword next to ``**kwargs`` would raise ``TypeError``
    # (duplicate keyword) if a caller ever supplied their own value.
    call_kwargs: dict[str, object] = {"etag_timeout": 10, **kwargs}

    # Offline lookups are deterministic: a cache miss will not heal on
    # retry, so do not burn the back-off sleeps on it.
    max_attempts = 1 if call_kwargs.get("local_files_only") else 3

    retry_types: tuple[type[BaseException], ...] = (OSError, RuntimeError)
    for attempt in Retrying(
        stop=stop_after_attempt(max_attempts),
        wait=wait_exponential(multiplier=5, min=5, max=30),
        retry=retry_if_exception_type(retry_types),
        # Make each retry visible in the test log instead of stalling silently.
        before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
        reraise=True,
    ):
        with attempt:
            # ``etag_timeout`` bounds the per-file head/etag probe; it is
            # pinned explicitly rather than left to the library default.
            result: str = snapshot_download(**call_kwargs)  # type: ignore[arg-type]
            return result
    # ``reraise=True`` means the Retrying loop always either returns
    # (above) or propagates the last exception — this line is
    # unreachable, but keeps mypy happy with a pointed sentinel.
    raise RuntimeError("snapshot_download retry loop exhausted without a return")
| 68 | + |
| 69 | + |
| 28 | 70 | @pytest.fixture(scope="session") |
| 29 | 71 | def tiny_model_dir(tmp_path_factory: pytest.TempPathFactory) -> Iterator[Path]: |
| 30 | 72 | """Download (or reuse) the tiny model; yield the cached directory. |
@@ -33,15 +75,13 @@ def tiny_model_dir(tmp_path_factory: pytest.TempPathFactory) -> Iterator[Path]: |
| 33 | 75 | env vars are cleared inside this fixture so ``snapshot_download`` |
| 34 | 76 | actually fetches. |
| 35 | 77 | """ |
| 36 | | - from huggingface_hub import snapshot_download |
| 37 | | - |
| 38 | 78 | # Clear offline env guards (set by the unit-test autouse fixture). |
| 39 | 79 | prior = { |
| 40 | 80 | k: os.environ.pop(k, None) |
| 41 | 81 | for k in ("HF_HUB_OFFLINE", "TRANSFORMERS_OFFLINE", "HF_DATASETS_OFFLINE") |
| 42 | 82 | } |
| 43 | 83 | try: |
| 44 | | - path = snapshot_download( |
| 84 | + path = _snapshot_download_with_retry( |
| 45 | 85 | repo_id=TINY_MODEL_HF_ID, |
| 46 | 86 | revision=TINY_MODEL_REVISION, |
| 47 | 87 | local_files_only=_offline_mode(), |