From 08ef0eaf2d3712e34fb4ef238344a417983098b8 Mon Sep 17 00:00:00 2001 From: przemarzec <98286080+przemarzec@users.noreply.github.com> Date: Fri, 5 Jun 2026 09:13:10 +0200 Subject: [PATCH 1/2] ci: cache HuggingFace models and pip to stop HF 429 failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The synthetic-benchmark tests load a real sentence-transformers model (all-MiniLM-L6-v2) on purpose, so CI downloads it from the Hub. With no caching, every test-matrix job and the smoke gate re-downloaded it, and the concurrent requests tripped HF rate limiting (HTTP 429), failing the model-dependent tests intermittently. Add caching across the workflows that install the embeddings stack: - cache: "pip" on setup-python in ci (lint/typecheck/test), smoke-gate, release (publish) and upgrade-test — reuses the torch/sentence-transformers download. - actions/cache on ~/.cache/huggingface in the model-loading jobs (ci test matrix, smoke-gate, release publish), keyed on the model — the model is fetched once and reused. - On a cache hit, set HF_HUB_OFFLINE/TRANSFORMERS_OFFLINE so no Hub request is made at all; gated on cache-hit so a cold first run can still download once (an empty cache + offline fails to find the files). No test logic changes — the benchmark tests still exercise the real model, now served from cache. Also corrects the stale pyproject [dev] comment that claimed a CI cache existed before this change. Verified locally: with the model cached, HF_HUB_OFFLINE=1 loads it with zero network, and the evaluator benchmark tests pass offline. 
--- .github/workflows/ci.yml | 23 +++++++++++++++++++++++ .github/workflows/release.yml | 14 ++++++++++++++ .github/workflows/smoke-gate.yml | 16 ++++++++++++++++ .github/workflows/upgrade-test.yml | 1 + pyproject.toml | 6 ++++-- 5 files changed, 58 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 80ae685..eae3858 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,6 +17,7 @@ jobs: - uses: actions/setup-python@v5 with: python-version: "3.12" + cache: "pip" - name: Install dependencies run: pip install -e ".[dev]" - name: Ruff check @@ -31,6 +32,7 @@ jobs: - uses: actions/setup-python@v5 with: python-version: "3.12" + cache: "pip" - name: Install dependencies run: pip install -e ".[dev]" - name: Mypy @@ -46,7 +48,28 @@ jobs: - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + cache: "pip" + # Cache the HuggingFace model store. The benchmark tests load a real + # sentence-transformers model (all-MiniLM-L6-v2) on purpose; without this + # cache every matrix job re-downloads it from the Hub, and the concurrent + # requests across the matrix trip HF rate limiting (HTTP 429), which fails + # the model-dependent tests. With the cache the model is fetched once and + # reused; on a cache hit we additionally go offline so no Hub request is + # made at all. + - name: Cache HuggingFace models + id: hf-cache + uses: actions/cache@v4 + with: + path: ~/.cache/huggingface + key: hf-${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2-v1 - name: Install dependencies run: pip install -e ".[dev]" - name: Run tests + env: + # Only force offline when the cache is populated (a cold first run must + # still be allowed to download the model once). An empty cache + offline + # would fail with "couldn't find the requested files in the local cache". 
+ HF_HUB_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }} + TRANSFORMERS_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }} + HF_HUB_DISABLE_TELEMETRY: "1" run: pytest --cov --cov-fail-under=90 -q diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4d5aabf..05afd77 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -109,7 +109,21 @@ jobs: - uses: actions/setup-python@v5 with: python-version: "3.12" + cache: "pip" + # The pre-publish smoke gate loads the real all-MiniLM-L6-v2 model; cache + # the Hub store (and go offline on a cache hit) so a release is never + # blocked by transient HF 429 rate limiting. + - name: Cache HuggingFace models + id: hf-cache + uses: actions/cache@v4 + with: + path: ~/.cache/huggingface + key: hf-${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2-v1 - name: Pre-publish smoke gate + env: + HF_HUB_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }} + TRANSFORMERS_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }} + HF_HUB_DISABLE_TELEMETRY: "1" run: | pip install -e ".[embeddings-local]" python scripts/check_smoke_gate.py diff --git a/.github/workflows/smoke-gate.yml b/.github/workflows/smoke-gate.yml index f7ce4c7..82df8d3 100644 --- a/.github/workflows/smoke-gate.yml +++ b/.github/workflows/smoke-gate.yml @@ -24,6 +24,18 @@ jobs: - uses: actions/setup-python@v5 with: python-version: "3.12" + cache: "pip" + + # The smoke gate runs the synthetic benchmark, which loads the real + # all-MiniLM-L6-v2 model. Cache the Hub store so the model is not + # re-downloaded each run (avoids HF 429 rate limiting); go offline on a + # cache hit so no Hub request is made. 
+ - name: Cache HuggingFace models + id: hf-cache + uses: actions/cache@v4 + with: + path: ~/.cache/huggingface + key: hf-${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2-v1 - name: Install package with embeddings extra run: pip install -e ".[embeddings-local]" @@ -31,4 +43,8 @@ jobs: - name: Run pre-publish smoke gate # Same committed floors as the publish path # (scripts/check_smoke_gate.py). A breach fails the run. + env: + HF_HUB_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }} + TRANSFORMERS_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }} + HF_HUB_DISABLE_TELEMETRY: "1" run: python scripts/check_smoke_gate.py diff --git a/.github/workflows/upgrade-test.yml b/.github/workflows/upgrade-test.yml index b0f474b..4c2c3ee 100644 --- a/.github/workflows/upgrade-test.yml +++ b/.github/workflows/upgrade-test.yml @@ -23,6 +23,7 @@ jobs: - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + cache: "pip" - name: Check the baseline version is published id: baseline diff --git a/pyproject.toml b/pyproject.toml index 539e058..2bd16a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,8 +65,10 @@ dev = [ # version range as the ``embeddings-local`` extra so both # extras resolve together (``pip install -e .[dev,embeddings-local]`` # used to fail with a resolver conflict before this alignment). - # ``sentence-transformers`` pulls torch transitively (~500 MB) — - # the CI cache absorbs that cost after the first cold install. + # ``sentence-transformers`` pulls torch transitively (~500 MB). CI caches + # both the pip download (``cache: pip``) and the HuggingFace model store + # (``actions/cache`` on ``~/.cache/huggingface``), so this cost is paid once + # and the model-loading tests do not re-hit the Hub (avoids HF 429). 
"sentence-transformers>=3.0.0", # Install the optional sqlite-vec extension in the test environment so # CI actually exercises the real extension-load path (the worker-thread From 5a51ce9fae9e79e0a48ebe27276f180b262ddd69 Mon Sep 17 00:00:00 2001 From: przemarzec <98286080+przemarzec@users.noreply.github.com> Date: Fri, 5 Jun 2026 09:53:48 +0200 Subject: [PATCH 2/2] ci: warm HF cache in a single upstream job + tidy sqlite-vec comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review of the HF/pip cache change: - ci.yml: move the HuggingFace model download into a dedicated warm-hf-cache job that runs once before the test matrix (test now `needs: warm-hf-cache`). On a cold cache this guarantees a single, serialized model download for the whole workflow instead of three concurrent downloads across the matrix legs (which could still trip HF 429 and, if all legs failed, never populate the cache). The matrix jobs restore the warmed cache and run offline on a cache hit. Add actions/checkout before setup-python in the warm job so the pip cache key has the dependency files to hash. - pyproject.toml: reword the sqlite-vec dev-dependency comment — the extension exercises KNN search over the compact vec0 table (faster brute-force KNN, not ANN at the pinned 0.1.x line), and fix the unbalanced parenthesis. Verified locally: actionlint clean on all workflows, ruff clean, and the benchmark module (which loads the real model) passes offline — 30 passed. --- .github/workflows/ci.yml | 49 +++++++++++++++++++++++++++++++--------- pyproject.toml | 7 +++--- 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index eae3858..b77c2ff 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,7 +38,38 @@ jobs: - name: Mypy run: mypy --strict src/ + # Warm the HuggingFace model cache exactly once before the test matrix. 
+ # The benchmark tests load a real sentence-transformers model + # (all-MiniLM-L6-v2) on purpose. If the matrix legs each populated the cache + # themselves, a cold cache means all three legs miss and download the model + # concurrently — tripping HF rate limiting (HTTP 429), and if every leg fails + # the cache is never saved. This single upstream job does at most one download + # per cold workflow run, saves the shared cache, and the matrix (which `needs` + # it) then restores a warm cache and runs fully offline. + warm-hf-cache: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: "pip" + - name: Restore/Save HuggingFace cache + id: hf-cache + uses: actions/cache@v4 + with: + path: ~/.cache/huggingface + key: hf-${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2-v1 + - name: Download model on cache miss + if: steps.hf-cache.outputs.cache-hit != 'true' + # Only the minimal stack needed to fetch + load the model. This is the + # single, serialized download for the whole workflow on a cold cache. + run: | + pip install "sentence-transformers>=3.0.0" + python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')" + test: + needs: warm-hf-cache runs-on: ubuntu-latest strategy: matrix: @@ -49,14 +80,9 @@ jobs: with: python-version: ${{ matrix.python-version }} cache: "pip" - # Cache the HuggingFace model store. The benchmark tests load a real - # sentence-transformers model (all-MiniLM-L6-v2) on purpose; without this - # cache every matrix job re-downloads it from the Hub, and the concurrent - # requests across the matrix trip HF rate limiting (HTTP 429), which fails - # the model-dependent tests. With the cache the model is fetched once and - # reused; on a cache hit we additionally go offline so no Hub request is - # made at all. - - name: Cache HuggingFace models + # Restore the cache warmed by `warm-hf-cache`. 
After warming this is a hit + # on every matrix leg, so the tests run offline and make no Hub request. + - name: Restore HuggingFace models id: hf-cache uses: actions/cache@v4 with: @@ -66,9 +92,10 @@ jobs: run: pip install -e ".[dev]" - name: Run tests env: - # Only force offline when the cache is populated (a cold first run must - # still be allowed to download the model once). An empty cache + offline - # would fail with "couldn't find the requested files in the local cache". + # Force offline only on a cache hit (the warm-hf-cache job populates it + # first, so legs hit). A defensive '0' on a miss lets a leg self-heal + # by downloading rather than hard-failing — but the warm job makes a + # miss here unexpected. HF_HUB_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }} TRANSFORMERS_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }} HF_HUB_DISABLE_TELEMETRY: "1" diff --git a/pyproject.toml b/pyproject.toml index 2bd16a9..788e470 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,9 +71,10 @@ dev = [ # and the model-loading tests do not re-hit the Hub (avoids HF 429). "sentence-transformers>=3.0.0", # Install the optional sqlite-vec extension in the test environment so - # CI actually exercises the real extension-load path (the worker-thread - # load and ANN search), not just the numpy fallback. Mirrors the - # constraint declared in the ``vec`` extra above. + # CI actually exercises the real extension-load path — the worker-thread + # load and KNN search over the compact vec0 table — not just the numpy + # fallback. At the pinned 0.1.x line this is faster brute-force KNN, not + # ANN. Mirrors the constraint declared in the ``vec`` extra above. "sqlite-vec>=0.1.0,<0.2.0", ]