From 08ef0eaf2d3712e34fb4ef238344a417983098b8 Mon Sep 17 00:00:00 2001
From: przemarzec <98286080+przemarzec@users.noreply.github.com>
Date: Fri, 5 Jun 2026 09:13:10 +0200
Subject: [PATCH 1/2] ci: cache HuggingFace models and pip to stop HF 429
 failures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The synthetic-benchmark tests load a real sentence-transformers model
(all-MiniLM-L6-v2) on purpose, so CI downloads it from the Hub. With no caching,
every test-matrix job and the smoke gate re-downloaded it, and the concurrent
requests tripped HF rate limiting (HTTP 429), failing the model-dependent tests
intermittently.

Add caching across the workflows that install the embeddings stack:
- cache: "pip" on setup-python in ci (lint/typecheck/test), smoke-gate,
  release (publish) and upgrade-test — reuses the torch/sentence-transformers
  download.
- actions/cache on ~/.cache/huggingface in the model-loading jobs (ci test
  matrix, smoke-gate, release publish), keyed on the model — the model is fetched
  once and reused.
- On a cache hit, set HF_HUB_OFFLINE/TRANSFORMERS_OFFLINE so no Hub request is
  made at all. The flags are gated on cache-hit so a cold first run can still
  download the model once (offline mode with an empty cache fails, because the
  files cannot be found locally).

No test logic changes — the benchmark tests still exercise the real model, now
served from cache. Also corrects the stale pyproject [dev] comment that claimed a
CI cache existed before this change. Verified locally: with the model cached,
HF_HUB_OFFLINE=1 loads it with zero network, and the evaluator benchmark tests
pass offline.
---
 .github/workflows/ci.yml           | 23 +++++++++++++++++++++++
 .github/workflows/release.yml      | 14 ++++++++++++++
 .github/workflows/smoke-gate.yml   | 16 ++++++++++++++++
 .github/workflows/upgrade-test.yml |  1 +
 pyproject.toml                     |  6 ++++--
 5 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 80ae685..eae3858 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -17,6 +17,7 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: "3.12"
+          cache: "pip"
       - name: Install dependencies
         run: pip install -e ".[dev]"
       - name: Ruff check
@@ -31,6 +32,7 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: "3.12"
+          cache: "pip"
       - name: Install dependencies
         run: pip install -e ".[dev]"
       - name: Mypy
@@ -46,7 +48,28 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
+          cache: "pip"
+      # Cache the HuggingFace model store. The benchmark tests load a real
+      # sentence-transformers model (all-MiniLM-L6-v2) on purpose; without this
+      # cache every matrix job re-downloads it from the Hub, and the concurrent
+      # requests across the matrix trip HF rate limiting (HTTP 429), which fails
+      # the model-dependent tests. With the cache the model is fetched once and
+      # reused; on a cache hit we additionally go offline so no Hub request is
+      # made at all.
+      - name: Cache HuggingFace models
+        id: hf-cache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/huggingface
+          key: hf-${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2-v1
       - name: Install dependencies
         run: pip install -e ".[dev]"
       - name: Run tests
+        env:
+          # Only force offline when the cache is populated (a cold first run must
+          # still be allowed to download the model once). An empty cache + offline
+          # would fail with "couldn't find the requested files in the local cache".
+          HF_HUB_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }}
+          TRANSFORMERS_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }}
+          HF_HUB_DISABLE_TELEMETRY: "1"
         run: pytest --cov --cov-fail-under=90 -q
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 4d5aabf..05afd77 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -109,7 +109,21 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: "3.12"
+          cache: "pip"
+      # The pre-publish smoke gate loads the real all-MiniLM-L6-v2 model; cache
+      # the Hub store (and go offline on a cache hit) so a release is never
+      # blocked by transient HF 429 rate limiting.
+      - name: Cache HuggingFace models
+        id: hf-cache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/huggingface
+          key: hf-${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2-v1
       - name: Pre-publish smoke gate
+        env:
+          HF_HUB_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }}
+          TRANSFORMERS_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }}
+          HF_HUB_DISABLE_TELEMETRY: "1"
         run: |
           pip install -e ".[embeddings-local]"
           python scripts/check_smoke_gate.py
diff --git a/.github/workflows/smoke-gate.yml b/.github/workflows/smoke-gate.yml
index f7ce4c7..82df8d3 100644
--- a/.github/workflows/smoke-gate.yml
+++ b/.github/workflows/smoke-gate.yml
@@ -24,6 +24,18 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: "3.12"
+          cache: "pip"
+
+      # The smoke gate runs the synthetic benchmark, which loads the real
+      # all-MiniLM-L6-v2 model. Cache the Hub store so the model is not
+      # re-downloaded each run (avoids HF 429 rate limiting); go offline on a
+      # cache hit so no Hub request is made.
+      - name: Cache HuggingFace models
+        id: hf-cache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/huggingface
+          key: hf-${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2-v1
 
       - name: Install package with embeddings extra
         run: pip install -e ".[embeddings-local]"
@@ -31,4 +43,8 @@ jobs:
       - name: Run pre-publish smoke gate
         # Same committed floors as the publish path
         # (scripts/check_smoke_gate.py). A breach fails the run.
+        env:
+          HF_HUB_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }}
+          TRANSFORMERS_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }}
+          HF_HUB_DISABLE_TELEMETRY: "1"
         run: python scripts/check_smoke_gate.py
diff --git a/.github/workflows/upgrade-test.yml b/.github/workflows/upgrade-test.yml
index b0f474b..4c2c3ee 100644
--- a/.github/workflows/upgrade-test.yml
+++ b/.github/workflows/upgrade-test.yml
@@ -23,6 +23,7 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
+          cache: "pip"
 
       - name: Check the baseline version is published
         id: baseline
diff --git a/pyproject.toml b/pyproject.toml
index 539e058..2bd16a9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,8 +65,10 @@ dev = [
     # version range as the ``embeddings-local`` extra so both
     # extras resolve together (``pip install -e .[dev,embeddings-local]``
     # used to fail with a resolver conflict before this alignment).
-    # ``sentence-transformers`` pulls torch transitively (~500 MB) —
-    # the CI cache absorbs that cost after the first cold install.
+    # ``sentence-transformers`` pulls torch transitively (~500 MB). CI caches
+    # both the pip download (``cache: pip``) and the HuggingFace model store
+    # (``actions/cache`` on ``~/.cache/huggingface``), so this cost is paid once
+    # and the model-loading tests do not re-hit the Hub (avoids HF 429).
     "sentence-transformers>=3.0.0",
     # Install the optional sqlite-vec extension in the test environment so
     # CI actually exercises the real extension-load path (the worker-thread

From 5a51ce9fae9e79e0a48ebe27276f180b262ddd69 Mon Sep 17 00:00:00 2001
From: przemarzec <98286080+przemarzec@users.noreply.github.com>
Date: Fri, 5 Jun 2026 09:53:48 +0200
Subject: [PATCH 2/2] ci: warm HF cache in a single upstream job + tidy
 sqlite-vec comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address review feedback on the HF/pip cache change:

- ci.yml: move the HuggingFace model download into a dedicated warm-hf-cache job
  that runs once before the test matrix (test now `needs: warm-hf-cache`). On a
  cold cache this guarantees a single, serialized model download for the whole
  workflow instead of three concurrent downloads across the matrix legs (which
  could still trip HF 429 and, if all legs failed, never populate the cache). The
  matrix jobs restore the warmed cache and run offline on a cache hit. Add
  actions/checkout before setup-python in the warm job so the pip cache key has
  the dependency files to hash.
- pyproject.toml: reword the sqlite-vec dev-dependency comment to state that the
  extension exercises KNN search over the compact vec0 table (faster brute-force
  KNN, not ANN, at the pinned 0.1.x line), and fix its unbalanced parenthesis.

Verified locally: actionlint clean on all workflows, ruff clean, and the
benchmark module (which loads the real model) passes offline — 30 passed.
---
 .github/workflows/ci.yml | 49 +++++++++++++++++++++++++++++++---------
 pyproject.toml           |  7 +++---
 2 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index eae3858..b77c2ff 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -38,7 +38,38 @@ jobs:
       - name: Mypy
         run: mypy --strict src/
 
+  # Warm the HuggingFace model cache exactly once before the test matrix.
+  # The benchmark tests load a real sentence-transformers model
+  # (all-MiniLM-L6-v2) on purpose. If the matrix legs each populated the cache
+  # themselves, a cold cache means all three legs miss and download the model
+  # concurrently — tripping HF rate limiting (HTTP 429), and if every leg fails
+  # the cache is never saved. This single upstream job does at most one download
+  # per cold workflow run, saves the shared cache, and the matrix (which `needs`
+  # it) then restores a warm cache and runs fully offline.
+  warm-hf-cache:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+          cache: "pip"
+      - name: Restore/Save HuggingFace cache
+        id: hf-cache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/huggingface
+          key: hf-${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2-v1
+      - name: Download model on cache miss
+        if: steps.hf-cache.outputs.cache-hit != 'true'
+        # Only the minimal stack needed to fetch + load the model. This is the
+        # single, serialized download for the whole workflow on a cold cache.
+        run: |
+          pip install "sentence-transformers>=3.0.0"
+          python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"
+
   test:
+    needs: warm-hf-cache
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -49,14 +80,9 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
           cache: "pip"
-      # Cache the HuggingFace model store. The benchmark tests load a real
-      # sentence-transformers model (all-MiniLM-L6-v2) on purpose; without this
-      # cache every matrix job re-downloads it from the Hub, and the concurrent
-      # requests across the matrix trip HF rate limiting (HTTP 429), which fails
-      # the model-dependent tests. With the cache the model is fetched once and
-      # reused; on a cache hit we additionally go offline so no Hub request is
-      # made at all.
-      - name: Cache HuggingFace models
+      # Restore the cache warmed by `warm-hf-cache`. After warming this is a hit
+      # on every matrix leg, so the tests run offline and make no Hub request.
+      - name: Restore HuggingFace models
         id: hf-cache
         uses: actions/cache@v4
         with:
@@ -66,9 +92,10 @@ jobs:
         run: pip install -e ".[dev]"
       - name: Run tests
         env:
-          # Only force offline when the cache is populated (a cold first run must
-          # still be allowed to download the model once). An empty cache + offline
-          # would fail with "couldn't find the requested files in the local cache".
+          # Force offline only on a cache hit (the warm-hf-cache job populates it
+          # first, so legs hit). A defensive '0' on a miss lets a leg self-heal
+          # by downloading rather than hard-failing — but the warm job makes a
+          # miss here unexpected.
           HF_HUB_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }}
           TRANSFORMERS_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }}
           HF_HUB_DISABLE_TELEMETRY: "1"
diff --git a/pyproject.toml b/pyproject.toml
index 2bd16a9..788e470 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -71,9 +71,10 @@ dev = [
     # and the model-loading tests do not re-hit the Hub (avoids HF 429).
     "sentence-transformers>=3.0.0",
     # Install the optional sqlite-vec extension in the test environment so
-    # CI actually exercises the real extension-load path (the worker-thread
-    # load and ANN search), not just the numpy fallback.  Mirrors the
-    # constraint declared in the ``vec`` extra above.
+    # CI actually exercises the real extension-load path rather than only the
+    # numpy fallback: the worker-thread load and KNN search over the compact
+    # vec0 table (faster brute-force KNN, not ANN, at the pinned 0.1.x line).
+    # Mirrors the constraint declared in the ``vec`` extra above.
     "sqlite-vec>=0.1.0,<0.2.0",
 ]