sovantica · przemarzec · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026 · Jun 4, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -17,6 +17,7 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: "3.12"
+          cache: "pip"
       - name: Install dependencies
         run: pip install -e ".[dev]"
       - name: Ruff check
@@ -31,12 +32,44 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: "3.12"
+          cache: "pip"
       - name: Install dependencies
         run: pip install -e ".[dev]"
       - name: Mypy
         run: mypy --strict src/
 
+  # Pre-populate the HuggingFace model cache in one job ahead of the test
+  # matrix. The benchmark tests deliberately load a real sentence-transformers
+  # model (all-MiniLM-L6-v2). Were each matrix leg to fill the cache itself, a
+  # cold cache would make every leg miss and fetch the model at the same time,
+  # which trips HF rate limiting (HTTP 429); and when every leg fails, no cache
+  # is saved at all. Running the download here caps it at one per cold
+  # workflow run and saves the shared cache; the matrix `needs` this job, so it
+  # restores the warm cache and runs fully offline.
+  warm-hf-cache:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+      - uses: actions/setup-python@v5
+        with:
+          cache: "pip"
+          python-version: "3.12"
+      - id: hf-cache
+        name: Restore/Save HuggingFace cache
+        uses: actions/cache@v4
+        with:
+          key: hf-${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2-v1
+          path: ~/.cache/huggingface
+      # Runs only when the restore above missed. Installs just the minimal
+      # stack needed to fetch + load the model.
+      - name: Download model on cache miss
+        if: steps.hf-cache.outputs.cache-hit != 'true'
+        run: |
+          pip install "sentence-transformers>=3.0.0"
+          python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"
+
   test:
+    needs: warm-hf-cache
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -46,7 +79,24 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
+          cache: "pip"
+      # Restore the cache warmed by `warm-hf-cache`. After warming this is a hit
+      # on every matrix leg, so the tests run offline and make no Hub request.
+      - name: Restore HuggingFace models
+        id: hf-cache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/huggingface
+          key: hf-${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2-v1
       - name: Install dependencies
         run: pip install -e ".[dev]"
       - name: Run tests
+        env:
+          # Force offline only on a cache hit (the warm-hf-cache job populates it
+          # first, so legs hit). A defensive '0' on a miss lets a leg self-heal
+          # by downloading rather than hard-failing — but the warm job makes a
+          # miss here unexpected.
+          HF_HUB_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }}
+          TRANSFORMERS_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }}
+          HF_HUB_DISABLE_TELEMETRY: "1"
         run: pytest --cov --cov-fail-under=90 -q
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -109,7 +109,21 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: "3.12"
+          cache: "pip"
+      # The pre-publish smoke gate loads the real all-MiniLM-L6-v2 model; cache
+      # the Hub store (and go offline on a cache hit) so that, once the cache is
+      # warm, a release is not blocked by transient HF 429 rate limiting.
+      - name: Cache HuggingFace models
+        id: hf-cache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/huggingface
+          key: hf-${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2-v1
       - name: Pre-publish smoke gate
+        env:
+          HF_HUB_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }}
+          TRANSFORMERS_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }}
+          HF_HUB_DISABLE_TELEMETRY: "1"
         run: |
           pip install -e ".[embeddings-local]"
           python scripts/check_smoke_gate.py

diff --git a/.github/workflows/smoke-gate.yml b/.github/workflows/smoke-gate.yml
@@ -24,11 +24,27 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: "3.12"
+          cache: "pip"
+
+      # The smoke gate runs the synthetic benchmark, which loads the real
+      # all-MiniLM-L6-v2 model. Cache the Hub store so the model is not
+      # re-downloaded each run (avoids HF 429 rate limiting); go offline on a
+      # cache hit so no Hub request is made.
+      - name: Cache HuggingFace models
+        id: hf-cache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/huggingface
+          key: hf-${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2-v1
 
       - name: Install package with embeddings extra
         run: pip install -e ".[embeddings-local]"
 
       - name: Run pre-publish smoke gate
         # Same committed floors as the publish path
         # (scripts/check_smoke_gate.py). A breach fails the run.
+        env:
+          HF_HUB_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }}
+          TRANSFORMERS_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }}
+          HF_HUB_DISABLE_TELEMETRY: "1"
         run: python scripts/check_smoke_gate.py
diff --git a/.github/workflows/upgrade-test.yml b/.github/workflows/upgrade-test.yml
@@ -23,6 +23,7 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
+          cache: "pip"
 
       - name: Check the baseline version is published
         id: baseline

diff --git a/pyproject.toml b/pyproject.toml
@@ -65,13 +65,16 @@ dev = [
     # version range as the ``embeddings-local`` extra so both
     # extras resolve together (``pip install -e .[dev,embeddings-local]``
     # used to fail with a resolver conflict before this alignment).
-    # ``sentence-transformers`` pulls torch transitively (~500 MB) —
-    # the CI cache absorbs that cost after the first cold install.
+    # ``sentence-transformers`` pulls torch transitively (~500 MB). CI caches
+    # both the pip download (``cache: pip``) and the HuggingFace model store
+    # (``actions/cache`` on ``~/.cache/huggingface``), so this cost is paid once
+    # and the model-loading tests do not re-hit the Hub (avoids HF 429).
     "sentence-transformers>=3.0.0",
     # Install the optional sqlite-vec extension in the test environment so
-    # CI actually exercises the real extension-load path (the worker-thread
-    # load and ANN search), not just the numpy fallback.  Mirrors the
-    # constraint declared in the ``vec`` extra above.
+    # CI actually exercises the real extension-load path (the worker-thread
+    # load and KNN search over the compact vec0 table), not just the numpy
+    # fallback.  At the pinned 0.1.x line this is faster brute-force KNN, not
+    # ANN.  Mirrors the constraint declared in the ``vec`` extra above.
     "sqlite-vec>=0.1.0,<0.2.0",
 ]