diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 80ae685..b77c2ff 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,6 +17,7 @@ jobs: - uses: actions/setup-python@v5 with: python-version: "3.12" + cache: "pip" - name: Install dependencies run: pip install -e ".[dev]" - name: Ruff check @@ -31,12 +32,44 @@ jobs: - uses: actions/setup-python@v5 with: python-version: "3.12" + cache: "pip" - name: Install dependencies run: pip install -e ".[dev]" - name: Mypy run: mypy --strict src/ + # Warm the HuggingFace model cache exactly once before the test matrix. + # The benchmark tests load a real sentence-transformers model + # (all-MiniLM-L6-v2) on purpose. If each matrix leg populated the cache + # itself, a cold cache would make all three legs miss and download the model + # concurrently, tripping HF rate limiting (HTTP 429); and if every leg failed, + # the cache would never be saved. This single upstream job does at most one + # download per cold workflow run and saves the shared cache; the matrix (which + # `needs` it) then restores a warm cache and runs fully offline. + warm-hf-cache: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: "pip" + - name: Restore/Save HuggingFace cache + id: hf-cache + uses: actions/cache@v4 + with: + path: ~/.cache/huggingface + key: hf-${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2-v1 + - name: Download model on cache miss + if: steps.hf-cache.outputs.cache-hit != 'true' + # Install only the minimal stack needed to fetch and load the model. This is + # the single, serialized download for the whole workflow on a cold cache. 
+ run: | + pip install "sentence-transformers>=3.0.0" + python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')" + test: + needs: warm-hf-cache runs-on: ubuntu-latest strategy: matrix: @@ -46,7 +79,24 @@ jobs: - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + cache: "pip" + # Restore the cache warmed by `warm-hf-cache`. Once warmed, this is a hit + # on every matrix leg, so the tests run offline and make no Hub request. + - name: Restore HuggingFace models + id: hf-cache + uses: actions/cache@v4 + with: + path: ~/.cache/huggingface + key: hf-${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2-v1 - name: Install dependencies run: pip install -e ".[dev]" - name: Run tests + env: + # Force offline mode only on a cache hit (the warm-hf-cache job populates + # the cache first, so legs are expected to hit). On a miss, the defensive + # '0' lets a leg self-heal by downloading instead of hard-failing, although + # the warm job makes a miss here unexpected. + HF_HUB_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }} + TRANSFORMERS_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }} + HF_HUB_DISABLE_TELEMETRY: "1" run: pytest --cov --cov-fail-under=90 -q diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4d5aabf..05afd77 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -109,7 +109,21 @@ jobs: - uses: actions/setup-python@v5 with: python-version: "3.12" + cache: "pip" + # The pre-publish smoke gate loads the real all-MiniLM-L6-v2 model; cache + # the Hub store (and go offline on a cache hit) so a release with a warm + # cache is not blocked by transient HF 429 rate limiting. 
+ - name: Cache HuggingFace models + id: hf-cache + uses: actions/cache@v4 + with: + path: ~/.cache/huggingface + key: hf-${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2-v1 - name: Pre-publish smoke gate + env: + HF_HUB_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }} + TRANSFORMERS_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }} + HF_HUB_DISABLE_TELEMETRY: "1" run: | pip install -e ".[embeddings-local]" python scripts/check_smoke_gate.py diff --git a/.github/workflows/smoke-gate.yml b/.github/workflows/smoke-gate.yml index f7ce4c7..82df8d3 100644 --- a/.github/workflows/smoke-gate.yml +++ b/.github/workflows/smoke-gate.yml @@ -24,6 +24,18 @@ jobs: - uses: actions/setup-python@v5 with: python-version: "3.12" + cache: "pip" + + # The smoke gate runs the synthetic benchmark, which loads the real + # all-MiniLM-L6-v2 model. Cache the Hub store so the model is not + # re-downloaded on each run (avoids HF 429 rate limiting); go offline on a + # cache hit so no Hub request is made. + - name: Cache HuggingFace models + id: hf-cache + uses: actions/cache@v4 + with: + path: ~/.cache/huggingface + key: hf-${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2-v1 - name: Install package with embeddings extra run: pip install -e ".[embeddings-local]" @@ -31,4 +43,8 @@ jobs: - name: Run pre-publish smoke gate # Same committed floors as the publish path # (scripts/check_smoke_gate.py). A breach fails the run. 
+ env: + HF_HUB_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }} + TRANSFORMERS_OFFLINE: ${{ steps.hf-cache.outputs.cache-hit == 'true' && '1' || '0' }} + HF_HUB_DISABLE_TELEMETRY: "1" run: python scripts/check_smoke_gate.py diff --git a/.github/workflows/upgrade-test.yml b/.github/workflows/upgrade-test.yml index b0f474b..4c2c3ee 100644 --- a/.github/workflows/upgrade-test.yml +++ b/.github/workflows/upgrade-test.yml @@ -23,6 +23,7 @@ jobs: - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + cache: "pip" - name: Check the baseline version is published id: baseline diff --git a/pyproject.toml b/pyproject.toml index 539e058..788e470 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,13 +65,16 @@ dev = [ # version range as the ``embeddings-local`` extra so both # extras resolve together (``pip install -e .[dev,embeddings-local]`` # used to fail with a resolver conflict before this alignment). - # ``sentence-transformers`` pulls torch transitively (~500 MB) — - # the CI cache absorbs that cost after the first cold install. + # ``sentence-transformers`` pulls torch transitively (~500 MB). CI caches + # both the pip download (``cache: pip``) and the HuggingFace model store + # (``actions/cache`` on ``~/.cache/huggingface``), so this cost is paid only + # on a cold cache and the model-loading tests do not re-hit the Hub (avoids HF 429). "sentence-transformers>=3.0.0", # Install the optional sqlite-vec extension in the test environment so - # CI actually exercises the real extension-load path (the worker-thread - # load and ANN search), not just the numpy fallback. Mirrors the - # constraint declared in the ``vec`` extra above. + # CI actually exercises the real extension-load path: the worker-thread + # load and KNN search over the compact vec0 table (faster brute-force KNN, + # not ANN, on the pinned 0.1.x line), not just the numpy fallback. Mirrors + # the constraint declared in the ``vec`` extra above. 
+ "sqlite-vec>=0.1.0,<0.2.0", ]