diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..3113e63 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,54 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +# Cancel in-flight runs on the same ref when a new commit lands. +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + test: + name: test (Python ${{ matrix.python-version }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + # Match the supported range declared in pyproject.toml + # (``requires-python = ">=3.11,<4.0"`` + classifier list). + python-version: ["3.11", "3.12"] + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Poetry + run: pipx install poetry + + - name: Cache Poetry virtualenv + uses: actions/cache@v4 + with: + path: ~/.cache/pypoetry/virtualenvs + key: poetry-${{ matrix.python-version }}-${{ hashFiles('poetry.lock') }} + + - name: Install dependencies + run: poetry install --with dev --no-interaction --no-ansi + + - name: Lint (ruff) + run: poetry run ruff check llm_gateway/ tests/ + + - name: Format check (black) + run: poetry run black --check llm_gateway/ tests/ + + - name: Type check (pyright) + run: poetry run pyright llm_gateway/ + + - name: Tests (pytest) + run: poetry run pytest tests/unit/ -v diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..3e1ff9f --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,45 @@ +# Pre-commit hooks — the same gates CI runs, but local and fast. +# +# Install once per checkout: +# pre-commit install +# +# Run all hooks ad hoc: +# pre-commit run --all-files +# +# Versions pinned to exact SHA / tag so contributors get reproducible +# behaviour; bump deliberately rather than tracking ``main``. 
+ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-toml + - id: check-added-large-files + args: ["--maxkb=512"] + - id: check-merge-conflict + - id: mixed-line-ending + args: ["--fix=lf"] + + - repo: https://github.com/astral-sh/ruff-pre-commit + # Match the ruff version pinned in pyproject.toml's dev-deps so + # local + CI behaviour does not drift between operators. + rev: v0.7.0 + hooks: + - id: ruff + args: ["--fix"] + + - repo: https://github.com/psf/black-pre-commit-mirror + rev: 26.3.1 + hooks: + - id: black + + - repo: https://github.com/Yelp/detect-secrets + rev: v1.5.0 + hooks: + - id: detect-secrets + args: ["--baseline", ".secrets.baseline"] + # Do not pass the baseline file itself to the hook for scanning. + exclude: ^\.secrets\.baseline$ diff --git a/NOTICE b/NOTICE new file mode 100644 index 0000000..1fc6711 --- /dev/null +++ b/NOTICE @@ -0,0 +1,16 @@ +llm-gateway +Copyright 2026 CoreNovus contributors + +Licensed under the MIT License — see LICENSE for the full text. + +This product is a thin FastAPI proxy in front of vLLM, an open-source +LLM serving engine (https://github.com/vllm-project/vllm) developed +by the vLLM project and licensed under the Apache License, Version 2.0: + + https://www.apache.org/licenses/LICENSE-2.0 + +This product does not redistribute the vLLM engine itself; vLLM is +expected to be supplied by the operator (vendor Docker container) at +deploy time. References to vLLM in code, documentation, and examples +are nominative and do not imply endorsement by or affiliation with +the vLLM project. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..ba28c9c --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,80 @@ +# Security policy + +## Reporting a vulnerability + +Please report suspected security vulnerabilities **privately** via +[GitHub Security Advisories][advisories]. 
+ +[advisories]: https://github.com/CoreNovus/llm-gateway/security/advisories/new + +Do **not** open a public GitHub issue for security reports. + +We aim to acknowledge within 5 business days. Please include: + +- A clear description of the vulnerability and its impact +- Reproduction steps or a minimal proof-of-concept +- Affected versions / commits / deployment shape (loopback vs. exposed) + +If a public exploit is already in circulation, say so in the report — +we accelerate the disclosure window in that case. + +## Disclosure policy + +We follow a 90-day coordinated disclosure window: + +1. Day 0 — report received, triage begins +2. Day 5 — acknowledgement + initial severity assessment shared with reporter +3. Day 90 — fix released, advisory published, reporter credited (if desired) + +If the fix lands sooner, the advisory ships with the release. If +active exploitation is observed, the window is shortened by mutual +agreement with the reporter. + +## Scope + +**In scope:** + +- Source code under `llm_gateway/` +- Container / systemd artefacts under `deploy/` +- Pinned dependencies in `pyproject.toml` / `poetry.lock` + +**Out of scope** (please report to upstream): + +- vLLM (`https://github.com/vllm-project/vllm`) — engine internals +- FastAPI / Starlette / httpx / pydantic / uvicorn — framework dependencies +- Operator-controlled configuration. Empty `BEARER_TOKEN` disables + auth; this is documented behaviour for local dev and is gated by + `__main__._require_bearer_token` on the production entry path. + +## Known limitations (documented, not vulnerabilities) + +The following are accepted trade-offs rather than bugs. Filing them +as vulnerabilities is welcome but the response will point back here: + +- The `vllm_upstream_url` SSRF blocklist only checks IPv4 literals + against a known-metadata-IP set. 
Hostnames that resolve to those + IPs (DNS rebinding) are not caught at config-load time; the httpx + transport must enforce this on its own if the threat model + requires it. +- Chunked transfer-encoding requests are refused with 411 Length + Required rather than length-counted at the ASGI layer. Honest JSON + clients (httpx, openai-python, langchain-openai, curl) always set + `Content-Length` so the practical impact is zero. Operators who + must accept chunked uploads need a proxy in front that materialises + `Content-Length`, or to extend `BodySizeLimitMiddleware` to a + streaming ASGI implementation. +- The gateway binds to `127.0.0.1` by default and assumes an SSH + tunnel as the network-level boundary. Lifting the bind to + `0.0.0.0` without re-reading the threat model is an operator + misconfiguration, not a gateway vulnerability. + +## Hardening recommendations for operators + +Beyond the in-process defences this package ships with, deploy-time +hardening lives in `deploy/`: + +- Container hardening: `cap_drop: ALL`, `no-new-privileges`, + `read_only` rootfs, non-root UID 1001. +- Systemd unit: same posture for non-container deploys. +- Operator-side `--limit-max-requests` / kernel-level limits cover + HTTP-protocol-level abuse below this middleware's reach. diff --git a/llm_gateway/api/chat_completions.py b/llm_gateway/api/chat_completions.py index d95c757..2902d66 100644 --- a/llm_gateway/api/chat_completions.py +++ b/llm_gateway/api/chat_completions.py @@ -23,7 +23,7 @@ # deferred-string evaluation). Same constraint as ``api/health.py``. 
from collections.abc import Callable -from typing import Annotated +from typing import Annotated, Any from fastapi import APIRouter, Depends, status from fastapi.responses import JSONResponse, Response, StreamingResponse @@ -134,10 +134,17 @@ def _record_token_usage(metrics: Metrics, model: str, response: dict) -> None: metrics.tokens_completion_total.labels(model=model).inc(completion) -def _safe_token_count(value: object) -> int: +def _safe_token_count(value: Any) -> int: """Coerce an upstream token-count field to a non-negative ``int``. Returns 0 for ``None`` / missing / non-numeric / negative values. + + Parameter is typed ``Any`` (not ``object``) because the value comes + straight from a parsed-JSON upstream payload — pyright rejects + ``int(object | Literal[0])`` since plain ``object`` does not + conform to ``ConvertibleToInt`` (str | Buffer | SupportsInt | + SupportsIndex). The ``try/except`` already covers every runtime + shape, so the static type can be honest about that. """ try: count = int(value or 0) diff --git a/tests/unit/test_rate_limit.py b/tests/unit/test_rate_limit.py index 0121ff6..d4d2c71 100644 --- a/tests/unit/test_rate_limit.py +++ b/tests/unit/test_rate_limit.py @@ -179,7 +179,9 @@ def test_middleware_passes_request_when_limiter_allows() -> None: assert response.status_code == 200 # The bearer is hashed before becoming the bucket key — the # plaintext token must not appear in the bucket dict. - expected_digest = hashlib.sha256(b"t1").hexdigest()[:16] + # 32 hex chars (128 bits) — bumped from 16 to close a token-grinding + # collision attack. See PR #6 / commit f40049e. + expected_digest = hashlib.sha256(b"t1").hexdigest()[:32] assert limiter.calls == [f"token:{expected_digest}"]