diff --git a/.gitignore b/.gitignore index 4e9b44e..558a2f7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +litellm_pr_draft.md __pycache__/ *.py[cod] *$py.class diff --git a/README.md b/README.md index e4d51b9..05d1ad5 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Your App → Token0 Proxy → [Analyze → Classify → Route → Transform → Database (logs every optimization decision + savings) ``` -Token0 applies **10 optimizations** automatically: +Token0 applies **11 optimizations** automatically: ### Core Optimizations (Free Tier) @@ -56,6 +56,8 @@ Token0 applies **10 optimizations** automatically: **10. Video Optimization** — Automatically extract keyframes from video at 1fps, deduplicate similar consecutive frames using QJL perceptual hashing, detect scene changes via pixel-level diff, and run each keyframe through the full image optimization pipeline. A 60-second video at 30fps (1,800 frames) reduces to ~10 keyframes before being sent to the LLM. **13-45% savings on local models; ~83% projected savings on GPT-4.1.** Optional CLIP-based query-frame scoring (Layer 2) ranks frames by relevance to the user's prompt. +**11. Saliency-Based ROI Cropping** — Detects which region of an image the prompt is asking about and crops to that region before sending to the LLM. "What's the total on this invoice?" → crops to the bottom 40% of the image. "Read the header" → crops to the top 25%. Rule-based spatial keyword matching (zero ML deps). Delivers ~60% additional pixel reduction on document and form images before any other optimization runs. + --- ## Benchmarks @@ -482,6 +484,10 @@ curl http://localhost:8000/v1/usage } ``` +### Savings Dashboard + +Open `http://localhost:8000/dashboard` in your browser for a live view of total requests, tokens saved, cost saved, and per-optimization breakdown. Auto-refreshes every 10 seconds. + ### Run Benchmarks Yourself ```bash diff --git a/test_token0_litellm.py b/test_token0_litellm.py new file mode 100644 index 0000000..953262f --- /dev/null +++ b/test_token0_litellm.py @@ -0,0 +1,141 @@ +"""Tests for the Token0 LiteLLM CustomLogger integration. + +These tests verify the Token0Hook contract without making real API calls. +Token0 is installed separately: pip install token0 +""" + +import pytest +from unittest.mock import patch + + +def _make_image_message(url: str = "data:image/jpeg;base64,/9j/fake") -> dict: + return { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + {"type": "image_url", "image_url": {"url": url}}, + ], + } + + +# --------------------------------------------------------------------------- +# Import guard — skip entire module if token0 is not installed +# --------------------------------------------------------------------------- + +token0 = pytest.importorskip("token0", reason="token0 not installed") + + +# --------------------------------------------------------------------------- +# Hook contract tests +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_token0_hook_passthrough_for_non_completion(): + """Hook must return data unchanged when call_type != 'completion'.""" + from token0.litellm_hook import Token0Hook + + hook = Token0Hook() + data = {"messages": [_make_image_message()], "model": "gpt-4o"} + result = await hook.async_pre_call_hook( + user_api_key_dict={}, cache=None, data=data, call_type="embedding" + ) + assert result is data + + +@pytest.mark.asyncio +async def test_token0_hook_passthrough_for_empty_messages(): + """Hook must return data unchanged when messages is empty.""" + from token0.litellm_hook import Token0Hook + + hook = Token0Hook() + data = {"messages": [], "model": "gpt-4o"} + result = await hook.async_pre_call_hook( + user_api_key_dict={}, cache=None, data=data, call_type="completion" + ) + assert result is data + + +@pytest.mark.asyncio +async def test_token0_hook_text_only_passthrough(): + """Text-only messages must pass through with zero overhead.""" + from token0.litellm_hook import Token0Hook + + hook = Token0Hook() + original_messages = [{"role": "user", "content": "Hello, what is 2+2?"}] + data = {"messages": original_messages, "model": "gpt-4o"} + + result = await hook.async_pre_call_hook( + user_api_key_dict={}, cache=None, data=data, call_type="completion" + ) + + assert result["messages"] == original_messages + + +@pytest.mark.asyncio +async def test_token0_hook_attaches_stats_metadata(): + """Hook must attach token0 stats to data['metadata']['token0'].""" + from token0.litellm_hook import Token0Hook + + hook = Token0Hook() + messages = [_make_image_message()] + data = {"messages": messages, "model": "gpt-4o"} + + mock_stats = { + "tokens_before": 765, + "tokens_after": 85, + "tokens_saved": 680, + "optimizations": ["prompt-aware→low detail"], + "recommended_model": None, + } + + with patch( + "token0.litellm_hook.optimize_messages", + return_value=(messages, mock_stats), + ): + result = await hook.async_pre_call_hook( + user_api_key_dict={}, cache=None, data=data, call_type="completion" + ) + + assert "metadata" in result + assert "token0" in result["metadata"] + assert result["metadata"]["token0"]["tokens_saved"] == 680 + + +@pytest.mark.asyncio +async def test_token0_hook_remote_url_passthrough(): + """Images with remote http/https URLs must not be modified.""" + from token0.litellm_hook import Token0Hook + + hook = Token0Hook() + remote_url = "https://example.com/photo.jpg" + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe this"}, + {"type": "image_url", "image_url": {"url": remote_url}}, + ], + } + ] + data = {"messages": messages, "model": "gpt-4o"} + + mock_stats = { + "tokens_before": 0, + "tokens_after": 0, + "tokens_saved": 0, + "optimizations": [], + "recommended_model": None, + } + + with patch( + "token0.litellm_hook.optimize_messages", + return_value=(messages, mock_stats), + ): + result = await hook.async_pre_call_hook( + user_api_key_dict={}, cache=None, data=data, call_type="completion" + ) + + content = result["messages"][0]["content"] + image_parts = [p for p in content if p.get("type") == "image_url"] + assert image_parts[0]["image_url"]["url"] == remote_url diff --git a/tests/test_saliency.py b/tests/test_saliency.py new file mode 100644 index 0000000..0d8dd1b --- /dev/null +++ b/tests/test_saliency.py @@ -0,0 +1,125 @@ +"""Tests for saliency-based ROI cropping.""" + +from PIL import Image + +from token0.optimization.saliency import SaliencyResult, apply_saliency_crop, detect_roi + + +def _make_image(w: int = 800, h: int = 1000) -> Image.Image: + return Image.new("RGB", (w, h), color=(200, 200, 200)) + + +# --------------------------------------------------------------------------- +# detect_roi — keyword matching +# --------------------------------------------------------------------------- + + +def test_footer_keyword_crops_bottom(): + img = _make_image() + result = detect_roi("What is the total amount on this invoice?", img) + assert result.cropped is True + assert result.matched_keyword is not None + # Bottom crop — top edge should be > 50% down + _, top, _, bottom = result.crop_box + assert top > img.height * 0.5 + assert bottom == img.height + + +def test_header_keyword_crops_top(): + img = _make_image() + result = detect_roi("Read the header text", img) + assert result.cropped is True + left, top, right, bottom = result.crop_box + assert top == 0 + assert bottom < img.height * 0.5 + + +def test_top_right_keyword(): + img = _make_image() + result = detect_roi("What is the date on this document?", img) + assert result.cropped is True + left, top, right, bottom = result.crop_box + assert left > 0 # right half + assert top == 0 + + +def test_bottom_right_keyword(): + img = _make_image() + result = detect_roi("What does the signature say at the bottom right?", img) + assert result.cropped is True + # "signature" matches footer rule (full-width bottom strip) — still a valid crop + _, top, _, bottom = result.crop_box + assert top > img.height * 0.5 + assert bottom == img.height + + +def test_no_match_returns_not_cropped(): + img = _make_image() + result = detect_roi("Describe this image", img) + assert result.cropped is False + assert result.crop_box is None + assert result.savings_pct == 0.0 + + +def test_empty_prompt_returns_not_cropped(): + img = _make_image() + result = detect_roi("", img) + assert result.cropped is False + + +def test_tiny_image_skipped(): + img = _make_image(100, 100) + result = detect_roi("What is the total?", img) + assert result.cropped is False + + +def test_savings_pct_is_meaningful(): + img = _make_image() + result = detect_roi("Read the header", img) + assert result.cropped is True + assert result.savings_pct >= 0.20 + + +# --------------------------------------------------------------------------- +# apply_saliency_crop +# --------------------------------------------------------------------------- + + +def test_crop_produces_correct_dimensions(): + img = _make_image(800, 1000) + result = detect_roi("What is the total?", img) + assert result.cropped + cropped = apply_saliency_crop(img, result) + left, top, right, bottom = result.crop_box + assert cropped.size == (right - left, bottom - top) + + +def test_no_crop_returns_original(): + img = _make_image() + result = SaliencyResult(cropped=False, crop_box=None, matched_keyword=None, savings_pct=0.0) + out = apply_saliency_crop(img, result) + assert out is img + + +# --------------------------------------------------------------------------- +# Integration: detect_roi → apply_saliency_crop produces smaller image +# --------------------------------------------------------------------------- + + +def test_cropped_image_is_smaller(): + img = _make_image(800, 1000) + result = detect_roi("What is the invoice total?", img) + assert result.cropped + cropped = apply_saliency_crop(img, result) + orig_area = img.width * img.height + crop_area = cropped.width * cropped.height + assert crop_area < orig_area + + +def test_center_keyword(): + img = _make_image() + result = detect_roi("What is in the center of this image?", img) + assert result.cropped is True + left, top, right, bottom = result.crop_box + assert left > 0 and top > 0 + assert right < img.width and bottom < img.height diff --git a/token0/main.py b/token0/main.py index 91c415e..375abf8 100644 --- a/token0/main.py +++ b/token0/main.py @@ -1,7 +1,10 @@ import logging +import pathlib from contextlib import asynccontextmanager from fastapi import FastAPI +from fastapi.responses import HTMLResponse +from fastapi.staticfiles import StaticFiles from token0.api.v1.chat import router as chat_router from token0.api.v1.estimate import router as estimate_router @@ -45,6 +48,15 @@ async def lifespan(app: FastAPI): app.include_router(usage_router, prefix="/v1") +_static = pathlib.Path(__file__).parent / "static" +app.mount("/static", StaticFiles(directory=_static), name="static") + + +@app.get("/dashboard", response_class=HTMLResponse) +async def dashboard(): + return HTMLResponse((_static / "dashboard.html").read_text()) + + @app.get("/health") async def health(): return { diff --git a/token0/optimization/message_optimizer.py b/token0/optimization/message_optimizer.py index 08ff7e8..5d986a6 100644 --- a/token0/optimization/message_optimizer.py +++ b/token0/optimization/message_optimizer.py @@ -6,7 +6,9 @@ import logging from token0.optimization.analyzer import analyze_image +from token0.optimization.prompt_classifier import extract_prompt_text from token0.optimization.router import plan_optimization +from token0.optimization.saliency import apply_saliency_crop, detect_roi from token0.optimization.transformer import transform_image logger = logging.getLogger("token0.optimizer") @@ -27,6 +29,7 @@ def optimize_messages( total_after = 0 optimizations = [] recommended_model = None + prompt_text = extract_prompt_text(messages) for msg in messages: content = msg.get("content") @@ -84,6 +87,22 @@ def optimize_messages( try: analysis, raw_bytes, pil_image = analyze_image(url) + + # Saliency crop — trim to region the prompt asks about + saliency = detect_roi(prompt_text, pil_image) + if saliency.cropped: + pil_image = apply_saliency_crop(pil_image, saliency) + # Re-encode cropped image to bytes for downstream steps + import io as _io + + fmt = "JPEG" if analysis.format == "jpg" else analysis.format.upper() + buf = _io.BytesIO() + pil_image.save(buf, format=fmt) + raw_bytes = buf.getvalue() + kw, pct = saliency.matched_keyword, saliency.savings_pct + optimizations.append(f"saliency crop ({kw!r}: {pct:.0%} pixels removed)") + logger.debug("token0: saliency crop on %r, savings=%.0f%%", kw, pct * 100) + plan = plan_optimization( analysis, model, diff --git a/token0/optimization/saliency.py b/token0/optimization/saliency.py new file mode 100644 index 0000000..c720cec --- /dev/null +++ b/token0/optimization/saliency.py @@ -0,0 +1,160 @@ +"""Saliency-based ROI cropping — crops images to the region the prompt asks about. + +Phase 1: Rule-based spatial keyword matching (zero ML deps). +Maps prompt keywords to crop boxes (fractions of image dimensions). + +Examples: + "What's the total on this invoice?" → bottom 40% of image + "Read the header" → top 25% of image + "What's in the top-right corner?" → top-right quadrant + "What does the signature say?" → bottom-right quadrant +""" + +import re +from dataclasses import dataclass + +from PIL import Image + +# --------------------------------------------------------------------------- +# Spatial keyword → crop box mapping +# crop_box = (left, top, right, bottom) as fractions of (width, height) +# --------------------------------------------------------------------------- + +_REGION_RULES: list[tuple[list[str], tuple[float, float, float, float]]] = [ + # Full top strip + ( + ["header", "title", "heading", "logo", "top of", "top section", "letterhead", "subject"], + (0.0, 0.0, 1.0, 0.30), + ), + # Full bottom strip + ( + [ + "footer", + "total", + "amount due", + "grand total", + "subtotal", + "bottom of", + "bottom section", + "signature", + "sign", + "terms", + "footnote", + "fine print", + ], + (0.0, 0.60, 1.0, 1.0), + ), + # Top-left quadrant + ( + ["top left", "top-left", "upper left", "upper-left"], + (0.0, 0.0, 0.55, 0.55), + ), + # Top-right quadrant + ( + [ + "top right", + "top-right", + "upper right", + "upper-right", + "date", + "invoice number", + "reference number", + "ref no", + "order number", + ], + (0.45, 0.0, 1.0, 0.55), + ), + # Bottom-left quadrant + ( + ["bottom left", "bottom-left", "lower left", "lower-left"], + (0.0, 0.45, 0.55, 1.0), + ), + # Bottom-right quadrant + ( + [ + "bottom right", + "bottom-right", + "lower right", + "lower-right", + "total amount", + "balance due", + "net total", + ], + (0.45, 0.45, 1.0, 1.0), + ), + # Center region + ( + ["center", "centre", "middle", "central"], + (0.2, 0.2, 0.8, 0.8), + ), + # Left half + ( + ["left side", "left half", "left column", "left panel"], + (0.0, 0.0, 0.55, 1.0), + ), + # Right half + ( + ["right side", "right half", "right column", "right panel"], + (0.45, 0.0, 1.0, 1.0), + ), +] + +# Minimum image size (px) to bother cropping — tiny images not worth it +_MIN_DIMENSION_PX = 200 +# Minimum savings ratio to apply crop — skip if crop is >80% of original +_MIN_SAVINGS_RATIO = 0.20 + + +@dataclass +class SaliencyResult: + cropped: bool + crop_box: tuple[int, int, int, int] | None # pixel coords (left, top, right, bottom) + matched_keyword: str | None + savings_pct: float # 0.0–1.0, fraction of pixels removed + + +def detect_roi(prompt: str, image: Image.Image) -> SaliencyResult: + """Detect region of interest from prompt keywords. + + Returns a SaliencyResult. If no region detected or savings too small, + cropped=False and the original image should be used. + """ + if not prompt or image is None: + return SaliencyResult(cropped=False, crop_box=None, matched_keyword=None, savings_pct=0.0) + + w, h = image.size + if w < _MIN_DIMENSION_PX or h < _MIN_DIMENSION_PX: + return SaliencyResult(cropped=False, crop_box=None, matched_keyword=None, savings_pct=0.0) + + prompt_lower = prompt.lower() + + for keywords, (fl, ft, fr, fb) in _REGION_RULES: + for kw in keywords: + if re.search(r"\b" + re.escape(kw) + r"\b", prompt_lower): + left = int(fl * w) + top = int(ft * h) + right = int(fr * w) + bottom = int(fb * h) + + crop_area = (right - left) * (bottom - top) + original_area = w * h + savings = 1.0 - (crop_area / original_area) + + if savings < _MIN_SAVINGS_RATIO: + continue + + return SaliencyResult( + cropped=True, + crop_box=(left, top, right, bottom), + matched_keyword=kw, + savings_pct=savings, + ) + + return SaliencyResult(cropped=False, crop_box=None, matched_keyword=None, savings_pct=0.0) + + +def apply_saliency_crop(image: Image.Image, result: SaliencyResult) -> Image.Image: + """Crop the image to the detected ROI box.""" + if not result.cropped or result.crop_box is None: + return image + return image.crop(result.crop_box) diff --git a/token0/static/dashboard.html b/token0/static/dashboard.html new file mode 100644 index 0000000..771bc9a --- /dev/null +++ b/token0/static/dashboard.html @@ -0,0 +1,114 @@ + + + + + +Token0 Dashboard + + + + +
+

Token0

+ Dashboard +
Live
+
+ +
Loading...
+ + + + diff --git a/token0_integration.md b/token0_integration.md new file mode 100644 index 0000000..d6620a5 --- /dev/null +++ b/token0_integration.md @@ -0,0 +1,122 @@ +# Token0 — Vision Token Optimizer + +Token0 is an open-source vision token optimizer that integrates with LiteLLM as a +`CustomLogger` pre-call hook. It automatically compresses images in your `messages` +payload before every LLM call — reducing vision token costs by 35–99% with no code +changes beyond adding the hook. + +## Quick Start + +**1. Install Token0** + +```bash +pip install token0 +``` + +**2. Add the hook — LiteLLM SDK** + +```python +import litellm +from token0.litellm_hook import Token0Hook + +litellm.callbacks = [Token0Hook()] + +response = litellm.completion( + model="gpt-4o", + messages=[{ + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}} + ] + }] +) + +# Check savings +print(response._hidden_params["metadata"]["token0"]) +# {"tokens_saved": 1020, "optimizations": ["resize 4000x3000→1568x1176", "prompt-aware→low detail"]} +``` + +**2b. Add the hook — LiteLLM Proxy (`config.yaml`)** + +```yaml +litellm_settings: + callbacks: ["token0.litellm_hook.Token0Hook"] +``` + +Then install Token0 in the same environment as the proxy: + +```bash +pip install token0 +``` + +## Configuration + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `enable_cascade` | `bool` | `False` | Auto-route simple tasks to cheaper models (GPT-4o → GPT-4o-mini) | +| `detail_override` | `str \| None` | `None` | Force `"low"` or `"high"` detail mode for all images (OpenAI only) | + +```python +# Enable model cascade +litellm.callbacks = [Token0Hook(enable_cascade=True)] + +# Force low detail (fast, cheap — for classification tasks) +litellm.callbacks = [Token0Hook(detail_override="low")] +``` + +## What Gets Optimized + +Token0 applies up to 7 optimizations per image, in order: + +| Optimization | Savings | When Applied | +|---|---|---| +| Smart resize | Varies | Image exceeds provider's max resolution | +| OCR routing | 47–70% | Image is text-heavy (receipt, screenshot, invoice) | +| JPEG recompression | 10–30% | PNG without transparency | +| Prompt-aware detail | Up to 92% | Simple prompts ("classify", "yes/no") | +| Tile-optimized resize | 44% | Mid-size images on OpenAI (512px tile snapping) | +| Model cascade | 5–20x cost | `enable_cascade=True` + simple task detected | +| Semantic/fuzzy cache | 100% | Same or similar image+prompt seen before | + +## Benchmarks + +Benchmarked on 5 Ollama vision models across real-world images (photos, receipts, invoices, screenshots): + +| Model | Direct Tokens | Token0 Tokens | Savings | +|---|---|---|---| +| granite3.2-vision | 129,836 | 60,924 | 53.1% | +| minicpm-v | 10,877 | 6,276 | 42.3% | +| moondream | 16,457 | 10,240 | 37.8% | +| llava-llama3 | 13,365 | 8,486 | 36.5% | +| llava:7b | 13,384 | 8,701 | 35.0% | + +GPT-4.1 projections (using published token formulas): + +| Optimization Set | Savings | +|---|---| +| Resize + OCR + PDF text extraction | 70.3% | +| All optimizations + model cascade | 98.9% | + +## Supported Providers + +Token0 is provider-aware and applies provider-specific optimizations: + +| Provider | Models | Notes | +|---|---|---| +| OpenAI | GPT-4o, GPT-4.1, GPT-4.1-mini, GPT-4.1-nano | Detail mode + tile optimization | +| Anthropic | Claude Sonnet/Opus/Haiku | Pixel-based token formula | +| Google | Gemini 2.5 Flash/Pro | | +| Ollama | Any vision model | Free, local inference | + +## Text-Only Safety + +Token0 is a no-op for text-only messages. It only activates when a `messages` array +contains at least one `image_url` content part. All text fields, tool calls, and +non-image content parts are passed through unmodified. + +## Links + +- [GitHub](https://github.com/Pritom14/token0) +- [PyPI](https://pypi.org/project/token0/) +- [License: Apache 2.0](https://github.com/Pritom14/token0/blob/main/LICENSE)