From 969e2162202f0b826039776385283c88e46e8f38 Mon Sep 17 00:00:00 2001 From: Pritom Mazumdar Date: Fri, 27 Mar 2026 17:45:47 +0530 Subject: [PATCH 1/3] added support for litellm --- README.md | 36 ++++++- tests/test_litellm_hook.py | 202 +++++++++++++++++++++++++++++++++++++ token0/litellm_hook.py | 177 ++++++++++++++++++++++++++++++++ 3 files changed, 414 insertions(+), 1 deletion(-) create mode 100644 tests/test_litellm_hook.py create mode 100644 token0/litellm_hook.py diff --git a/README.md b/README.md index 92c76a5..ff20a16 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,7 @@ Using OpenAI's published token formulas on real images: ### Additional Test Coverage -Token0 includes **93 unit tests** and benchmarks across multiple suites: +Token0 includes **103 unit tests** and benchmarks across multiple suites: | Suite | Tests | What It Validates | |---|---|---| @@ -165,6 +165,7 @@ Token0 includes **93 unit tests** and benchmarks across multiple suites: | `tasks` | 4 | Task types: classification, description, extraction, Q&A | | `real` | 5 | Real-world photos, receipts, invoices, screenshots | | `streaming` | 7 | SSE streaming: format, content, stats, image optimization | +| `litellm` | 10 | LiteLLM hook: passthrough, optimization, OCR, cascade, async | --- @@ -257,6 +258,39 @@ for chunk in stream: # Final chunk includes token0 optimization stats ``` +### Use With LiteLLM + +Already using [LiteLLM](https://github.com/BerriAI/litellm)? Add Token0 as a callback hook — no proxy needed: + +```python +import litellm +from token0.litellm_hook import Token0Hook + +litellm.callbacks = [Token0Hook()] + +# All your existing litellm calls now get image optimization for free +response = litellm.completion( + model="gpt-4o", + messages=[{ + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}} + ] + }] +) + +# Stats available in response metadata +# response._hidden_params["metadata"]["token0"]["tokens_saved"] +``` + +Or in LiteLLM proxy `config.yaml`: + +```yaml +litellm_settings: + callbacks: ["token0.litellm_hook.Token0Hook"] +``` + ### Use With Ollama (free, fully local) ```bash diff --git a/tests/test_litellm_hook.py b/tests/test_litellm_hook.py new file mode 100644 index 0000000..9a85860 --- /dev/null +++ b/tests/test_litellm_hook.py @@ -0,0 +1,202 @@ +"""Tests for LiteLLM integration hook.""" + +import pytest + +from tests.conftest import make_image, make_text_image +from token0.litellm_hook import Token0Hook, _optimize_messages + + +class TestOptimizeMessages: + def test_text_only_passthrough(self): + """Text-only messages pass through unchanged.""" + messages = [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": "Hello"}, + ] + optimized, stats = _optimize_messages(messages, "gpt-4o") + + assert optimized == messages + assert stats["tokens_saved"] == 0 + assert stats["optimizations"] == [] + + def test_large_image_gets_optimized(self): + """Large images trigger resize optimization.""" + _, data_uri = make_image(4000, 3000, "blue", "JPEG") + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe this"}, + {"type": "image_url", "image_url": {"url": data_uri}}, + ], + } + ] + optimized, stats = _optimize_messages(messages, "gpt-4o") + + assert len(stats["optimizations"]) > 0 + # Image should still be present (resized, not OCR'd) + parts = optimized[0]["content"] + assert any(p.get("type") == "image_url" for p in parts) + + def test_text_heavy_image_ocr_routed(self): + """Text-heavy images get OCR routed.""" + _, data_uri = make_text_image(800, 600, lines=25, fmt="PNG") + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Read this document"}, + {"type": "image_url", "image_url": {"url": data_uri}}, + ], + } + ] + optimized, stats = _optimize_messages(messages, "gpt-4o") + + assert stats["tokens_saved"] > 0 + # OCR route replaces image with text + parts = optimized[0]["content"] + text_parts = [p for p in parts if p.get("type") == "text"] + assert len(text_parts) == 2 # original text + extracted text + + def test_non_data_uri_passthrough(self): + """URLs (not base64) pass through unchanged.""" + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What is this?"}, + { + "type": "image_url", + "image_url": {"url": "https://example.com/image.jpg"}, + }, + ], + } + ] + optimized, stats = _optimize_messages(messages, "gpt-4o") + + assert optimized == messages + assert stats["tokens_saved"] == 0 + + def test_multiple_images(self): + """Multiple images in one message are each optimized.""" + _, uri1 = make_image(4000, 3000, "blue", "JPEG") + _, uri2 = make_image(3000, 2000, "red", "JPEG") + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Compare these"}, + {"type": "image_url", "image_url": {"url": uri1}}, + {"type": "image_url", "image_url": {"url": uri2}}, + ], + } + ] + optimized, stats = _optimize_messages(messages, "gpt-4o") + + parts = optimized[0]["content"] + image_parts = [p for p in parts if p.get("type") == "image_url"] + assert len(image_parts) == 2 + + def test_cascade_recommends_cheaper_model(self): + """Model cascade suggests cheaper alternative.""" + _, data_uri = make_image(800, 600, "red", "JPEG") + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What is this?"}, + {"type": "image_url", "image_url": {"url": data_uri}}, + ], + } + ] + # enable_cascade with a known model + _, stats = _optimize_messages(messages, "gpt-4o", enable_cascade=True) + # Cascade may or may not trigger depending on prompt classification + # Just verify the field exists + assert "recommended_model" in stats + + def test_stats_structure(self): + """Stats dict has all expected keys.""" + _, data_uri = make_image(800, 600, "red", "JPEG") + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Hello"}, + {"type": "image_url", "image_url": {"url": data_uri}}, + ], + } + ] + _, stats = _optimize_messages(messages, "gpt-4o") + + assert "tokens_before" in stats + assert "tokens_after" in stats + assert "tokens_saved" in stats + assert "optimizations" in stats + assert "recommended_model" in stats + assert stats["tokens_saved"] == stats["tokens_before"] - stats["tokens_after"] + + +class TestToken0HookIntegration: + @pytest.mark.asyncio + async def test_hook_modifies_data(self): + """Hook modifies data dict with optimized messages.""" + _, data_uri = make_image(4000, 3000, "blue", "JPEG") + hook = Token0Hook() + data = { + "model": "gpt-4o", + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe this"}, + {"type": "image_url", "image_url": {"url": data_uri}}, + ], + } + ], + } + + result = await hook.async_pre_call_hook( + user_api_key_dict={}, + cache=None, + data=data, + call_type="completion", + ) + + assert "metadata" in result + assert "token0" in result["metadata"] + assert result["metadata"]["token0"]["tokens_saved"] >= 0 + + @pytest.mark.asyncio + async def test_hook_skips_non_completion(self): + """Hook ignores non-completion call types.""" + hook = Token0Hook() + data = {"model": "dall-e-3", "prompt": "A cat"} + + result = await hook.async_pre_call_hook( + user_api_key_dict={}, + cache=None, + data=data, + call_type="image_generation", + ) + + assert result == data + assert "metadata" not in result + + @pytest.mark.asyncio + async def test_hook_text_only_no_metadata_overhead(self): + """Text-only requests get minimal metadata.""" + hook = Token0Hook() + data = { + "model": "gpt-4o", + "messages": [{"role": "user", "content": "Hello"}], + } + + result = await hook.async_pre_call_hook( + user_api_key_dict={}, + cache=None, + data=data, + call_type="completion", + ) + + assert result["metadata"]["token0"]["tokens_saved"] == 0 diff --git a/token0/litellm_hook.py b/token0/litellm_hook.py new file mode 100644 index 0000000..5560f13 --- /dev/null +++ b/token0/litellm_hook.py @@ -0,0 +1,177 @@ +"""LiteLLM integration — Token0 as a pre-call hook. + +Usage in litellm proxy_config.yaml: + + litellm_settings: + callbacks: ["token0.litellm_hook.Token0Hook"] + +Or programmatically: + + import litellm + from token0.litellm_hook import Token0Hook + litellm.callbacks = [Token0Hook()] +""" + +import logging + +from litellm.integrations.custom_logger import CustomLogger + +from token0.optimization.analyzer import analyze_image +from token0.optimization.router import plan_optimization +from token0.optimization.transformer import transform_image + +logger = logging.getLogger("token0.litellm") + + +class Token0Hook(CustomLogger): + """LiteLLM hook that optimizes vision tokens before LLM calls.""" + + def __init__( + self, + enable_cascade: bool = False, + detail_override: str | None = None, + ): + self.enable_cascade = enable_cascade + self.detail_override = detail_override + + async def async_pre_call_hook( + self, + user_api_key_dict, + cache, + data: dict, + call_type: str, + ) -> dict: + """Optimize images in messages before the LLM call.""" + if call_type != "completion": + return data + + messages = data.get("messages") + if not messages: + return data + + model = data.get("model", "") + optimized_messages, stats = _optimize_messages( + messages, + model, + detail_override=self.detail_override, + enable_cascade=self.enable_cascade, + ) + + data["messages"] = optimized_messages + + if stats["tokens_saved"] > 0: + logger.info( + "token0: %d tokens saved (%s)", + stats["tokens_saved"], + ", ".join(stats["optimizations"]), + ) + + # Attach stats for downstream logging/callbacks + data.setdefault("metadata", {}) + data["metadata"]["token0"] = stats + + # Apply model cascade if recommended + if stats.get("recommended_model"): + data["model"] = stats["recommended_model"] + logger.info("token0: cascade %s → %s", model, stats["recommended_model"]) + + return data + + +def _optimize_messages( + messages: list[dict], + model: str, + detail_override: str | None = None, + enable_cascade: bool = False, +) -> tuple[list[dict], dict]: + """Optimize images in a list of message dicts. + + Returns (optimized_messages, stats_dict). + """ + optimized = [] + total_before = 0 + total_after = 0 + optimizations = [] + recommended_model = None + + for msg in messages: + content = msg.get("content") + + # Text-only message — pass through + if isinstance(content, str) or content is None: + optimized.append(msg) + continue + + # Multi-part content — check for images + if not isinstance(content, list): + optimized.append(msg) + continue + + opt_parts = [] + for part in content: + if part.get("type") != "image_url": + opt_parts.append(part) + continue + + image_url = part.get("image_url", {}) + url = image_url.get("url", "") + + # Only optimize base64 data URIs + if not url.startswith("data:"): + opt_parts.append(part) + continue + + try: + analysis, raw_bytes, pil_image = analyze_image(url) + plan = plan_optimization( + analysis, + model, + detail_override=detail_override, + enable_cascade=enable_cascade, + ) + + total_before += plan.estimated_tokens_before + total_after += plan.estimated_tokens_after + optimizations.extend(plan.reasons) + + if plan.recommended_model and recommended_model is None: + recommended_model = plan.recommended_model + + if plan.use_ocr_route: + result = transform_image(plan, analysis, raw_bytes, pil_image) + opt_parts.append( + { + "type": "text", + "text": f"[Extracted text from image]:\n{result['content']}", + } + ) + elif any([plan.resize, plan.recompress_jpeg, plan.force_detail_low]): + result = transform_image(plan, analysis, raw_bytes, pil_image) + detail = "low" if plan.force_detail_low else image_url.get("detail", "auto") + opt_parts.append( + { + "type": "image_url", + "image_url": { + "url": f"data:{result['media_type']};base64,{result['base64']}", + "detail": detail, + }, + } + ) + else: + opt_parts.append(part) + + except Exception: + logger.warning("token0: failed to optimize image, passing through", exc_info=True) + opt_parts.append(part) + + optimized.append({"role": msg["role"], "content": opt_parts}) + + stats = { + "tokens_before": total_before, + "tokens_after": total_after, + "tokens_saved": total_before - total_after, + "optimizations": optimizations, + "recommended_model": recommended_model, + } + + return optimized, stats From 577186b2e3eb0604e7e0efbd1c96958d2ebfaafd Mon Sep 17 00:00:00 2001 From: Pritom Mazumdar Date: Fri, 27 Mar 2026 17:48:21 +0530 Subject: [PATCH 2/3] bumped up version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 92bfdb4..ce17fdc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "token0" -version = "0.2.0" +version = "0.2.1" description = "Open-source API proxy that makes vision LLM calls 5-10x cheaper" readme = "README.md" license = "Apache-2.0" From 03f9204eb8589638e4be0b45428bd08951e8c815 Mon Sep 17 00:00:00 2001 From: Pritom Mazumdar Date: Fri, 27 Mar 2026 17:54:50 +0530 Subject: [PATCH 3/3] CI fixes --- tests/test_litellm_hook.py | 6 ++++-- token0/litellm_hook.py | 7 ++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/test_litellm_hook.py b/tests/test_litellm_hook.py index 9a85860..c9076bc 100644 --- a/tests/test_litellm_hook.py +++ b/tests/test_litellm_hook.py @@ -2,8 +2,10 @@ import pytest -from tests.conftest import make_image, make_text_image -from token0.litellm_hook import Token0Hook, _optimize_messages +litellm = pytest.importorskip("litellm", reason="litellm not installed") + +from tests.conftest import make_image, make_text_image # noqa: E402 +from token0.litellm_hook import Token0Hook, _optimize_messages # noqa: E402 class TestOptimizeMessages: diff --git a/token0/litellm_hook.py b/token0/litellm_hook.py index 5560f13..193c6a0 100644 --- a/token0/litellm_hook.py +++ b/token0/litellm_hook.py @@ -14,7 +14,12 @@ import logging -from litellm.integrations.custom_logger import CustomLogger +try: + from litellm.integrations.custom_logger import CustomLogger +except ImportError: + raise ImportError( + "litellm is required for the Token0Hook integration. Install it with: pip install litellm" + ) from token0.optimization.analyzer import analyze_image from token0.optimization.router import plan_optimization