diff --git a/README.md b/README.md index 05d1ad5..b09a99d 100644 --- a/README.md +++ b/README.md @@ -449,6 +449,35 @@ response = llm.invoke([HumanMessage(content=[ Works with any LangChain chat model — ChatOpenAI, ChatAnthropic, ChatGoogleGenerativeAI, etc. +### Use With Instructor + +Already using [Instructor](https://github.com/jxnl/instructor) for structured outputs? Add Token0 as a pre-call hook: + +```python +import instructor +import openai +from token0.instructor_hook import Token0Hook + +client = instructor.from_openai(openai.OpenAI()) +client.on("completion:kwargs", Token0Hook()) + +# All calls now get image optimization automatically +response = client.chat.completions.create( + model="gpt-4.1", + messages=[{ + "role": "user", + "content": [ + {"type": "text", "text": "What is the total on this invoice?"}, + {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}} + ] + }], + response_model=MyModel, +) +# Invoice image cropped to bottom 40% (saliency) + OCR routed — ~90% token savings +``` + +Works with any instructor-supported provider — OpenAI, Anthropic, Google, Ollama. + ### Use With Ollama (free, fully local) ```bash diff --git a/pyproject.toml b/pyproject.toml index 27c08e4..5856d67 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "token0" -version = "0.3.1" +version = "0.3.2" description = "Open-source API proxy that makes vision LLM calls 5-10x cheaper" readme = "README.md" license = "Apache-2.0" diff --git a/tests/test_instructor_hook.py b/tests/test_instructor_hook.py new file mode 100644 index 0000000..1dd3ccd --- /dev/null +++ b/tests/test_instructor_hook.py @@ -0,0 +1,189 @@ +"""Tests for the instructor integration hook. + +All tests are mock-only — no real LLM calls, no instructor dependency required. 
from unittest.mock import patch

from token0.instructor_hook import Token0Hook

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Dotted path of the optimizer name as imported into the hook module; every
# test patches it so no real optimization work is performed.
_OPTIMIZER_TARGET = "token0.instructor_hook.optimize_messages"

_MOCK_STATS_NO_SAVINGS = {
    "tokens_before": 100,
    "tokens_after": 100,
    "tokens_saved": 0,
    "optimizations": [],
    "recommended_model": None,
}

_MOCK_STATS_WITH_SAVINGS = {
    "tokens_before": 765,
    "tokens_after": 85,
    "tokens_saved": 680,
    "optimizations": ["prompt-aware -> low detail (simple task)"],
    "recommended_model": None,
}

_MOCK_STATS_CASCADE = {
    "tokens_before": 765,
    "tokens_after": 85,
    "tokens_saved": 680,
    "optimizations": ["cascade -> gpt-4o-mini"],
    "recommended_model": "gpt-4o-mini",
}

_IMAGE_MESSAGE = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image"},
            {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,abc123"}},
        ],
    }
]

_TEXT_MESSAGE = [{"role": "user", "content": "Hello"}]


def _patch_optimizer(messages, stats):
    """Patch the hook's optimizer so it returns ``(messages, stats)``."""
    return patch(_OPTIMIZER_TARGET, return_value=(messages, stats))


def _optimizer_argument(mock_opt, name, position, default=None):
    """Return the value passed to the patched optimizer for one parameter.

    Looks the parameter up by keyword first and by position second, so the
    tests do not depend on how the hook spells the call. Returns *default*
    when the argument was not passed at all, instead of raising IndexError.
    """
    args, kwargs = mock_opt.call_args
    if name in kwargs:
        return kwargs[name]
    if len(args) > position:
        return args[position]
    return default


# ---------------------------------------------------------------------------
# Basic behaviour
# ---------------------------------------------------------------------------


def test_hook_is_callable():
    hook = Token0Hook()
    assert callable(hook)


def test_empty_messages_passthrough():
    hook = Token0Hook()
    with patch(_OPTIMIZER_TARGET) as mock_opt:
        result = hook({"model": "gpt-4o", "messages": []})

    # Nothing to optimize: the optimizer must not even be consulted.
    mock_opt.assert_not_called()
    assert result == {"model": "gpt-4o", "messages": []}


def test_missing_messages_passthrough():
    hook = Token0Hook()
    with patch(_OPTIMIZER_TARGET) as mock_opt:
        result = hook({"model": "gpt-4o"})

    mock_opt.assert_not_called()
    assert result == {"model": "gpt-4o"}


def test_text_only_passthrough():
    hook = Token0Hook()
    with _patch_optimizer(_TEXT_MESSAGE, _MOCK_STATS_NO_SAVINGS):
        result = hook({"model": "gpt-4o", "messages": _TEXT_MESSAGE})
    assert result["messages"] == _TEXT_MESSAGE


# ---------------------------------------------------------------------------
# Image optimization
# ---------------------------------------------------------------------------


def test_image_messages_are_optimized():
    hook = Token0Hook()
    optimized = [{"role": "user", "content": [{"type": "text", "text": "[Extracted text]"}]}]
    with _patch_optimizer(optimized, _MOCK_STATS_WITH_SAVINGS) as mock_opt:
        result = hook({"model": "gpt-4o", "messages": _IMAGE_MESSAGE})

    mock_opt.assert_called_once_with(
        _IMAGE_MESSAGE, "gpt-4o", detail_override=None, enable_cascade=False
    )
    assert result["messages"] == optimized


def test_detail_override_passed_through():
    hook = Token0Hook(detail_override="low")
    with _patch_optimizer(_IMAGE_MESSAGE, _MOCK_STATS_NO_SAVINGS) as mock_opt:
        hook({"model": "gpt-4o", "messages": _IMAGE_MESSAGE})

    assert _optimizer_argument(mock_opt, "detail_override", 2) == "low"


def test_enable_cascade_passed_through():
    hook = Token0Hook(enable_cascade=True)
    with _patch_optimizer(_IMAGE_MESSAGE, _MOCK_STATS_NO_SAVINGS) as mock_opt:
        hook({"model": "gpt-4o", "messages": _IMAGE_MESSAGE})

    assert _optimizer_argument(mock_opt, "enable_cascade", 3, default=False) is True


# ---------------------------------------------------------------------------
# Model cascade
# ---------------------------------------------------------------------------


def test_cascade_updates_model():
    hook = Token0Hook(enable_cascade=True)
    with _patch_optimizer(_IMAGE_MESSAGE, _MOCK_STATS_CASCADE):
        result = hook({"model": "gpt-4o", "messages": _IMAGE_MESSAGE})

    assert result["model"] == "gpt-4o-mini"


def test_no_cascade_leaves_model_unchanged():
    hook = Token0Hook()
    with _patch_optimizer(_IMAGE_MESSAGE, _MOCK_STATS_WITH_SAVINGS):
        result = hook({"model": "gpt-4o", "messages": _IMAGE_MESSAGE})

    assert result["model"] == "gpt-4o"


# ---------------------------------------------------------------------------
# kwargs passthrough
# ---------------------------------------------------------------------------


def test_extra_kwargs_preserved():
    hook = Token0Hook()
    with _patch_optimizer(_IMAGE_MESSAGE, _MOCK_STATS_NO_SAVINGS):
        result = hook(
            {
                "model": "gpt-4o",
                "messages": _IMAGE_MESSAGE,
                "temperature": 0.7,
                "max_tokens": 512,
            }
        )

    assert result["temperature"] == 0.7
    assert result["max_tokens"] == 512


def test_no_model_key_still_works():
    hook = Token0Hook()
    with _patch_optimizer(_IMAGE_MESSAGE, _MOCK_STATS_NO_SAVINGS) as mock_opt:
        result = hook({"messages": _IMAGE_MESSAGE})

    assert result["messages"] == _IMAGE_MESSAGE
    # No cascade recommendation, so the hook must not invent a model key.
    assert "model" not in result
    # The optimizer is still called, with an empty model name as fallback.
    assert _optimizer_argument(mock_opt, "model", 1) == ""
+ +Usage: + import instructor + import openai + from token0.instructor_hook import Token0Hook + + client = instructor.from_openai(openai.OpenAI()) + hook = Token0Hook() + client.on("completion:kwargs", hook) + + # All calls now get image optimization automatically + response = client.chat.completions.create( + model="gpt-4o", + messages=[{ + "role": "user", + "content": [ + {"type": "text", "text": "What is the total on this invoice?"}, + {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}} + ] + }], + response_model=MyModel, + ) + +Works with any instructor provider — OpenAI, Anthropic, Google, Ollama, etc. +No proxy required — runs as an in-process pre-call hook. +""" + +import logging +from typing import Any + +from token0.optimization.message_optimizer import optimize_messages + +logger = logging.getLogger("token0.instructor") + + +class Token0Hook: + """Instructor pre-call hook that optimizes vision tokens before LLM calls. + + Attach to any instructor client via client.on("completion:kwargs", Token0Hook()). + + Args: + enable_cascade: Auto-route simple tasks to cheaper models (default: False). + detail_override: Force "low" or "high" detail mode for OpenAI (default: auto). 
+ """ + + def __init__( + self, + enable_cascade: bool = False, + detail_override: str | None = None, + ): + self.enable_cascade = enable_cascade + self.detail_override = detail_override + + def __call__(self, kwargs: dict[str, Any]) -> dict[str, Any]: + """Optimize images in kwargs["messages"] before the LLM call.""" + messages = kwargs.get("messages") + if not messages: + return kwargs + + model = kwargs.get("model", "") + optimized_messages, stats = optimize_messages( + messages, + model, + detail_override=self.detail_override, + enable_cascade=self.enable_cascade, + ) + + kwargs["messages"] = optimized_messages + + if stats["tokens_saved"] > 0: + logger.info( + "token0: %d tokens saved (%s)", + stats["tokens_saved"], + ", ".join(stats["optimizations"]), + ) + + # Cascade: switch to cheaper model if recommended + if stats.get("recommended_model"): + logger.info("token0: cascade %s -> %s", model, stats["recommended_model"]) + kwargs["model"] = stats["recommended_model"] + + return kwargs