Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,35 @@ response = llm.invoke([HumanMessage(content=[

Works with any LangChain chat model — ChatOpenAI, ChatAnthropic, ChatGoogleGenerativeAI, etc.

### Use With Instructor

Already using [Instructor](https://github.com/jxnl/instructor) for structured outputs? Add Token0 as a pre-call hook:

```python
import instructor
import openai
from token0.instructor_hook import Token0Hook

client = instructor.from_openai(openai.OpenAI())
client.on("completion:kwargs", Token0Hook())

# All calls now get image optimization automatically
response = client.chat.completions.create(
model="gpt-4.1",
messages=[{
"role": "user",
"content": [
{"type": "text", "text": "What is the total on this invoice?"},
{"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}}
]
}],
response_model=MyModel,  # MyModel: your own Pydantic model describing the expected output
)
# Invoice image cropped to bottom 40% (saliency) + OCR routed — ~90% token savings
```

Works with any instructor-supported provider — OpenAI, Anthropic, Google, Ollama.

### Use With Ollama (free, fully local)

```bash
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "token0"
version = "0.3.1"
version = "0.3.2"
description = "Open-source API proxy that makes vision LLM calls 5-10x cheaper"
readme = "README.md"
license = "Apache-2.0"
Expand Down
189 changes: 189 additions & 0 deletions tests/test_instructor_hook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
"""Tests for the instructor integration hook.

All tests are mock-only — no real LLM calls, no instructor dependency required.
"""

from unittest.mock import patch

from token0.instructor_hook import Token0Hook

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Stats payload for a run where nothing was optimized: token counts are equal,
# no optimizations are listed and no alternative model is recommended.
_MOCK_STATS_NO_SAVINGS = {
"tokens_before": 100,
"tokens_after": 100,
"tokens_saved": 0,
"optimizations": [],
"recommended_model": None,
}

# Stats payload for a run that saved tokens (765 -> 85) without recommending
# a different model.
_MOCK_STATS_WITH_SAVINGS = {
"tokens_before": 765,
"tokens_after": 85,
"tokens_saved": 680,
"optimizations": ["prompt-aware -> low detail (simple task)"],
"recommended_model": None,
}

# Same savings as above, but additionally recommends "gpt-4o-mini"; used to
# exercise the model-cascade branch of the hook.
_MOCK_STATS_CASCADE = {
"tokens_before": 765,
"tokens_after": 85,
"tokens_saved": 680,
"optimizations": ["cascade -> gpt-4o-mini"],
"recommended_model": "gpt-4o-mini",
}

# A single user message with one text part and one base64 data-URL image part.
_IMAGE_MESSAGE = [
{
"role": "user",
"content": [
{"type": "text", "text": "Describe this image"},
{"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,abc123"}},
],
}
]

# A single user message whose content is a plain string (no image parts).
_TEXT_MESSAGE = [{"role": "user", "content": "Hello"}]


# ---------------------------------------------------------------------------
# Basic behaviour
# ---------------------------------------------------------------------------


def test_hook_is_callable():
    """A default-constructed Token0Hook can be used as a callable handler."""
    assert callable(Token0Hook())


def test_empty_messages_passthrough():
    """An empty ``messages`` list is handed back without modification."""
    payload = {"model": "gpt-4o", "messages": []}
    expected = {"model": "gpt-4o", "messages": []}

    returned = Token0Hook()(payload)

    assert returned == expected


def test_missing_messages_passthrough():
    """Kwargs that carry no ``messages`` key at all come back unchanged."""
    payload = {"model": "gpt-4o"}

    returned = Token0Hook()(payload)

    assert returned == {"model": "gpt-4o"}


def test_text_only_passthrough():
    """Text-only messages survive the hook when the optimizer returns them as-is."""
    request = {"model": "gpt-4o", "messages": _TEXT_MESSAGE}
    stubbed_optimizer = patch(
        "token0.instructor_hook.optimize_messages",
        return_value=(_TEXT_MESSAGE, _MOCK_STATS_NO_SAVINGS),
    )

    with stubbed_optimizer:
        returned = Token0Hook()(request)

    assert returned["messages"] == _TEXT_MESSAGE


# ---------------------------------------------------------------------------
# Image optimization
# ---------------------------------------------------------------------------


def test_image_messages_are_optimized():
    """The hook forwards messages to the optimizer and installs its output."""
    extracted = [{"role": "user", "content": [{"type": "text", "text": "[Extracted text]"}]}]
    request = {"model": "gpt-4o", "messages": _IMAGE_MESSAGE}

    with patch("token0.instructor_hook.optimize_messages") as optimizer:
        optimizer.return_value = (extracted, _MOCK_STATS_WITH_SAVINGS)
        outcome = Token0Hook()(request)

    # Default hook settings must be passed through as keyword arguments.
    optimizer.assert_called_once_with(
        _IMAGE_MESSAGE,
        "gpt-4o",
        detail_override=None,
        enable_cascade=False,
    )
    assert outcome["messages"] == extracted


def test_detail_override_passed_through():
    """The ``detail_override`` given to the hook reaches ``optimize_messages``.

    The value is accepted either as a keyword argument or as the third
    positional argument. The positional lookup is length-guarded so that a
    missing argument produces a normal assertion failure rather than an
    ``IndexError`` from indexing a too-short args tuple.
    """
    hook = Token0Hook(detail_override="low")
    with patch(
        "token0.instructor_hook.optimize_messages",
        return_value=(_IMAGE_MESSAGE, _MOCK_STATS_NO_SAVINGS),
    ) as mock_opt:
        hook({"model": "gpt-4o", "messages": _IMAGE_MESSAGE})

    args, kwargs = mock_opt.call_args
    detail_override = kwargs.get("detail_override", args[2] if len(args) > 2 else None)
    assert detail_override == "low"


def test_enable_cascade_passed_through():
    """``enable_cascade=True`` on the hook is forwarded to ``optimize_messages``."""
    request = {"model": "gpt-4o", "messages": _IMAGE_MESSAGE}
    cascading_hook = Token0Hook(enable_cascade=True)

    with patch("token0.instructor_hook.optimize_messages") as optimizer:
        optimizer.return_value = (_IMAGE_MESSAGE, _MOCK_STATS_NO_SAVINGS)
        cascading_hook(request)

    positional, keyword = optimizer.call_args
    # Accept the flag as a keyword or as the fourth positional argument.
    if "enable_cascade" in keyword:
        observed = keyword["enable_cascade"]
    elif len(positional) > 3:
        observed = positional[3]
    else:
        observed = False
    assert observed is True


# ---------------------------------------------------------------------------
# Model cascade
# ---------------------------------------------------------------------------


def test_cascade_updates_model():
    """A recommended model in the stats replaces the requested model."""
    request = {"model": "gpt-4o", "messages": _IMAGE_MESSAGE}
    cascading_hook = Token0Hook(enable_cascade=True)

    with patch("token0.instructor_hook.optimize_messages") as optimizer:
        optimizer.return_value = (_IMAGE_MESSAGE, _MOCK_STATS_CASCADE)
        outcome = cascading_hook(request)

    assert outcome["model"] == "gpt-4o-mini"


def test_no_cascade_leaves_model_unchanged():
    """Without a recommended model the original ``model`` value is kept."""
    request = {"model": "gpt-4o", "messages": _IMAGE_MESSAGE}

    with patch("token0.instructor_hook.optimize_messages") as optimizer:
        optimizer.return_value = (_IMAGE_MESSAGE, _MOCK_STATS_WITH_SAVINGS)
        outcome = Token0Hook()(request)

    assert outcome["model"] == "gpt-4o"


# ---------------------------------------------------------------------------
# kwargs passthrough
# ---------------------------------------------------------------------------


def test_extra_kwargs_preserved():
    """Keys the hook does not manage are carried through untouched."""
    request = {
        "model": "gpt-4o",
        "messages": _IMAGE_MESSAGE,
        "temperature": 0.7,
        "max_tokens": 512,
    }

    with patch("token0.instructor_hook.optimize_messages") as optimizer:
        optimizer.return_value = (_IMAGE_MESSAGE, _MOCK_STATS_NO_SAVINGS)
        outcome = Token0Hook()(request)

    assert outcome["temperature"] == 0.7
    assert outcome["max_tokens"] == 512


def test_no_model_key_still_works():
    """The hook tolerates kwargs that carry no ``model`` key.

    Besides not raising, the hook must install the optimizer's messages and
    must not invent a ``model`` entry when the stats recommend none.
    """
    hook = Token0Hook()
    with patch(
        "token0.instructor_hook.optimize_messages",
        return_value=(_IMAGE_MESSAGE, _MOCK_STATS_NO_SAVINGS),
    ):
        result = hook({"messages": _IMAGE_MESSAGE})

    assert result["messages"] == _IMAGE_MESSAGE
    # No recommended model in the stats, so no "model" key may appear.
    assert "model" not in result
86 changes: 86 additions & 0 deletions token0/instructor_hook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""Instructor integration — Token0 as a pre-call hook.

Hooks into instructor's COMPLETION_KWARGS event to optimize vision tokens
before every LLM call. Works with any instructor-supported provider.

Usage:
import instructor
import openai
from token0.instructor_hook import Token0Hook

client = instructor.from_openai(openai.OpenAI())
hook = Token0Hook()
client.on("completion:kwargs", hook)

# All calls now get image optimization automatically
response = client.chat.completions.create(
model="gpt-4o",
messages=[{
"role": "user",
"content": [
{"type": "text", "text": "What is the total on this invoice?"},
{"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}}
]
}],
response_model=MyModel,
)

Works with any instructor provider — OpenAI, Anthropic, Google, Ollama, etc.
No proxy required — runs as an in-process pre-call hook.
"""

import logging
from typing import Any

from token0.optimization.message_optimizer import optimize_messages

# Module logger; the hook reports token savings and cascade decisions on it at INFO level.
logger = logging.getLogger("token0.instructor")


class Token0Hook:
"""Instructor pre-call hook that optimizes vision tokens before LLM calls.

Attach to any instructor client via client.on("completion:kwargs", Token0Hook()).

Args:
enable_cascade: Auto-route simple tasks to cheaper models (default: False).
detail_override: Force "low" or "high" detail mode for OpenAI (default: auto).
"""

def __init__(
self,
enable_cascade: bool = False,
detail_override: str | None = None,
):
self.enable_cascade = enable_cascade
self.detail_override = detail_override

def __call__(self, kwargs: dict[str, Any]) -> dict[str, Any]:
"""Optimize images in kwargs["messages"] before the LLM call."""
messages = kwargs.get("messages")
if not messages:
return kwargs

model = kwargs.get("model", "")
optimized_messages, stats = optimize_messages(
messages,
model,
detail_override=self.detail_override,
enable_cascade=self.enable_cascade,
)

kwargs["messages"] = optimized_messages

if stats["tokens_saved"] > 0:
logger.info(
"token0: %d tokens saved (%s)",
stats["tokens_saved"],
", ".join(stats["optimizations"]),
)

# Cascade: switch to cheaper model if recommended
if stats.get("recommended_model"):
logger.info("token0: cascade %s -> %s", model, stats["recommended_model"])
kwargs["model"] = stats["recommended_model"]

return kwargs
Loading