Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 32 additions & 2 deletions tee_gateway/controllers/chat_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,13 @@ def _create_non_streaming_response(chat_request: CreateChatCompletionRequest):
# TODO: If no usage is returned, we should compute it here.
usage = extract_usage(response)
if usage:
openai_response["usage"] = usage
# Surface the standard OpenAI usage triple on the response; the
# reasoning split rides along to the cost calculator via `usage`.
openai_response["usage"] = {
"prompt_tokens": usage["prompt_tokens"],
"completion_tokens": usage["completion_tokens"],
"total_tokens": usage["total_tokens"],
}
web_search_count = (
extract_web_search_count(response) if chat_request.web_search else 0
)
Expand Down Expand Up @@ -625,6 +631,14 @@ def generate():
for k, v in response.usage_metadata.items():
if isinstance(v, (int, float)):
final_usage[k] = v
# Thinking tokens are billed at the cheaper text rate; they
# live in the nested output_token_details dict (skipped by
# the int/float loop above), so pull them out explicitly.
_otd = response.usage_metadata.get("output_token_details")
if isinstance(_otd, dict) and isinstance(
_otd.get("reasoning"), (int, float)
):
final_usage["reasoning"] = _otd["reasoning"]
chunks_iter: list = []
elif anthropic_structured_content is not None:
# Emit the pre-computed structured result as a single chunk.
Expand Down Expand Up @@ -780,6 +794,16 @@ def generate():
for k, v in chunk.usage_metadata.items():
if isinstance(v, (int, float)):
final_usage[k] = final_usage.get(k, 0) + v
# Thinking tokens (billed at the cheaper text rate) are
# nested in output_token_details and emitted as deltas like
# the top-level counts, so accumulate them the same way.
_otd = chunk.usage_metadata.get("output_token_details")
if isinstance(_otd, dict) and isinstance(
_otd.get("reasoning"), (int, float)
):
final_usage["reasoning"] = (
final_usage.get("reasoning", 0) + _otd["reasoning"]
)

# Flush buffered tool calls for OpenAI/Anthropic
if buffer_tool_calls and buffered_tool_calls:
Expand Down Expand Up @@ -856,9 +880,15 @@ def generate():
if chat_request.web_search
else 0
)
# Pass thinking tokens to the cost calculator (for the image
# dual-rate split) without polluting the OpenAI usage triple.
cost_usage = dict(
final_data["usage"],
reasoning_tokens=final_usage.get("reasoning", 0),
)
cost = compute_session_cost(
chat_request.model,
final_data["usage"],
cost_usage,
web_search_count=web_search_count,
)
if cost is not None:
Expand Down
5 changes: 5 additions & 0 deletions tee_gateway/llm_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,10 +421,15 @@ def extract_usage(response) -> Optional[Dict[str, int]]:
"""Extract token usage from a LangChain response object."""
if hasattr(response, "usage_metadata") and response.usage_metadata:
meta = response.usage_metadata
# Thinking tokens, when present, are folded into output_tokens but also
# broken out here. Image-output models bill them at the cheaper
# text/thinking rate (see compute_session_cost), so surface them.
details = meta.get("output_token_details") or {}
return {
"prompt_tokens": meta.get("input_tokens", 0),
"completion_tokens": meta.get("output_tokens", 0),
"total_tokens": meta.get("total_tokens", 0),
"reasoning_tokens": details.get("reasoning", 0),
}
return None

Expand Down
24 changes: 18 additions & 6 deletions tee_gateway/model_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,14 @@ class ModelConfig:
# Flat USD price per generated image, for ``image_generation`` models. Token
# prices are ignored for these models (set to 0 in the registry).
per_image_price_usd: Optional[Decimal] = None
# USD per image-modality output token, for ``image_output`` models (Gemini
# "nano banana"). These providers bill image output at a higher rate than
# text/thinking output: image tokens at this rate, text + thinking tokens at
# ``output_price_usd``. ``None`` => single-rate billing (all output at
# ``output_price_usd``). langchain folds image+text+thinking into one
# ``output_tokens`` count and only breaks out thinking (``reasoning``), so
# billing charges reasoning at ``output_price_usd`` and the remainder (image
# plus any text caption) at this rate.
image_output_price_usd: Optional[Decimal] = None
# Per-search USD surcharge billed when native web search is used. ``None``
# means "use the provider default" (see WEB_SEARCH_PRICE_USD_BY_PROVIDER);
# set an explicit value here to override a single model's web-search price.
Expand Down Expand Up @@ -225,24 +233,28 @@ class SupportedModel(Enum):
output_price_usd=Decimal("0.0000015"),
thinking_budget=0,
)
# Native image generation ("nano banana"). Image output is billed as output
# tokens (~1290 tokens per image); pricing mirrors Google's image token rate.
# Native image generation ("nano banana"). Google bills output at two rates:
# text/thinking at $1.50/MTok and images at $30/MTok (~1290 tokens per
# 1024x1024 image ≈ $0.039/image); input (text/image) is $0.30/MTok.
GEMINI_2_5_FLASH_IMAGE = ModelConfig(
provider="google",
api_name="gemini-2.5-flash-image",
input_price_usd=Decimal("0.0000003"),
output_price_usd=Decimal("0.00003"),
output_price_usd=Decimal("0.0000015"),
image_output=True,
image_output_price_usd=Decimal("0.00003"),
)
# Native image generation ("nano banana 2"), the latest Gemini image model.
# Image output is billed as output tokens (~1120 tokens per 1024x1024 image
# ≈ $0.067/image at $60/MTok); input (text/image) is $0.50/MTok.
# Google bills output at two rates: text/thinking at $3/MTok and images at
# $60/MTok (~1120 tokens per 1K image ≈ $0.067/image; $0.045/0.101/0.151 at
# 0.5K/2K/4K); input (text/image) is $0.50/MTok.
GEMINI_3_1_FLASH_IMAGE = ModelConfig(
provider="google",
api_name="gemini-3.1-flash-image",
input_price_usd=Decimal("0.0000005"),
output_price_usd=Decimal("0.00006"),
output_price_usd=Decimal("0.000003"),
image_output=True,
image_output_price_usd=Decimal("0.00006"),
)
GEMINI_3_5_FLASH = ModelConfig(
provider="google",
Expand Down
23 changes: 20 additions & 3 deletions tee_gateway/pricing.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,26 @@ def compute_session_cost(
in_tok = max(0, int(usage["prompt_tokens"]))
out_tok = max(0, int(usage["completion_tokens"]))

raw_usd = (Decimal(in_tok) * cfg.input_price_usd) + (
Decimal(out_tok) * cfg.output_price_usd
)
if cfg.image_output and cfg.image_output_price_usd is not None:
# Dual-rate output for Gemini image models: image-modality tokens are
# billed at image_output_price_usd, text + thinking at output_price_usd.
# langchain folds image+text+thinking into output_tokens and only
# breaks out thinking (reasoning), so bill reasoning at the text rate
# and the remainder (image, plus any small text caption) at the image
# rate. Conservative: never undercharges the image, and costs less than
# billing all output at the image rate (the previous behavior) whenever
# thinking tokens are present; with none, the two are equal.
reasoning_tok = max(0, int(usage.get("reasoning_tokens", 0) or 0))
reasoning_tok = min(reasoning_tok, out_tok)
image_tok = out_tok - reasoning_tok
raw_usd = (
(Decimal(in_tok) * cfg.input_price_usd)
+ (Decimal(image_tok) * cfg.image_output_price_usd)
+ (Decimal(reasoning_tok) * cfg.output_price_usd)
)
else:
raw_usd = (Decimal(in_tok) * cfg.input_price_usd) + (
Decimal(out_tok) * cfg.output_price_usd
)

# Native web search is billed per search unit on top of token cost.
searches = max(0, int(web_search_count))
Expand Down
74 changes: 58 additions & 16 deletions tee_gateway/test/test_image_billing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
Gemini image-output models (e.g. ``gemini-2.5-flash-image``) bill each generated
image as ~1290 output tokens reported in ``candidates_token_count``. Our billing
relies on langchain-google-genai folding that field into
``usage_metadata.output_tokens`` so the image rides the normal token-priced path
(``output_tokens -> completion_tokens -> output_price_usd``). These tests pin
that assumption: if a future library bump stops folding image tokens into
``output_tokens``, or our pricing stops charging them, they fail loudly.
``usage_metadata.output_tokens`` so the image rides the token-priced path. Google
bills output at TWO rates, though — image-modality tokens at ``image_output_price_usd``
and text/thinking at the cheaper ``output_price_usd`` — so billing splits the
folded ``output_tokens`` using the ``reasoning`` count langchain breaks out:
thinking is charged at the text rate, the rest (image + any caption) at the image
rate. These tests pin both the langchain folding assumption and the split.

No network or API key required — we construct a synthetic Gemini response object
and inject a stub price feed.
Expand All @@ -19,6 +21,8 @@
from google.genai.types import GenerateContentResponse
from langchain_google_genai.chat_models import _response_to_result

from tee_gateway.llm_backend import extract_usage
from tee_gateway.model_registry import get_model_config
from tee_gateway.price_feed import get_price_feed, set_price_feed
from tee_gateway.pricing import compute_session_cost

Expand Down Expand Up @@ -104,33 +108,71 @@ def tearDown(self):
set_price_feed(self._prev_feed)

def _usage_dict(self, response) -> dict:
"""Mirror how chat_controller shapes usage_metadata into the OpenAI form."""
um = response.generations[0].message.usage_metadata
return {
"prompt_tokens": um["input_tokens"],
"completion_tokens": um["output_tokens"],
"total_tokens": um["total_tokens"],
}
"""Mirror how chat_controller bills: extract_usage carries reasoning."""
usage = extract_usage(response.generations[0].message)
assert usage is not None
return usage

def test_generated_image_is_charged_as_output_tokens(self):
def test_generated_image_is_charged_at_image_rate(self):
resp = _response_to_result(
_gemini_image_response(candidates_tokens=IMAGE_TOKENS)
)
cost = compute_session_cost(IMAGE_MODEL, self._usage_dict(resp))

self.assertIsNotNone(cost)
# Expected raw cost: 9 input + 1290 output tokens at the registry rates.
from tee_gateway.model_registry import get_model_config

# No thinking: all 1290 output tokens are image-modality, billed at the
# image rate (NOT the cheaper text/thinking output_price_usd).
cfg = get_model_config(IMAGE_MODEL)
self.assertIsNotNone(cfg.image_output_price_usd)
expected = (Decimal(9) * cfg.input_price_usd) + (
Decimal(IMAGE_TOKENS) * cfg.output_price_usd
Decimal(IMAGE_TOKENS) * cfg.image_output_price_usd
)
# settled_usd rounds the OPG integer up, so it is >= raw by at most one
# smallest unit (1e-18 USD here) — assert effectively-equal.
self.assertAlmostEqual(cost.cost_usd, expected, places=9)
self.assertGreater(cost.cost_opg, 0)

def test_thinking_tokens_billed_at_text_rate(self):
"""Thinking tokens are charged at output_price_usd, image at image rate."""
thoughts = 800
resp = _response_to_result(
_gemini_image_response(
candidates_tokens=IMAGE_TOKENS, thoughts_tokens=thoughts
)
)
usage = self._usage_dict(resp)
# langchain folds thoughts into output_tokens and breaks them out.
self.assertEqual(usage["reasoning_tokens"], thoughts)

cost = compute_session_cost(IMAGE_MODEL, usage)
self.assertIsNotNone(cost)

cfg = get_model_config(IMAGE_MODEL)
expected = (
(Decimal(9) * cfg.input_price_usd)
+ (Decimal(IMAGE_TOKENS) * cfg.image_output_price_usd)
+ (Decimal(thoughts) * cfg.output_price_usd)
)
self.assertAlmostEqual(cost.cost_usd, expected, places=9)

def test_thinking_is_cheaper_than_billing_all_at_image_rate(self):
"""Regression: thinking tokens must not be billed at the image rate."""
thoughts = 800
resp = _response_to_result(
_gemini_image_response(
candidates_tokens=IMAGE_TOKENS, thoughts_tokens=thoughts
)
)
cost = compute_session_cost(IMAGE_MODEL, self._usage_dict(resp))

cfg = get_model_config(IMAGE_MODEL)
# The old (buggy) behavior billed every output token at the image rate.
all_at_image_rate = (Decimal(9) * cfg.input_price_usd) + (
Decimal(IMAGE_TOKENS + thoughts) * cfg.image_output_price_usd
)
self.assertIsNotNone(cost)
self.assertLess(cost.cost_usd, all_at_image_rate)

def test_more_images_cost_more(self):
"""Cost scales with image tokens — not a flat per-request fee."""
one = _response_to_result(
Expand Down
3 changes: 3 additions & 0 deletions tee_gateway/test/test_price_feed.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,9 @@ def _patch_model(
cfg = MagicMock()
cfg.input_price_usd = Decimal(input_price)
cfg.output_price_usd = Decimal(output_price)
# Not an image-output model: keep the single-rate output path.
cfg.image_output = False
cfg.image_output_price_usd = None
return patch("tee_gateway.pricing.get_model_config", return_value=cfg)

def test_calls_get_price(self):
Expand Down
5 changes: 4 additions & 1 deletion tests/test_pricing.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,10 @@ def test_gemini_3_1_flash_image_resolves(self):
cfg = get_model_config("gemini-3.1-flash-image")
self.assertEqual(cfg.provider, "google")
self.assertEqual(cfg.input_price_usd, Decimal("0.0000005"))
self.assertEqual(cfg.output_price_usd, Decimal("0.00006"))
# Output is dual-rate: text/thinking at output_price_usd, images at
# image_output_price_usd ($3 vs $60 per MTok).
self.assertEqual(cfg.output_price_usd, Decimal("0.000003"))
self.assertEqual(cfg.image_output_price_usd, Decimal("0.00006"))
self.assertTrue(cfg.image_output)

# ── xAI Grok ────────────────────────────────────────────────────────────
Expand Down
Loading