diff --git a/tee_gateway/controllers/chat_controller.py b/tee_gateway/controllers/chat_controller.py index 291a684..a217cc8 100644 --- a/tee_gateway/controllers/chat_controller.py +++ b/tee_gateway/controllers/chat_controller.py @@ -455,7 +455,13 @@ def _create_non_streaming_response(chat_request: CreateChatCompletionRequest): # TODO: If no usage is returned, we should compute it here. usage = extract_usage(response) if usage: - openai_response["usage"] = usage + # Surface the standard OpenAI usage triple on the response; the + # reasoning split rides along to the cost calculator via `usage`. + openai_response["usage"] = { + "prompt_tokens": usage["prompt_tokens"], + "completion_tokens": usage["completion_tokens"], + "total_tokens": usage["total_tokens"], + } web_search_count = ( extract_web_search_count(response) if chat_request.web_search else 0 ) @@ -625,6 +631,14 @@ def generate(): for k, v in response.usage_metadata.items(): if isinstance(v, (int, float)): final_usage[k] = v + # Thinking tokens are billed at the cheaper text rate; they + # live in the nested output_token_details dict (skipped by + # the int/float loop above), so pull them out explicitly. + _otd = response.usage_metadata.get("output_token_details") + if isinstance(_otd, dict) and isinstance( + _otd.get("reasoning"), (int, float) + ): + final_usage["reasoning"] = _otd["reasoning"] chunks_iter: list = [] elif anthropic_structured_content is not None: # Emit the pre-computed structured result as a single chunk. @@ -780,6 +794,16 @@ def generate(): for k, v in chunk.usage_metadata.items(): if isinstance(v, (int, float)): final_usage[k] = final_usage.get(k, 0) + v + # Thinking tokens (billed at the cheaper text rate) are + # nested in output_token_details and emitted as deltas like + # the top-level counts, so accumulate them the same way. + _otd = chunk.usage_metadata.get("output_token_details") + if isinstance(_otd, dict) and isinstance( + _otd.get("reasoning"), (int, float) + ): + final_usage["reasoning"] = ( + final_usage.get("reasoning", 0) + _otd["reasoning"] + ) # Flush buffered tool calls for OpenAI/Anthropic if buffer_tool_calls and buffered_tool_calls: @@ -856,9 +880,15 @@ def generate(): if chat_request.web_search else 0 ) + # Pass thinking tokens to the cost calculator (for the image + # dual-rate split) without polluting the OpenAI usage triple. + cost_usage = dict( + final_data["usage"], + reasoning_tokens=final_usage.get("reasoning", 0), + ) cost = compute_session_cost( chat_request.model, - final_data["usage"], + cost_usage, web_search_count=web_search_count, ) if cost is not None: diff --git a/tee_gateway/llm_backend.py b/tee_gateway/llm_backend.py index 0e80d22..924d88a 100644 --- a/tee_gateway/llm_backend.py +++ b/tee_gateway/llm_backend.py @@ -421,10 +421,15 @@ def extract_usage(response) -> Optional[Dict[str, int]]: """Extract token usage from a LangChain response object.""" if hasattr(response, "usage_metadata") and response.usage_metadata: meta = response.usage_metadata + # Thinking tokens, when present, are folded into output_tokens but also + # broken out here. Image-output models bill them at the cheaper + # text/thinking rate (see compute_session_cost), so surface them. + details = meta.get("output_token_details") or {} return { "prompt_tokens": meta.get("input_tokens", 0), "completion_tokens": meta.get("output_tokens", 0), "total_tokens": meta.get("total_tokens", 0), + "reasoning_tokens": details.get("reasoning", 0), } return None diff --git a/tee_gateway/model_registry.py b/tee_gateway/model_registry.py index b1b790c..67c1796 100644 --- a/tee_gateway/model_registry.py +++ b/tee_gateway/model_registry.py @@ -34,6 +34,14 @@ class ModelConfig: # Flat USD price per generated image, for ``image_generation`` models. Token # prices are ignored for these models (set to 0 in the registry). per_image_price_usd: Optional[Decimal] = None + # USD per image-modality output token, for ``image_output`` models (Gemini + # "nano banana"). These providers bill image output at a higher rate than + # text/thinking output: image tokens at this rate, text + thinking tokens at + # ``output_price_usd``. ``None`` => single-rate billing (all output at + # ``output_price_usd``). langchain folds image+text+thinking into one + # ``output_tokens`` count and only breaks out thinking (``reasoning``), so the + # billing splits reasoning at ``output_price_usd`` and the remainder here. + image_output_price_usd: Optional[Decimal] = None # Per-search USD surcharge billed when native web search is used. ``None`` # means "use the provider default" (see WEB_SEARCH_PRICE_USD_BY_PROVIDER); # set an explicit value here to override a single model's web-search price. @@ -225,24 +233,28 @@ class SupportedModel(Enum): output_price_usd=Decimal("0.0000015"), thinking_budget=0, ) - # Native image generation ("nano banana"). Image output is billed as output - # tokens (~1290 tokens per image); pricing mirrors Google's image token rate. + # Native image generation ("nano banana"). Google bills output at two rates: + # text/thinking at $1.50/MTok and images at $30/MTok (~1290 tokens per + # 1024x1024 image ≈ $0.039/image); input (text/image) is $0.30/MTok. GEMINI_2_5_FLASH_IMAGE = ModelConfig( provider="google", api_name="gemini-2.5-flash-image", input_price_usd=Decimal("0.0000003"), - output_price_usd=Decimal("0.00003"), + output_price_usd=Decimal("0.0000015"), image_output=True, + image_output_price_usd=Decimal("0.00003"), ) # Native image generation ("nano banana 2"), the latest Gemini image model. - # Image output is billed as output tokens (~1120 tokens per 1024x1024 image - # ≈ $0.067/image at $60/MTok); input (text/image) is $0.50/MTok. + # Google bills output at two rates: text/thinking at $3/MTok and images at + # $60/MTok (~1120 tokens per 1K image ≈ $0.067/image; $0.045/0.101/0.151 at + # 0.5K/2K/4K); input (text/image) is $0.50/MTok. GEMINI_3_1_FLASH_IMAGE = ModelConfig( provider="google", api_name="gemini-3.1-flash-image", input_price_usd=Decimal("0.0000005"), - output_price_usd=Decimal("0.00006"), + output_price_usd=Decimal("0.000003"), image_output=True, + image_output_price_usd=Decimal("0.00006"), ) GEMINI_3_5_FLASH = ModelConfig( provider="google", diff --git a/tee_gateway/pricing.py b/tee_gateway/pricing.py index 6a9ef06..1bd952a 100644 --- a/tee_gateway/pricing.py +++ b/tee_gateway/pricing.py @@ -75,9 +75,26 @@ def compute_session_cost( in_tok = max(0, int(usage["prompt_tokens"])) out_tok = max(0, int(usage["completion_tokens"])) - raw_usd = (Decimal(in_tok) * cfg.input_price_usd) + ( - Decimal(out_tok) * cfg.output_price_usd - ) + if cfg.image_output and cfg.image_output_price_usd is not None: + # Dual-rate output for Gemini image models: image-modality tokens are + # billed at image_output_price_usd, text + thinking at output_price_usd. + # langchain folds image+text+thinking into output_tokens and only + # breaks out thinking (reasoning), so bill reasoning at the text rate + # and the remainder (image, plus any small text caption) at the image + # rate. Conservative: never undercharges the image and is far below + # billing all output at the image rate (the previous behavior). + reasoning_tok = max(0, int(usage.get("reasoning_tokens", 0) or 0)) + reasoning_tok = min(reasoning_tok, out_tok) + image_tok = out_tok - reasoning_tok + raw_usd = ( + (Decimal(in_tok) * cfg.input_price_usd) + + (Decimal(image_tok) * cfg.image_output_price_usd) + + (Decimal(reasoning_tok) * cfg.output_price_usd) + ) + else: + raw_usd = (Decimal(in_tok) * cfg.input_price_usd) + ( + Decimal(out_tok) * cfg.output_price_usd + ) # Native web search is billed per search unit on top of token cost. searches = max(0, int(web_search_count)) diff --git a/tee_gateway/test/test_image_billing.py b/tee_gateway/test/test_image_billing.py index ad22bcd..dbb2fc5 100644 --- a/tee_gateway/test/test_image_billing.py +++ b/tee_gateway/test/test_image_billing.py @@ -3,10 +3,12 @@ Gemini image-output models (e.g. ``gemini-2.5-flash-image``) bill each generated image as ~1290 output tokens reported in ``candidates_token_count``. Our billing relies on langchain-google-genai folding that field into -``usage_metadata.output_tokens`` so the image rides the normal token-priced path -(``output_tokens -> completion_tokens -> output_price_usd``). These tests pin -that assumption: if a future library bump stops folding image tokens into -``output_tokens``, or our pricing stops charging them, they fail loudly. +``usage_metadata.output_tokens`` so the image rides the token-priced path. Google +bills output at TWO rates, though — image-modality tokens at ``image_output_price_usd`` +and text/thinking at the cheaper ``output_price_usd`` — so billing splits the +folded ``output_tokens`` using the ``reasoning`` count langchain breaks out: +thinking is charged at the text rate, the rest (image + any caption) at the image +rate. These tests pin both the langchain folding assumption and the split. No network or API key required — we construct a synthetic Gemini response object and inject a stub price feed. @@ -19,6 +21,8 @@ from google.genai.types import GenerateContentResponse from langchain_google_genai.chat_models import _response_to_result +from tee_gateway.llm_backend import extract_usage +from tee_gateway.model_registry import get_model_config from tee_gateway.price_feed import get_price_feed, set_price_feed from tee_gateway.pricing import compute_session_cost @@ -104,33 +108,71 @@ def tearDown(self): set_price_feed(self._prev_feed) def _usage_dict(self, response) -> dict: - """Mirror how chat_controller shapes usage_metadata into the OpenAI form.""" - um = response.generations[0].message.usage_metadata - return { - "prompt_tokens": um["input_tokens"], - "completion_tokens": um["output_tokens"], - "total_tokens": um["total_tokens"], - } + """Mirror how chat_controller bills: extract_usage carries reasoning.""" + usage = extract_usage(response.generations[0].message) + assert usage is not None + return usage - def test_generated_image_is_charged_as_output_tokens(self): + def test_generated_image_is_charged_at_image_rate(self): resp = _response_to_result( _gemini_image_response(candidates_tokens=IMAGE_TOKENS) ) cost = compute_session_cost(IMAGE_MODEL, self._usage_dict(resp)) self.assertIsNotNone(cost) - # Expected raw cost: 9 input + 1290 output tokens at the registry rates. - from tee_gateway.model_registry import get_model_config - + # No thinking: all 1290 output tokens are image-modality, billed at the + # image rate (NOT the cheaper text/thinking output_price_usd). cfg = get_model_config(IMAGE_MODEL) + self.assertIsNotNone(cfg.image_output_price_usd) expected = (Decimal(9) * cfg.input_price_usd) + ( - Decimal(IMAGE_TOKENS) * cfg.output_price_usd + Decimal(IMAGE_TOKENS) * cfg.image_output_price_usd ) # settled_usd rounds the OPG integer up, so it is >= raw by at most one # smallest unit (1e-18 USD here) — assert effectively-equal. self.assertAlmostEqual(cost.cost_usd, expected, places=9) self.assertGreater(cost.cost_opg, 0) + def test_thinking_tokens_billed_at_text_rate(self): + """Thinking tokens are charged at output_price_usd, image at image rate.""" + thoughts = 800 + resp = _response_to_result( + _gemini_image_response( + candidates_tokens=IMAGE_TOKENS, thoughts_tokens=thoughts + ) + ) + usage = self._usage_dict(resp) + # langchain folds thoughts into output_tokens and breaks them out. + self.assertEqual(usage["reasoning_tokens"], thoughts) + + cost = compute_session_cost(IMAGE_MODEL, usage) + self.assertIsNotNone(cost) + + cfg = get_model_config(IMAGE_MODEL) + expected = ( + (Decimal(9) * cfg.input_price_usd) + + (Decimal(IMAGE_TOKENS) * cfg.image_output_price_usd) + + (Decimal(thoughts) * cfg.output_price_usd) + ) + self.assertAlmostEqual(cost.cost_usd, expected, places=9) + + def test_thinking_is_cheaper_than_billing_all_at_image_rate(self): + """Regression: thinking tokens must not be billed at the image rate.""" + thoughts = 800 + resp = _response_to_result( + _gemini_image_response( + candidates_tokens=IMAGE_TOKENS, thoughts_tokens=thoughts + ) + ) + cost = compute_session_cost(IMAGE_MODEL, self._usage_dict(resp)) + + cfg = get_model_config(IMAGE_MODEL) + # The old (buggy) behavior billed every output token at the image rate. + all_at_image_rate = (Decimal(9) * cfg.input_price_usd) + ( + Decimal(IMAGE_TOKENS + thoughts) * cfg.image_output_price_usd + ) + self.assertIsNotNone(cost) + self.assertLess(cost.cost_usd, all_at_image_rate) + def test_more_images_cost_more(self): """Cost scales with image tokens — not a flat per-request fee.""" one = _response_to_result( diff --git a/tee_gateway/test/test_price_feed.py b/tee_gateway/test/test_price_feed.py index 17af102..b6ba19e 100644 --- a/tee_gateway/test/test_price_feed.py +++ b/tee_gateway/test/test_price_feed.py @@ -401,6 +401,9 @@ def _patch_model( cfg = MagicMock() cfg.input_price_usd = Decimal(input_price) cfg.output_price_usd = Decimal(output_price) + # Not an image-output model: keep the single-rate output path. + cfg.image_output = False + cfg.image_output_price_usd = None return patch("tee_gateway.pricing.get_model_config", return_value=cfg) def test_calls_get_price(self): diff --git a/tests/test_pricing.py b/tests/test_pricing.py index 0b4a0c4..cb41d8c 100644 --- a/tests/test_pricing.py +++ b/tests/test_pricing.py @@ -238,7 +238,10 @@ def test_gemini_3_1_flash_image_resolves(self): cfg = get_model_config("gemini-3.1-flash-image") self.assertEqual(cfg.provider, "google") self.assertEqual(cfg.input_price_usd, Decimal("0.0000005")) - self.assertEqual(cfg.output_price_usd, Decimal("0.00006")) + # Output is dual-rate: text/thinking at output_price_usd, images at + # image_output_price_usd ($3 vs $60 per MTok). + self.assertEqual(cfg.output_price_usd, Decimal("0.000003")) + self.assertEqual(cfg.image_output_price_usd, Decimal("0.00006")) self.assertTrue(cfg.image_output) # ── xAI Grok ────────────────────────────────────────────────────────────