OpenGradient · adambalogh · Jun 3, 2026 · Jun 3, 2026
diff --git a/tee_gateway/controllers/chat_controller.py b/tee_gateway/controllers/chat_controller.py
@@ -455,7 +455,13 @@ def _create_non_streaming_response(chat_request: CreateChatCompletionRequest):
         # TODO: If no usage is returned, we should compute it here.
         usage = extract_usage(response)
         if usage:
-            openai_response["usage"] = usage
+            # Surface the standard OpenAI usage triple on the response; the
+            # reasoning split rides along to the cost calculator via `usage`.
+            openai_response["usage"] = {
+                "prompt_tokens": usage["prompt_tokens"],
+                "completion_tokens": usage["completion_tokens"],
+                "total_tokens": usage["total_tokens"],
+            }
             web_search_count = (
                 extract_web_search_count(response) if chat_request.web_search else 0
             )
@@ -625,6 +631,14 @@ def generate():
                         for k, v in response.usage_metadata.items():
                             if isinstance(v, (int, float)):
                                 final_usage[k] = v
+                        # Thinking tokens are billed at the cheaper text rate; they
+                        # live in the nested output_token_details dict (skipped by
+                        # the int/float loop above), so pull them out explicitly.
+                        _otd = response.usage_metadata.get("output_token_details")
+                        if isinstance(_otd, dict) and isinstance(
+                            _otd.get("reasoning"), (int, float)
+                        ):
+                            final_usage["reasoning"] = _otd["reasoning"]
                     chunks_iter: list = []
                 elif anthropic_structured_content is not None:
                     # Emit the pre-computed structured result as a single chunk.
@@ -780,6 +794,16 @@ def generate():
                         for k, v in chunk.usage_metadata.items():
                             if isinstance(v, (int, float)):
                                 final_usage[k] = final_usage.get(k, 0) + v
+                        # Thinking tokens (billed at the cheaper text rate) are
+                        # nested in output_token_details and emitted as deltas like
+                        # the top-level counts, so accumulate them the same way.
+                        _otd = chunk.usage_metadata.get("output_token_details")
+                        if isinstance(_otd, dict) and isinstance(
+                            _otd.get("reasoning"), (int, float)
+                        ):
+                            final_usage["reasoning"] = (
+                                final_usage.get("reasoning", 0) + _otd["reasoning"]
+                            )
 
                 # Flush buffered tool calls for OpenAI/Anthropic
                 if buffer_tool_calls and buffered_tool_calls:
@@ -856,9 +880,15 @@ def generate():
                         if chat_request.web_search
                         else 0
                     )
+                    # Pass thinking tokens to the cost calculator (for the image
+                    # dual-rate split) without polluting the OpenAI usage triple.
+                    cost_usage = dict(
+                        final_data["usage"],
+                        reasoning_tokens=final_usage.get("reasoning", 0),
+                    )
                     cost = compute_session_cost(
                         chat_request.model,
-                        final_data["usage"],
+                        cost_usage,
                         web_search_count=web_search_count,
                     )
                     if cost is not None:

diff --git a/tee_gateway/llm_backend.py b/tee_gateway/llm_backend.py
@@ -421,10 +421,15 @@ def extract_usage(response) -> Optional[Dict[str, int]]:
     """Extract token usage from a LangChain response object."""
     if hasattr(response, "usage_metadata") and response.usage_metadata:
         meta = response.usage_metadata
+        # Thinking tokens, when present, are folded into output_tokens but also
+        # broken out in output_token_details["reasoning"]. Image-output models bill
+        # them at the cheaper text rate (see compute_session_cost), so surface them.
+        details = meta.get("output_token_details") or {}
         return {
             "prompt_tokens": meta.get("input_tokens", 0),
             "completion_tokens": meta.get("output_tokens", 0),
             "total_tokens": meta.get("total_tokens", 0),
+            "reasoning_tokens": details.get("reasoning", 0),
         }
     return None
 

diff --git a/tee_gateway/model_registry.py b/tee_gateway/model_registry.py
@@ -34,6 +34,14 @@ class ModelConfig:
     # Flat USD price per generated image, for ``image_generation`` models. Token
     # prices are ignored for these models (set to 0 in the registry).
     per_image_price_usd: Optional[Decimal] = None
+    # USD per image-modality output token, for ``image_output`` models (Gemini
+    # "nano banana"). These providers bill image output at a higher rate than
+    # text/thinking output: image tokens at this rate, text + thinking tokens at
+    # ``output_price_usd``. ``None`` => single-rate billing (all output at
+    # ``output_price_usd``). langchain folds image+text+thinking into one
+    # ``output_tokens`` count and only breaks out thinking (``reasoning``), so billing
+    # charges reasoning at ``output_price_usd`` and everything else at this rate.
+    image_output_price_usd: Optional[Decimal] = None
     # Per-search USD surcharge billed when native web search is used. ``None``
     # means "use the provider default" (see WEB_SEARCH_PRICE_USD_BY_PROVIDER);
     # set an explicit value here to override a single model's web-search price.
@@ -225,24 +233,28 @@ class SupportedModel(Enum):
         output_price_usd=Decimal("0.0000015"),
         thinking_budget=0,
     )
-    # Native image generation ("nano banana"). Image output is billed as output
-    # tokens (~1290 tokens per image); pricing mirrors Google's image token rate.
+    # Native image generation ("nano banana"). Google bills output at two rates:
+    # text/thinking at $1.50/MTok and images at $30/MTok (~1290 tokens per
+    # 1024x1024 image ≈ $0.039/image); input (text/image) is $0.30/MTok.
     GEMINI_2_5_FLASH_IMAGE = ModelConfig(
         provider="google",
         api_name="gemini-2.5-flash-image",
         input_price_usd=Decimal("0.0000003"),
-        output_price_usd=Decimal("0.00003"),
+        output_price_usd=Decimal("0.0000015"),
         image_output=True,
+        image_output_price_usd=Decimal("0.00003"),
     )
     # Native image generation ("nano banana 2"), the latest Gemini image model.
-    # Image output is billed as output tokens (~1120 tokens per 1024x1024 image
-    # ≈ $0.067/image at $60/MTok); input (text/image) is $0.50/MTok.
+    # Google bills output at two rates: text/thinking at $3/MTok and images at
+    # $60/MTok (~1120 tokens per 1K image ≈ $0.067/image; $0.045/0.101/0.151 at
+    # 0.5K/2K/4K); input (text/image) is $0.50/MTok.
     GEMINI_3_1_FLASH_IMAGE = ModelConfig(
         provider="google",
         api_name="gemini-3.1-flash-image",
         input_price_usd=Decimal("0.0000005"),
-        output_price_usd=Decimal("0.00006"),
+        output_price_usd=Decimal("0.000003"),
         image_output=True,
+        image_output_price_usd=Decimal("0.00006"),
     )
     GEMINI_3_5_FLASH = ModelConfig(
         provider="google",

diff --git a/tee_gateway/pricing.py b/tee_gateway/pricing.py
@@ -75,9 +75,26 @@ def compute_session_cost(
         in_tok = max(0, int(usage["prompt_tokens"]))
         out_tok = max(0, int(usage["completion_tokens"]))
 
-        raw_usd = (Decimal(in_tok) * cfg.input_price_usd) + (
-            Decimal(out_tok) * cfg.output_price_usd
-        )
+        if cfg.image_output and cfg.image_output_price_usd is not None:
+            # Dual-rate output for Gemini image models: image-modality tokens are
+            # billed at image_output_price_usd, text + thinking at output_price_usd.
+            # langchain folds image+text+thinking into output_tokens and only
+            # breaks out thinking (reasoning), so bill reasoning at the text rate
+            # and the remainder (image, plus any small text caption) at the image
+            # rate. Conservative: never undercharges the image and never exceeds
+            # billing all output at the image rate (the previous behavior).
+            reasoning_tok = max(0, int(usage.get("reasoning_tokens", 0) or 0))
+            reasoning_tok = min(reasoning_tok, out_tok)
+            image_tok = out_tok - reasoning_tok
+            raw_usd = (
+                (Decimal(in_tok) * cfg.input_price_usd)
+                + (Decimal(image_tok) * cfg.image_output_price_usd)
+                + (Decimal(reasoning_tok) * cfg.output_price_usd)
+            )
+        else:
+            raw_usd = (Decimal(in_tok) * cfg.input_price_usd) + (
+                Decimal(out_tok) * cfg.output_price_usd
+            )
 
         # Native web search is billed per search unit on top of token cost.
         searches = max(0, int(web_search_count))

diff --git a/tee_gateway/test/test_image_billing.py b/tee_gateway/test/test_image_billing.py
@@ -3,10 +3,12 @@
 Gemini image-output models (e.g. ``gemini-2.5-flash-image``) bill each generated
 image as ~1290 output tokens reported in ``candidates_token_count``. Our billing
 relies on langchain-google-genai folding that field into
-``usage_metadata.output_tokens`` so the image rides the normal token-priced path
-(``output_tokens -> completion_tokens -> output_price_usd``). These tests pin
-that assumption: if a future library bump stops folding image tokens into
-``output_tokens``, or our pricing stops charging them, they fail loudly.
+``usage_metadata.output_tokens`` so the image rides the token-priced path. Google
+bills output at TWO rates, though — image-modality tokens at ``image_output_price_usd``
+and text/thinking at the cheaper ``output_price_usd`` — so billing splits the
+folded ``output_tokens`` using the ``reasoning`` count langchain breaks out:
+thinking is charged at the text rate, the rest (image + any caption) at the image
+rate. These tests pin both the langchain folding assumption and the split.
 
 No network or API key required — we construct a synthetic Gemini response object
 and inject a stub price feed.
@@ -19,6 +21,8 @@
 from google.genai.types import GenerateContentResponse
 from langchain_google_genai.chat_models import _response_to_result
 
+from tee_gateway.llm_backend import extract_usage
+from tee_gateway.model_registry import get_model_config
 from tee_gateway.price_feed import get_price_feed, set_price_feed
 from tee_gateway.pricing import compute_session_cost
 
@@ -104,33 +108,71 @@ def tearDown(self):
             set_price_feed(self._prev_feed)
 
     def _usage_dict(self, response) -> dict:
-        """Mirror how chat_controller shapes usage_metadata into the OpenAI form."""
-        um = response.generations[0].message.usage_metadata
-        return {
-            "prompt_tokens": um["input_tokens"],
-            "completion_tokens": um["output_tokens"],
-            "total_tokens": um["total_tokens"],
-        }
+        """Mirror how chat_controller bills: extract_usage carries reasoning."""
+        usage = extract_usage(response.generations[0].message)
+        assert usage is not None
+        return usage
 
-    def test_generated_image_is_charged_as_output_tokens(self):
+    def test_generated_image_is_charged_at_image_rate(self):
         resp = _response_to_result(
             _gemini_image_response(candidates_tokens=IMAGE_TOKENS)
         )
         cost = compute_session_cost(IMAGE_MODEL, self._usage_dict(resp))
 
         self.assertIsNotNone(cost)
-        # Expected raw cost: 9 input + 1290 output tokens at the registry rates.
-        from tee_gateway.model_registry import get_model_config
-
+        # No thinking: all 1290 output tokens are image-modality, billed at the
+        # image rate (NOT the cheaper text/thinking output_price_usd).
         cfg = get_model_config(IMAGE_MODEL)
+        self.assertIsNotNone(cfg.image_output_price_usd)
         expected = (Decimal(9) * cfg.input_price_usd) + (
-            Decimal(IMAGE_TOKENS) * cfg.output_price_usd
+            Decimal(IMAGE_TOKENS) * cfg.image_output_price_usd
         )
         # settled_usd rounds the OPG integer up, so it is >= raw by at most one
         # smallest unit (1e-18 USD here) — assert effectively-equal.
         self.assertAlmostEqual(cost.cost_usd, expected, places=9)
         self.assertGreater(cost.cost_opg, 0)
 
+    def test_thinking_tokens_billed_at_text_rate(self):
+        """Thinking tokens are charged at output_price_usd, image at image rate."""
+        thoughts = 800
+        resp = _response_to_result(
+            _gemini_image_response(
+                candidates_tokens=IMAGE_TOKENS, thoughts_tokens=thoughts
+            )
+        )
+        usage = self._usage_dict(resp)
+        # langchain folds thoughts into output_tokens and breaks them out.
+        self.assertEqual(usage["reasoning_tokens"], thoughts)
+
+        cost = compute_session_cost(IMAGE_MODEL, usage)
+        self.assertIsNotNone(cost)
+
+        cfg = get_model_config(IMAGE_MODEL)
+        expected = (
+            (Decimal(9) * cfg.input_price_usd)
+            + (Decimal(IMAGE_TOKENS) * cfg.image_output_price_usd)
+            + (Decimal(thoughts) * cfg.output_price_usd)
+        )
+        self.assertAlmostEqual(cost.cost_usd, expected, places=9)
+
+    def test_thinking_is_cheaper_than_billing_all_at_image_rate(self):
+        """Regression: thinking tokens must not be billed at the image rate."""
+        thoughts = 800
+        resp = _response_to_result(
+            _gemini_image_response(
+                candidates_tokens=IMAGE_TOKENS, thoughts_tokens=thoughts
+            )
+        )
+        cost = compute_session_cost(IMAGE_MODEL, self._usage_dict(resp))
+
+        cfg = get_model_config(IMAGE_MODEL)
+        # The old (buggy) behavior billed every output token at the image rate.
+        all_at_image_rate = (Decimal(9) * cfg.input_price_usd) + (
+            Decimal(IMAGE_TOKENS + thoughts) * cfg.image_output_price_usd
+        )
+        self.assertIsNotNone(cost)
+        self.assertLess(cost.cost_usd, all_at_image_rate)
+
     def test_more_images_cost_more(self):
         """Cost scales with image tokens — not a flat per-request fee."""
         one = _response_to_result(

diff --git a/tee_gateway/test/test_price_feed.py b/tee_gateway/test/test_price_feed.py
@@ -401,6 +401,9 @@ def _patch_model(
         cfg = MagicMock()
         cfg.input_price_usd = Decimal(input_price)
         cfg.output_price_usd = Decimal(output_price)
+        # Not an image-output model: keep the single-rate output path.
+        cfg.image_output = False
+        cfg.image_output_price_usd = None
         return patch("tee_gateway.pricing.get_model_config", return_value=cfg)
 
     def test_calls_get_price(self):

diff --git a/tests/test_pricing.py b/tests/test_pricing.py
@@ -238,7 +238,10 @@ def test_gemini_3_1_flash_image_resolves(self):
         cfg = get_model_config("gemini-3.1-flash-image")
         self.assertEqual(cfg.provider, "google")
         self.assertEqual(cfg.input_price_usd, Decimal("0.0000005"))
-        self.assertEqual(cfg.output_price_usd, Decimal("0.00006"))
+        # Output is dual-rate: text/thinking at output_price_usd, images at
+        # image_output_price_usd ($3 vs $60 per MTok).
+        self.assertEqual(cfg.output_price_usd, Decimal("0.000003"))
+        self.assertEqual(cfg.image_output_price_usd, Decimal("0.00006"))
         self.assertTrue(cfg.image_output)
 
     # ── xAI Grok ────────────────────────────────────────────────────────────