From 7161a4931c6be8d4d859f4f0293ae881449db2cd Mon Sep 17 00:00:00 2001
From: Jorben <jorbenzhu@gmail.com>
Date: Thu, 11 Jun 2026 00:40:19 +0800
Subject: [PATCH 1/2] =?UTF-8?q?refactor(usage):=20=E2=99=BB=EF=B8=8F=20nor?=
 =?UTF-8?q?malize=20token=20usage=20accounting=20across=20providers?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Standardize how `input`, `cache_read`, and `cache_write` are computed
in the Anthropic and Google protocol modules so they follow a uniform
model:

- Anthropic: take final cache read/write counts from `message_delta`
  (the `message_start` values are placeholders) so the three-segment
  billable input remains non-overlapping.
- Google: subtract `cached_content_token_count` from `prompt_token_count`
  to derive the uncached `input`, matching OpenAI's convention where
  `input + cache_read == prompt_tokens`.

This allows downstream code to use a single formula for the true context
footprint of any request, exposed via the new `Usage::context_size()`
method. Comprehensive documentation now explains the per-protocol field
semantics at both the module and struct level.

Also removes the implicit `total_tokens` recomputation in `Usage::add()`
since that field carries provider-specific wire semantics and must not
be re-aggregated across requests or protocols. After `add()` the field
keeps the value the receiver held before the call.
---
 src/protocol/anthropic.rs | 16 ++++++++
 src/protocol/google.rs    | 13 +++++-
 src/types/usage.rs        | 84 +++++++++++++++++++++++++++++++++++++--
 3 files changed, 108 insertions(+), 5 deletions(-)

diff --git a/src/protocol/anthropic.rs b/src/protocol/anthropic.rs
index da03e3f..c3a8514 100644
--- a/src/protocol/anthropic.rs
+++ b/src/protocol/anthropic.rs
@@ -560,6 +560,14 @@ struct MessageDelta {
 struct MessageDeltaUsage {
     #[serde(default)]
     output_tokens: u64,
+    // Anthropic may report the final cache read / write counts in the
+    // `message_delta` event (after `message_start`). Track them so the
+    // resulting `Usage` reflects the true three-segment billable input
+    // (input + cache_read + cache_write).
+    #[serde(default)]
+    cache_read_input_tokens: u64,
+    #[serde(default)]
+    cache_creation_input_tokens: u64,
 }
 
 // ============================================================================
@@ -1474,6 +1482,14 @@ async fn run_stream(
                         }
                         if let Some(usage) = delta_data.usage {
                             output.usage.output = usage.output_tokens;
+                            // Anthropic reports the final cache read / write
+                            // counters in `message_delta` (the values from
+                            // `message_start` are placeholders). Always take
+                            // the delta's values when present so the three
+                            // segments (input / cache_read / cache_write)
+                            // remain non-overlapping and billable.
+                            output.usage.cache_read = usage.cache_read_input_tokens;
+                            output.usage.cache_write = usage.cache_creation_input_tokens;
                             output.usage.total_tokens = output.usage.input
                                 + output.usage.output
                                 + output.usage.cache_read
diff --git a/src/protocol/google.rs b/src/protocol/google.rs
index 3e4cf4c..312e1d9 100644
--- a/src/protocol/google.rs
+++ b/src/protocol/google.rs
@@ -1075,9 +1075,20 @@ async fn run_stream(
                     }
 
                     // Handle usage metadata
+                    //
+                    // Semantic alignment with other protocols (OpenAI):
+                    //   `prompt_token_count` is the *total* prompt size INCLUDING
+                    //   any cached portion, so we subtract the cached slice to
+                    //   get the uncached `input`. `cache_read` holds the cached
+                    //   slice, so `input + cache_read == prompt_token_count`.
+                    //   After this normalization, `Usage::context_size()`
+                    //   (input + output + cache_read + cache_write) returns the
+                    //   true context footprint for Google as well.
                     if let Some(ref usage) = chunk_data.usage_metadata {
                         saw_usage_metadata = true;
-                        output.usage.input = usage.prompt_token_count;
+                        output.usage.input = usage
+                            .prompt_token_count
+                            .saturating_sub(usage.cached_content_token_count);
                         output.usage.output =
                             usage.candidates_token_count + usage.thoughts_token_count;
                         output.usage.cache_read = usage.cached_content_token_count;
diff --git a/src/types/usage.rs b/src/types/usage.rs
index e0501aa..5a939a8 100644
--- a/src/types/usage.rs
+++ b/src/types/usage.rs
@@ -1,19 +1,76 @@
 //! Token usage and cost tracking.
+//!
+//! ## Field semantics across protocols
+//!
+//! Providers report cache in two distinct shapes. All protocol modules in
+//! `src/protocol/` normalize the wire values into a single internal model so
+//! downstream code can use a uniform formula:
+//!
+//! | Field        | OpenAI / Google                                       | Anthropic                              |
+//! | ------------ | ----------------------------------------------------- | -------------------------------------- |
+//! | `input`      | prompt tokens **excluding** the cached slice          | non-cached input (mutually exclusive)  |
+//! | `cache_read` | cached slice of the prompt (subset of original total) | cache hit tokens (mutually exclusive)  |
+//! | `cache_write`| always 0 (no write-side API on these providers)      | tokens written to cache this request   |
+//! | `output`     | generated tokens                                      | generated tokens                       |
+//!
+//! After normalization:
+//! - OpenAI / Google: `input + cache_read` equals the original
+//!   `prompt_tokens` / `prompt_token_count` from the wire response.
+//! - Anthropic: `input + cache_read + cache_write` is the three non-overlapping
+//!   components of billable input tokens.
+//!
+//! In **every** case the true context footprint of one request is
+//! `input + output + cache_read + cache_write` — see
+//! [`Usage::context_size`].
 
 use serde::{Deserialize, Serialize};
 
 /// Token usage information.
+///
+/// Field semantics are documented at the module level — see the table in
+/// `src/types/usage.rs` for the per-protocol meaning of each counter.
+///
+/// The recommended way to get the true context footprint of a request across
+/// all providers is [`Usage::context_size`], which always returns
+/// `input + output + cache_read + cache_write` regardless of the original
+/// protocol's accounting convention.
 #[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Default)]
 pub struct Usage {
-    /// Number of input tokens.
+    /// Number of non-cached input tokens.
+    ///
+    /// - OpenAI / Google: `prompt_tokens - cached_tokens` (cached slice
+    ///   has been split out into [`Self::cache_read`]).
+    /// - Anthropic: raw `input_tokens` (cache hits / writes are reported
+    ///   separately and do **not** overlap with this counter).
     pub input: u64,
-    /// Number of output tokens.
+    /// Number of output tokens (model-generated).
     pub output: u64,
     /// Number of cached tokens read.
+    ///
+    /// - OpenAI / Google: subset of the original prompt token total — when
+    ///   added back to [`Self::input`] it reproduces the wire `prompt_tokens`.
+    /// - Anthropic: mutually exclusive with [`Self::input`] and
+    ///   [`Self::cache_write`].
     pub cache_read: u64,
-    /// Number of tokens written to cache.
+    /// Number of tokens written to cache. Only Anthropic populates this
+    /// counter; it is always `0` for OpenAI / Google.
     pub cache_write: u64,
     /// Total tokens used.
+    ///
+    /// This field is the **wire-level** total (when the provider exposes one)
+    /// or a synthesized sum for protocols that don't emit one. Its exact
+    /// meaning depends on the source protocol:
+    ///
+    /// - OpenAI Chat Completions / Responses: `prompt_tokens + completion_tokens`.
+    ///   `cache_read` is *implicit* in `prompt_tokens` and is **not** added
+    ///   again here.
+    /// - Google Generative AI: `totalTokenCount` from the API.
+    /// - Anthropic: `input + output + cache_read + cache_write`
+    ///   (synthesized client-side).
+    ///
+    /// For a protocol-agnostic context footprint, use
+    /// [`Usage::context_size`] instead. Do not sum `total_tokens` across
+    /// different providers.
     pub total_tokens: u64,
     /// Cost breakdown.
     pub cost: UsageCost,
@@ -37,13 +94,32 @@ impl Usage {
         }
     }
 
+    /// Protocol-agnostic context footprint of the request.
+    ///
+    /// Returns `input + output + cache_read + cache_write`. After the
+    /// normalization performed in each protocol module, this is the true
+    /// "how many tokens does this exchange touch" number for **every**
+    /// provider — including the cached portion for OpenAI / Google and the
+    /// full three-segment billable input for Anthropic.
+    pub fn context_size(&self) -> u64 {
+        self.input + self.output + self.cache_read + self.cache_write
+    }
+
     /// Add another usage to this one.
+    ///
+    /// The four token counters are summed component-wise. The stored
+    /// `total_tokens` field is **left untouched**: it keeps the value
+    /// `self` already held, because it carries the wire-level semantics
+    /// of the request that populated it and those semantics differ per
+    /// protocol (wire total for OpenAI / Google, client-side sum for
+    /// Anthropic). After `add`, `total_tokens` therefore does not
+    /// describe the accumulated counters. Use
+    /// [`Usage::context_size`] for the protocol-agnostic footprint.
     pub fn add(&mut self, other: &Usage) {
         self.input += other.input;
         self.output += other.output;
         self.cache_read += other.cache_read;
         self.cache_write += other.cache_write;
-        self.total_tokens = self.input + self.output + self.cache_read + self.cache_write;
         self.cost.input += other.cost.input;
         self.cost.output += other.cost.output;
         self.cost.cache_read += other.cost.cache_read;

From a3686da96c27bbd118b09f7008307616b8993e76 Mon Sep 17 00:00:00 2001
From: Jorben <jorbenzhu@gmail.com>
Date: Thu, 11 Jun 2026 00:49:10 +0800
Subject: [PATCH 2/2] fix(usage): preserve message_start cache counters when
 message_delta omits them; update test_usage_add for new contract
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Anthropic: change MessageDeltaUsage cache fields to Option<u64> so a
  message_delta event without cache fields no longer clobbers the values
  populated from message_start.
- test_usage_add: assert the new `add` contract — total_tokens is not
  recomputed, use context_size() for the cross-protocol footprint.
---
 src/protocol/anthropic.rs | 29 ++++++++++++++++++-----------
 tests/test_types.rs       | 13 +++++++++----
 2 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/src/protocol/anthropic.rs b/src/protocol/anthropic.rs
index c3a8514..5d650b0 100644
--- a/src/protocol/anthropic.rs
+++ b/src/protocol/anthropic.rs
@@ -561,13 +561,13 @@ struct MessageDeltaUsage {
     #[serde(default)]
     output_tokens: u64,
     // Anthropic may report the final cache read / write counts in the
-    // `message_delta` event (after `message_start`). Track them so the
-    // resulting `Usage` reflects the true three-segment billable input
-    // (input + cache_read + cache_write).
+    // `message_delta` event (after `message_start`). The values are
+    // optional: when Anthropic omits the field the `message_start` values
+    // are authoritative, and we must NOT clobber them with `0`.
     #[serde(default)]
-    cache_read_input_tokens: u64,
+    cache_read_input_tokens: Option<u64>,
     #[serde(default)]
-    cache_creation_input_tokens: u64,
+    cache_creation_input_tokens: Option<u64>,
 }
 
 // ============================================================================
@@ -1484,12 +1484,19 @@ async fn run_stream(
                             output.usage.output = usage.output_tokens;
                             // Anthropic reports the final cache read / write
                             // counters in `message_delta` (the values from
-                            // `message_start` are placeholders). Always take
-                            // the delta's values when present so the three
-                            // segments (input / cache_read / cache_write)
-                            // remain non-overlapping and billable.
-                            output.usage.cache_read = usage.cache_read_input_tokens;
-                            output.usage.cache_write = usage.cache_creation_input_tokens;
+                            // `message_start` may be provisional). Only override
+                            // `cache_read` / `cache_write` when `message_delta`
+                            // explicitly carries the field: when it is omitted
+                            // the field deserializes to `None`, and blindly
+                            // assigning `0` would clobber the `message_start`
+                            // values and break three-segment billable
+                            // accounting.
+                            if let Some(v) = usage.cache_read_input_tokens {
+                                output.usage.cache_read = v;
+                            }
+                            if let Some(v) = usage.cache_creation_input_tokens {
+                                output.usage.cache_write = v;
+                            }
                             output.usage.total_tokens = output.usage.input
                                 + output.usage.output
                                 + output.usage.cache_read
diff --git a/tests/test_types.rs b/tests/test_types.rs
index 207d8ac..94d8a85 100644
--- a/tests/test_types.rs
+++ b/tests/test_types.rs
@@ -858,13 +858,18 @@ fn test_usage_from_tokens() {
 
 #[test]
 fn test_usage_add() {
-    let mut u1 = Usage::from_tokens(100, 200); // total_tokens = 300
-    let u2 = Usage::from_tokens(50, 100); // total_tokens = 150
+    let mut u1 = Usage::from_tokens(100, 200); // total_tokens = 300 (from from_tokens)
+    let u2 = Usage::from_tokens(50, 100); // total_tokens = 150 (from from_tokens)
     u1.add(&u2);
     assert_eq!(u1.input, 150);
     assert_eq!(u1.output, 300);
-    // total_tokens is now recomputed as input + output + cache_read + cache_write
-    assert_eq!(u1.total_tokens, 450);
+    // `total_tokens` carries the wire-level total (set by `from_tokens` /
+    // each protocol module) and is **not** recomputed by `add`: its meaning
+    // differs per protocol, so `u1` keeps the value it held before the call
+    // and `u2.total_tokens` is ignored.
+    // For a protocol-agnostic footprint, use `context_size()`.
+    assert_eq!(u1.total_tokens, 300);
+    assert_eq!(u1.context_size(), 150 + 300);
 }
 
 #[test]