tiylabs · HayWolf · Jun 10, 2026 · Jun 10, 2026 · github-actions · Jun 10, 2026
diff --git a/src/protocol/anthropic.rs b/src/protocol/anthropic.rs
@@ -560,6 +560,14 @@ struct MessageDelta {
 struct MessageDeltaUsage {
     #[serde(default)]
     output_tokens: u64,
+    // Anthropic may report the final cache read / write counts in the
+    // `message_delta` event (after `message_start`). The values are
+    // optional: when Anthropic omits the field the `message_start` values
+    // are authoritative, and we must NOT clobber them with `0`.
+    #[serde(default)]
+    cache_read_input_tokens: Option<u64>,
+    #[serde(default)]
+    cache_creation_input_tokens: Option<u64>,
 }
 
 // ============================================================================
@@ -1474,6 +1482,21 @@ async fn run_stream(
                         }
                         if let Some(usage) = delta_data.usage {
                             output.usage.output = usage.output_tokens;
+                            // Anthropic may report updated cache read / write
+                            // counters in `message_delta`, superseding the
+                            // values seen in `message_start`. Only override
+                            // `cache_read` / `cache_write` when `message_delta`
+                            // explicitly carries the field — Anthropic may omit
+                            // it when the value is unchanged, and blindly
+                            // assigning `0` would clobber the `message_start`
+                            // values and break the three-segment (input +
+                            // cache_read + cache_write) billable accounting.
+                            if let Some(v) = usage.cache_read_input_tokens {
+                                output.usage.cache_read = v;
+                            }
+                            if let Some(v) = usage.cache_creation_input_tokens {
+                                output.usage.cache_write = v;
+                            }
                             output.usage.total_tokens = output.usage.input
                                 + output.usage.output
                                 + output.usage.cache_read

diff --git a/src/protocol/google.rs b/src/protocol/google.rs
@@ -1075,9 +1075,20 @@ async fn run_stream(
                     }
 
                     // Handle usage metadata
+                    //
+                    // Semantic alignment with other protocols (OpenAI):
+                    //   `prompt_token_count` is the *total* prompt size INCLUDING
+                    //   any cached portion, so we subtract the cached slice to
+                    //   get the uncached `input`. `cache_read` holds the cached
+                    //   slice, so `input + cache_read == prompt_token_count`.
+                    //   After this normalization, `Usage::context_size()`
+                    //   (input + output + cache_read + cache_write) returns the
+                    //   true context footprint for Google as well.
                     if let Some(ref usage) = chunk_data.usage_metadata {
                         saw_usage_metadata = true;
-                        output.usage.input = usage.prompt_token_count;
+                        output.usage.input = usage
+                            .prompt_token_count
+                            .saturating_sub(usage.cached_content_token_count);
                         output.usage.output =
                             usage.candidates_token_count + usage.thoughts_token_count;
                         output.usage.cache_read = usage.cached_content_token_count;

diff --git a/src/types/usage.rs b/src/types/usage.rs
@@ -1,19 +1,76 @@
 //! Token usage and cost tracking.
+//!
+//! ## Field semantics across protocols
+//!
+//! Providers report cache in two distinct shapes. All protocol modules in
+//! `src/protocol/` normalize the wire values into a single internal model so
+//! downstream code can use a uniform formula:
+//!
+//! | Field        | OpenAI / Google                                       | Anthropic                              |
+//! | ------------ | ----------------------------------------------------- | -------------------------------------- |
+//! | `input`      | prompt tokens **excluding** the cached slice          | non-cached input (mutually exclusive)  |
+//! | `cache_read` | cached slice of the prompt (subset of original total) | cache hit tokens (mutually exclusive)  |
+//! | `cache_write`| always 0 (no write-side API on these providers)      | tokens written to cache this request   |
+//! | `output`     | generated tokens                                      | generated tokens                       |
+//!
+//! After normalization:
+//! - OpenAI / Google: `input + cache_read` equals the original
+//!   `prompt_tokens` / `prompt_token_count` from the wire response.
+//! - Anthropic: `input + cache_read + cache_write` is the three non-overlapping
+//!   components of billable input tokens.
+//!
+//! In **every** case the true context footprint of one request is
+//! `input + output + cache_read + cache_write` — see
+//! [`Usage::context_size`].
 
 use serde::{Deserialize, Serialize};
 
 /// Token usage information.
+///
+/// Field semantics are documented at the module level — see the table in
+/// `src/types/usage.rs` for the per-protocol meaning of each counter.
+///
+/// The recommended way to get the true context footprint of a request across
+/// all providers is [`Usage::context_size`], which always returns
+/// `input + output + cache_read + cache_write` regardless of the original
+/// protocol's accounting convention.
 #[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Default)]
 pub struct Usage {
-    /// Number of input tokens.
+    /// Number of non-cached input tokens.
+    ///
+    /// - OpenAI / Google: `prompt_tokens - cached_tokens` (cached slice
+    ///   has been split out into [`Self::cache_read`]).
+    /// - Anthropic: raw `input_tokens` (cache hits / writes are reported
+    ///   separately and do **not** overlap with this counter).
     pub input: u64,
-    /// Number of output tokens.
+    /// Number of output tokens (model-generated).
     pub output: u64,
     /// Number of cached tokens read.
+    ///
+    /// - OpenAI / Google: subset of the original prompt token total — when
+    ///   added back to [`Self::input`] it reproduces the wire `prompt_tokens`.
+    /// - Anthropic: mutually exclusive with [`Self::input`] and
+    ///   [`Self::cache_write`].
     pub cache_read: u64,
-    /// Number of tokens written to cache.
+    /// Number of tokens written to cache. Only Anthropic populates this
+    /// counter; it is always `0` for OpenAI / Google.
     pub cache_write: u64,
     /// Total tokens used.
+    ///
+    /// This field is the **wire-level** total (when the provider exposes one)
+    /// or a synthesized sum for protocols that don't emit one. Its exact
+    /// meaning depends on the source protocol:
+    ///
+    /// - OpenAI Chat Completions / Responses: `prompt_tokens + completion_tokens`.
+    ///   `cache_read` is *implicit* in `prompt_tokens` and is **not** added
+    ///   again here.
+    /// - Google Generative AI: `totalTokenCount` from the API.
+    /// - Anthropic: `input + output + cache_read + cache_write`
+    ///   (synthesized client-side).
+    ///
+    /// For a protocol-agnostic context footprint, use
+    /// [`Usage::context_size`] instead. Do not sum `total_tokens` across
+    /// different providers.
     pub total_tokens: u64,
     /// Cost breakdown.
     pub cost: UsageCost,
@@ -37,13 +94,32 @@ impl Usage {
         }
     }
 
+    /// Protocol-agnostic context footprint of the request.
+    ///
+    /// Returns `input + output + cache_read + cache_write`. After the
+    /// normalization performed in each protocol module, this is the true
+    /// "how many tokens does this exchange touch" number for **every**
+    /// provider — including the cached portion for OpenAI / Google and the
+    /// full three-segment billable input for Anthropic.
+    pub fn context_size(&self) -> u64 {
+        self.input + self.output + self.cache_read + self.cache_write
+    }
+
     /// Add another usage to this one.
+    ///
+    /// The four token counters are summed component-wise. The stored
+    /// `total_tokens` field is **left untouched**: it keeps whatever
+    /// wire-level value `self` already carried (it does **not** absorb
+    /// `other.total_tokens`), because each provider defines that total
+    /// differently and a recomputed
+    /// `input + output + cache_read + cache_write` would silently replace
+    /// the wire value with a synthesized one. After `add`, treat
+    /// `total_tokens` as stale and use [`Usage::context_size`] instead.
     pub fn add(&mut self, other: &Usage) {
         self.input += other.input;
         self.output += other.output;
         self.cache_read += other.cache_read;
         self.cache_write += other.cache_write;
-        self.total_tokens = self.input + self.output + self.cache_read + self.cache_write;
         self.cost.input += other.cost.input;
         self.cost.output += other.cost.output;
         self.cost.cache_read += other.cost.cache_read;

diff --git a/tests/test_types.rs b/tests/test_types.rs
@@ -858,13 +858,18 @@ fn test_usage_from_tokens() {
 
 #[test]
 fn test_usage_add() {
-    let mut u1 = Usage::from_tokens(100, 200); // total_tokens = 300
-    let u2 = Usage::from_tokens(50, 100); // total_tokens = 150
+    let mut u1 = Usage::from_tokens(100, 200); // total_tokens = 300 (from from_tokens)
+    let u2 = Usage::from_tokens(50, 100); // total_tokens = 150 (from from_tokens)
     u1.add(&u2);
     assert_eq!(u1.input, 150);
     assert_eq!(u1.output, 300);
-    // total_tokens is now recomputed as input + output + cache_read + cache_write
-    assert_eq!(u1.total_tokens, 450);
+    // `total_tokens` carries the wire-level total (set by `from_tokens` /
+    // each protocol module) and is **not** touched by `add`: it keeps the
+    // value `u1` had before the call (300) and ignores `u2.total_tokens`,
+    // since wire-level totals are not comparable across protocols.
+    // For a protocol-agnostic footprint, use `context_size()`.
+    assert_eq!(u1.total_tokens, 300);
+    assert_eq!(u1.context_size(), 150 + 300);
 }
 
 #[test]