From 7161a4931c6be8d4d859f4f0293ae881449db2cd Mon Sep 17 00:00:00 2001 From: Jorben Date: Thu, 11 Jun 2026 00:40:19 +0800 Subject: [PATCH 1/2] =?UTF-8?q?refactor(usage):=20=E2=99=BB=EF=B8=8F=20nor?= =?UTF-8?q?malize=20token=20usage=20accounting=20across=20providers?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Standardize how `input`, `cache_read`, and `cache_write` are computed in the Anthropic and Google protocol modules so they follow a uniform model: - Anthropic: take final cache read/write counts from `message_delta` (the `message_start` values are placeholders) so the three-segment billable input remains non-overlapping. - Google: subtract `cached_content_token_count` from `prompt_token_count` to derive the uncached `input`, matching OpenAI's convention where `input + cache_read == prompt_tokens`. This allows downstream code to use a single formula for the true context footprint of any request, exposed via the new `Usage::context_size()` method. Comprehensive documentation now explains the per-protocol field semantics at both the module and struct level. Also removes the implicit `total_tokens` recomputation in `Usage::add()` since that field carries provider-specific wire semantics and must not be re-aggregated across requests or protocols. --- src/protocol/anthropic.rs | 16 ++++++++ src/protocol/google.rs | 13 +++++- src/types/usage.rs | 84 +++++++++++++++++++++++++++++++++++++-- 3 files changed, 108 insertions(+), 5 deletions(-) diff --git a/src/protocol/anthropic.rs b/src/protocol/anthropic.rs index da03e3f..c3a8514 100644 --- a/src/protocol/anthropic.rs +++ b/src/protocol/anthropic.rs @@ -560,6 +560,14 @@ struct MessageDelta { struct MessageDeltaUsage { #[serde(default)] output_tokens: u64, + // Anthropic may report the final cache read / write counts in the + // `message_delta` event (after `message_start`). 
Track them so the + // resulting `Usage` reflects the true three-segment billable input + // (input + cache_read + cache_write). + #[serde(default)] + cache_read_input_tokens: u64, + #[serde(default)] + cache_creation_input_tokens: u64, } // ============================================================================ @@ -1474,6 +1482,14 @@ async fn run_stream( } if let Some(usage) = delta_data.usage { output.usage.output = usage.output_tokens; + // Anthropic reports the final cache read / write + // counters in `message_delta` (the values from + // `message_start` are placeholders). Always take + // the delta's values when present so the three + // segments (input / cache_read / cache_write) + // remain non-overlapping and billable. + output.usage.cache_read = usage.cache_read_input_tokens; + output.usage.cache_write = usage.cache_creation_input_tokens; output.usage.total_tokens = output.usage.input + output.usage.output + output.usage.cache_read diff --git a/src/protocol/google.rs b/src/protocol/google.rs index 3e4cf4c..312e1d9 100644 --- a/src/protocol/google.rs +++ b/src/protocol/google.rs @@ -1075,9 +1075,20 @@ async fn run_stream( } // Handle usage metadata + // + // Semantic alignment with other protocols (OpenAI): + // `prompt_token_count` is the *total* prompt size INCLUDING + // any cached portion, so we subtract the cached slice to + // get the uncached `input`. `cache_read` holds the cached + // slice, so `input + cache_read == prompt_token_count`. + // After this normalization, `Usage::context_size()` + // (input + output + cache_read + cache_write) returns the + // true context footprint for Google as well. 
if let Some(ref usage) = chunk_data.usage_metadata { saw_usage_metadata = true; - output.usage.input = usage.prompt_token_count; + output.usage.input = usage + .prompt_token_count + .saturating_sub(usage.cached_content_token_count); output.usage.output = usage.candidates_token_count + usage.thoughts_token_count; output.usage.cache_read = usage.cached_content_token_count; diff --git a/src/types/usage.rs b/src/types/usage.rs index e0501aa..5a939a8 100644 --- a/src/types/usage.rs +++ b/src/types/usage.rs @@ -1,19 +1,76 @@ //! Token usage and cost tracking. +//! +//! ## Field semantics across protocols +//! +//! Providers report cache in two distinct shapes. All protocol modules in +//! `src/protocol/` normalize the wire values into a single internal model so +//! downstream code can use a uniform formula: +//! +//! | Field | OpenAI / Google | Anthropic | +//! | ------------ | ----------------------------------------------------- | -------------------------------------- | +//! | `input` | prompt tokens **excluding** the cached slice | non-cached input (mutually exclusive) | +//! | `cache_read` | cached slice of the prompt (subset of original total) | cache hit tokens (mutually exclusive) | +//! | `cache_write`| always 0 (no write-side API on these providers) | tokens written to cache this request | +//! | `output` | generated tokens | generated tokens | +//! +//! After normalization: +//! - OpenAI / Google: `input + cache_read` equals the original +//! `prompt_tokens` / `prompt_token_count` from the wire response. +//! - Anthropic: `input + cache_read + cache_write` is the three non-overlapping +//! components of billable input tokens. +//! +//! In **every** case the true context footprint of one request is +//! `input + output + cache_read + cache_write` — see +//! [`Usage::context_size`]. use serde::{Deserialize, Serialize}; /// Token usage information. 
+/// +/// Field semantics are documented at the module level — see the table in +/// `src/types/usage.rs` for the per-protocol meaning of each counter. +/// +/// The recommended way to get the true context footprint of a request across +/// all providers is [`Usage::context_size`], which always returns +/// `input + output + cache_read + cache_write` regardless of the original +/// protocol's accounting convention. #[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Default)] pub struct Usage { - /// Number of input tokens. + /// Number of non-cached input tokens. + /// + /// - OpenAI / Google: `prompt_tokens - cached_tokens` (cached slice + /// has been split out into [`Self::cache_read`]). + /// - Anthropic: raw `input_tokens` (cache hits / writes are reported + /// separately and do **not** overlap with this counter). pub input: u64, - /// Number of output tokens. + /// Number of output tokens (model-generated). pub output: u64, /// Number of cached tokens read. + /// + /// - OpenAI / Google: subset of the original prompt token total — when + /// added back to [`Self::input`] it reproduces the wire `prompt_tokens`. + /// - Anthropic: mutually exclusive with [`Self::input`] and + /// [`Self::cache_write`]. pub cache_read: u64, - /// Number of tokens written to cache. + /// Number of tokens written to cache. Only Anthropic populates this + /// counter; it is always `0` for OpenAI / Google. pub cache_write: u64, /// Total tokens used. + /// + /// This field is the **wire-level** total (when the provider exposes one) + /// or a synthesized sum for protocols that don't emit one. Its exact + /// meaning depends on the source protocol: + /// + /// - OpenAI Chat Completions / Responses: `prompt_tokens + completion_tokens`. + /// `cache_read` is *implicit* in `prompt_tokens` and is **not** added + /// again here. + /// - Google Generative AI: `totalTokenCount` from the API. 
+ /// - Anthropic: `input + output + cache_read + cache_write` + /// (synthesized client-side). + /// + /// For a protocol-agnostic context footprint, use + /// [`Usage::context_size`] instead. Do not sum `total_tokens` across + /// different providers. pub total_tokens: u64, /// Cost breakdown. pub cost: UsageCost, @@ -37,13 +94,32 @@ impl Usage { } } + /// Protocol-agnostic context footprint of the request. + /// + /// Returns `input + output + cache_read + cache_write`. After the + /// normalization performed in each protocol module, this is the true + /// "how many tokens does this exchange touch" number for **every** + /// provider — including the cached portion for OpenAI / Google and the + /// full three-segment billable input for Anthropic. + pub fn context_size(&self) -> u64 { + self.input + self.output + self.cache_read + self.cache_write + } + /// Add another usage to this one. + /// + /// The four token counters are summed component-wise. The stored + /// `total_tokens` field is **left untouched** because it carries the + /// wire-level semantics of whichever request populated it last and + /// is not safe to recompute across protocols. As a consequence, after + /// `add` it still describes only the receiver's own request, not the + /// combined usage; it is neither summed with `other.total_tokens` nor + /// rebuilt from `input + output + cache_read + cache_write`. Use + /// [`Usage::context_size`] for the protocol-agnostic footprint. 
pub fn add(&mut self, other: &Usage) { self.input += other.input; self.output += other.output; self.cache_read += other.cache_read; self.cache_write += other.cache_write; - self.total_tokens = self.input + self.output + self.cache_read + self.cache_write; self.cost.input += other.cost.input; self.cost.output += other.cost.output; self.cost.cache_read += other.cost.cache_read; From a3686da96c27bbd118b09f7008307616b8993e76 Mon Sep 17 00:00:00 2001 From: Jorben Date: Thu, 11 Jun 2026 00:49:10 +0800 Subject: [PATCH 2/2] fix(usage): preserve message_start cache counters when message_delta omits them; update test_usage_add for new contract MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Anthropic: change MessageDeltaUsage cache fields to Option so a message_delta event without cache fields no longer clobbers the values populated from message_start. - test_usage_add: assert the new `add` contract — total_tokens is not recomputed, use context_size() for the cross-protocol footprint. --- src/protocol/anthropic.rs | 29 ++++++++++++++++++----------- tests/test_types.rs | 13 +++++++++---- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/src/protocol/anthropic.rs b/src/protocol/anthropic.rs index c3a8514..5d650b0 100644 --- a/src/protocol/anthropic.rs +++ b/src/protocol/anthropic.rs @@ -561,13 +561,13 @@ struct MessageDeltaUsage { #[serde(default)] output_tokens: u64, // Anthropic may report the final cache read / write counts in the - // `message_delta` event (after `message_start`). Track them so the - // resulting `Usage` reflects the true three-segment billable input - // (input + cache_read + cache_write). + // `message_delta` event (after `message_start`). The values are + // optional: when Anthropic omits the field the `message_start` values + // are authoritative, and we must NOT clobber them with `0`. 
 #[serde(default)] - cache_read_input_tokens: u64, + cache_read_input_tokens: Option<u64>, #[serde(default)] - cache_creation_input_tokens: u64, + cache_creation_input_tokens: Option<u64>, } // ============================================================================ @@ -1484,12 +1484,19 @@ async fn run_stream( output.usage.output = usage.output_tokens; // Anthropic reports the final cache read / write // counters in `message_delta` (the values from - // `message_start` are placeholders). Always take - // the delta's values when present so the three - // segments (input / cache_read / cache_write) - // remain non-overlapping and billable. - output.usage.cache_read = usage.cache_read_input_tokens; - output.usage.cache_write = usage.cache_creation_input_tokens; + // `message_start` are placeholders). Only override + // the three segments when `message_delta` + // explicitly carries the field — Anthropic omits + // the field when the values are unchanged, and + // blindly assigning `0` would clobber the + // `message_start` values and break three-segment + // billable accounting. 
+ if let Some(v) = usage.cache_read_input_tokens { + output.usage.cache_read = v; + } + if let Some(v) = usage.cache_creation_input_tokens { + output.usage.cache_write = v; + } output.usage.total_tokens = output.usage.input + output.usage.output + output.usage.cache_read diff --git a/tests/test_types.rs b/tests/test_types.rs index 207d8ac..94d8a85 100644 --- a/tests/test_types.rs +++ b/tests/test_types.rs @@ -858,13 +858,18 @@ fn test_usage_from_tokens() { #[test] fn test_usage_add() { - let mut u1 = Usage::from_tokens(100, 200); // total_tokens = 300 - let u2 = Usage::from_tokens(50, 100); // total_tokens = 150 + let mut u1 = Usage::from_tokens(100, 200); // total_tokens = 300 (from from_tokens) + let u2 = Usage::from_tokens(50, 100); // total_tokens = 150 (from from_tokens) u1.add(&u2); assert_eq!(u1.input, 150); assert_eq!(u1.output, 300); - // total_tokens is now recomputed as input + output + cache_read + cache_write - assert_eq!(u1.total_tokens, 450); + // `total_tokens` carries the wire-level total (set by `from_tokens` / + // each protocol module) and is **not** recomputed by `add` — its meaning + // is provider-specific, so it is not safe to re-derive across protocols; + // after `add` it still holds the receiver's own pre-merge value. + // For a protocol-agnostic footprint, use `context_size()`. + assert_eq!(u1.total_tokens, 300); + assert_eq!(u1.context_size(), 150 + 300); } #[test]