From 4dfd3b5cad46a9e95ae1a8ed091569d9ef263a9f Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sun, 7 Jun 2026 22:16:34 +0200 Subject: [PATCH 1/4] Enable FAL prompt cache validation Co-authored-by: OpenAI Codex --- agent/core/prompt_caching.py | 10 ++++++++-- agent/core/telemetry.py | 7 +++++++ tests/unit/test_prompt_caching.py | 5 ++++- tests/unit/test_telemetry_usage.py | 19 +++++++++++++++++++ 4 files changed, 38 insertions(+), 3 deletions(-) diff --git a/agent/core/prompt_caching.py b/agent/core/prompt_caching.py index 04304bcd..38c58068 100644 --- a/agent/core/prompt_caching.py +++ b/agent/core/prompt_caching.py @@ -1,8 +1,11 @@ """Prompt-cache helpers for HF Router FAL requests. The HF Router/OpenRouter path uses provider-native prompt caching. Anthropic -models need explicit JSON ``cache_control`` content blocks; OpenAI models cache -eligible prefixes automatically and accept routing/retention hints in the body. +models keep explicit JSON ``cache_control`` content blocks for compatibility, +and also need the top-level ``cache_control`` hint on the OpenAI-compatible HF +Router path; the explicit markers alone are accepted there but do not produce +cache writes. OpenAI models cache eligible prefixes automatically and accept +routing/retention hints in the body. Headers like ``X-OpenRouter-Cache`` control response caching, not prompt caching through this route. """ @@ -67,6 +70,9 @@ def with_prompt_cache_params( if _is_openai_gpt55(llm_params): updates["prompt_cache_key"] = stable_session_id + if _uses_explicit_cache_control(llm_params): + updates["cache_control"] = dict(_CACHE_CONTROL) + if _is_openai_gpt55(llm_params): updates["prompt_cache_retention"] = "24h" diff --git a/agent/core/telemetry.py b/agent/core/telemetry.py index f71078b7..d3ec0abf 100644 --- a/agent/core/telemetry.py +++ b/agent/core/telemetry.py @@ -60,6 +60,13 @@ def _g(name, default=0): cache_read = details.get("cached_tokens", 0) or 0 else: cache_read = getattr(details, "cached_tokens", 0) or 0 + if not cache_creation: + details = _g("prompt_tokens_details", None) + if details is not None: + if isinstance(details, dict): + cache_creation = details.get("cache_write_tokens", 0) or 0 + else: + cache_creation = getattr(details, "cache_write_tokens", 0) or 0 return { "prompt_tokens": int(prompt), diff --git a/tests/unit/test_prompt_caching.py b/tests/unit/test_prompt_caching.py index 95ec8ebf..86ab14a6 100644 --- a/tests/unit/test_prompt_caching.py +++ b/tests/unit/test_prompt_caching.py @@ -182,7 +182,10 @@ def test_prompt_cache_params_add_session_id_for_fal_router_model(): cached_params = with_prompt_cache_params(llm_params, session_id="session-1") assert cached_params is not llm_params - assert cached_params["extra_body"] == {"session_id": "session-1"} + assert cached_params["extra_body"] == { + "session_id": "session-1", + "cache_control": {"type": "ephemeral"}, + } assert "extra_body" not in llm_params diff --git a/tests/unit/test_telemetry_usage.py b/tests/unit/test_telemetry_usage.py index 4a1f3db9..dfb27bbb 100644 --- a/tests/unit/test_telemetry_usage.py +++ b/tests/unit/test_telemetry_usage.py @@ -13,6 +13,25 @@ async def send_event(self, event): self.events.append(event) +def test_extract_usage_reads_hf_router_cache_write_tokens(): + response = SimpleNamespace( + usage=SimpleNamespace( + prompt_tokens=100, + completion_tokens=10, + total_tokens=110, + prompt_tokens_details=SimpleNamespace( + cached_tokens=80, + cache_write_tokens=20, + ), + ) + ) + + usage = telemetry.extract_usage(response) + + assert usage["cache_read_tokens"] == 80 + assert usage["cache_creation_tokens"] == 20 + + @pytest.mark.asyncio async def test_record_hf_job_complete_emits_runtime_cost(monkeypatch): async def fake_catalog(): From 083d485cc1010c234b162733384a8f9e94d117b5 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sun, 7 Jun 2026 22:26:20 +0200 Subject: [PATCH 2/4] Address FAL prompt cache review Co-authored-by: OpenAI Codex --- agent/core/telemetry.py | 27 ++++++++++++--------------- tests/unit/test_prompt_caching.py | 6 ++++++ 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/agent/core/telemetry.py b/agent/core/telemetry.py index d3ec0abf..43b359c5 100644 --- a/agent/core/telemetry.py +++ b/agent/core/telemetry.py @@ -52,21 +52,18 @@ def _g(name, default=0): cache_read = _g("cache_read_input_tokens") cache_creation = _g("cache_creation_input_tokens") - - if not cache_read: - details = _g("prompt_tokens_details", None) - if details is not None: - if isinstance(details, dict): - cache_read = details.get("cached_tokens", 0) or 0 - else: - cache_read = getattr(details, "cached_tokens", 0) or 0 - if not cache_creation: - details = _g("prompt_tokens_details", None) - if details is not None: - if isinstance(details, dict): - cache_creation = details.get("cache_write_tokens", 0) or 0 - else: - cache_creation = getattr(details, "cache_write_tokens", 0) or 0 + details = _g("prompt_tokens_details", None) + + if not cache_read and details is not None: + if isinstance(details, dict): + cache_read = details.get("cached_tokens", 0) or 0 + else: + cache_read = getattr(details, "cached_tokens", 0) or 0 + if not cache_creation and details is not None: + if isinstance(details, dict): + cache_creation = details.get("cache_write_tokens", 0) or 0 + else: + cache_creation = getattr(details, "cache_write_tokens", 0) or 0 return { "prompt_tokens": int(prompt), diff --git a/tests/unit/test_prompt_caching.py b/tests/unit/test_prompt_caching.py index 86ab14a6..424b8536 100644 --- a/tests/unit/test_prompt_caching.py +++ b/tests/unit/test_prompt_caching.py @@ -189,6 +189,12 @@ def test_prompt_cache_params_add_session_id_for_fal_router_model(): assert "extra_body" not in llm_params +def test_prompt_cache_params_adds_anthropic_cache_control_without_session_id(): + cached_params = with_prompt_cache_params(_anthropic_fal_params()) + + assert cached_params["extra_body"] == {"cache_control": {"type": "ephemeral"}} + + def test_prompt_cache_params_merges_gpt55_cache_hints(): llm_params = { **_gpt55_fal_params(), From fef620e06bb36fcd62c1794fed0ad31d67094db9 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sun, 7 Jun 2026 22:32:51 +0200 Subject: [PATCH 3/4] Clarify session usage estimate copy Co-authored-by: OpenAI Codex --- frontend/src/components/UsageMeter.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/components/UsageMeter.tsx b/frontend/src/components/UsageMeter.tsx index 55f054cc..d915d651 100644 --- a/frontend/src/components/UsageMeter.tsx +++ b/frontend/src/components/UsageMeter.tsx @@ -230,7 +230,7 @@ export default function UsageMeter() { Usage - Billing window resets when you switch back to a task. + Estimated from HF account usage during this session. {error ? ( From db86503ff2042e9460d459420cd4a7b57527b407 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Sun, 7 Jun 2026 22:34:34 +0200 Subject: [PATCH 4/4] Update usage estimate copy Co-authored-by: OpenAI Codex --- frontend/src/components/UsageMeter.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/components/UsageMeter.tsx b/frontend/src/components/UsageMeter.tsx index d915d651..89a0d2f4 100644 --- a/frontend/src/components/UsageMeter.tsx +++ b/frontend/src/components/UsageMeter.tsx @@ -230,7 +230,7 @@ export default function UsageMeter() { Usage - Estimated from HF account usage during this session. + Estimated from HF account usage per session. {error ? (