Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions agent/core/prompt_caching.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
"""Prompt-cache helpers for HF Router FAL requests.

The HF Router/OpenRouter path uses provider-native prompt caching. Anthropic
models need explicit JSON ``cache_control`` content blocks; OpenAI models cache
eligible prefixes automatically and accept routing/retention hints in the body.
models keep explicit JSON ``cache_control`` content blocks for compatibility,
and also need the top-level ``cache_control`` hint on the OpenAI-compatible HF
Router path; the explicit markers alone are accepted there but do not produce
cache writes. OpenAI models cache eligible prefixes automatically and accept
routing/retention hints in the body.
Headers like ``X-OpenRouter-Cache`` control response caching, not prompt
caching through this route.
"""
Expand Down Expand Up @@ -67,6 +70,9 @@ def with_prompt_cache_params(
if _is_openai_gpt55(llm_params):
updates["prompt_cache_key"] = stable_session_id

if _uses_explicit_cache_control(llm_params):
updates["cache_control"] = dict(_CACHE_CONTROL)

if _is_openai_gpt55(llm_params):
updates["prompt_cache_retention"] = "24h"

Expand Down
20 changes: 12 additions & 8 deletions agent/core/telemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,18 @@ def _g(name, default=0):

cache_read = _g("cache_read_input_tokens")
cache_creation = _g("cache_creation_input_tokens")

if not cache_read:
details = _g("prompt_tokens_details", None)
if details is not None:
if isinstance(details, dict):
cache_read = details.get("cached_tokens", 0) or 0
else:
cache_read = getattr(details, "cached_tokens", 0) or 0
details = _g("prompt_tokens_details", None)

if not cache_read and details is not None:
if isinstance(details, dict):
cache_read = details.get("cached_tokens", 0) or 0
else:
cache_read = getattr(details, "cached_tokens", 0) or 0
if not cache_creation and details is not None:
if isinstance(details, dict):
cache_creation = details.get("cache_write_tokens", 0) or 0
else:
cache_creation = getattr(details, "cache_write_tokens", 0) or 0

return {
"prompt_tokens": int(prompt),
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/components/UsageMeter.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ export default function UsageMeter() {
Usage
</Typography>
<Typography variant="caption" color="text.secondary">
Billing window resets when you switch back to a task.
Estimated from HF account usage per session.
</Typography>

{error ? (
Expand Down
11 changes: 10 additions & 1 deletion tests/unit/test_prompt_caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,10 +182,19 @@ def test_prompt_cache_params_add_session_id_for_fal_router_model():
cached_params = with_prompt_cache_params(llm_params, session_id="session-1")

assert cached_params is not llm_params
assert cached_params["extra_body"] == {"session_id": "session-1"}
assert cached_params["extra_body"] == {
"session_id": "session-1",
"cache_control": {"type": "ephemeral"},
}
assert "extra_body" not in llm_params


def test_prompt_cache_params_adds_anthropic_cache_control_without_session_id():
    """Check ``extra_body`` holds only ``cache_control`` when no session id is given."""
    # No ``session_id`` argument is passed, so no ``session_id`` key is expected.
    expected_extra_body = {"cache_control": {"type": "ephemeral"}}

    result = with_prompt_cache_params(_anthropic_fal_params())

    assert result["extra_body"] == expected_extra_body


def test_prompt_cache_params_merges_gpt55_cache_hints():
llm_params = {
**_gpt55_fal_params(),
Expand Down
19 changes: 19 additions & 0 deletions tests/unit/test_telemetry_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,25 @@ async def send_event(self, event):
self.events.append(event)


def test_extract_usage_reads_hf_router_cache_write_tokens():
    """Check ``extract_usage`` reads cache counts from ``prompt_tokens_details``.

    The fake usage object carries no top-level cache counters; the counts
    exist only on the nested ``prompt_tokens_details`` object.
    """
    token_details = SimpleNamespace(cached_tokens=80, cache_write_tokens=20)
    usage_payload = SimpleNamespace(
        prompt_tokens=100,
        completion_tokens=10,
        total_tokens=110,
        prompt_tokens_details=token_details,
    )
    response = SimpleNamespace(usage=usage_payload)

    usage = telemetry.extract_usage(response)

    # ``cached_tokens`` is reported as reads, ``cache_write_tokens`` as creations.
    assert usage["cache_read_tokens"] == 80
    assert usage["cache_creation_tokens"] == 20


@pytest.mark.asyncio
async def test_record_hf_job_complete_emits_runtime_cost(monkeypatch):
async def fake_catalog():
Expand Down
Loading