Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions openhands-sdk/openhands/sdk/llm/utils/model_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,30 @@ def _supports_reasoning_effort(model: str | None) -> bool:
"claude-opus-4-7",
"claude-opus-4-8",
"claude-sonnet-4-6",
# NVIDIA Nemotron-3 Ultra (550B MoE).
#
# Whether the markers take effect depends on the upstream actually honoring
# `cache_control: {"type": "ephemeral"}` and reporting cache hits in
# `usage.prompt_tokens_details.cached_tokens`. The OpenRouter route via
# DeepInfra does NOT honor this today (see OpenRouter prompt-caching
# docs: NVIDIA / DeepInfra are not in the supported-provider list;
# dashboard for the model shows ~0.2% cache rate across all traffic).
#
# We allowlist the SDK side anyway because:
# 1. Sending the markers on a route that ignores them is a no-op —
# not an error — so this change is safe to ship before any infra
# work lands.
# 2. The companion infra work routes Nemotron to a provider that
# DOES honor these markers (DeepInfra direct, NVIDIA NIM direct,
# or self-hosted vLLM). On that route the SDK must be sending
# markers for caching to register, so we want this in place
# first to avoid a coordinated cross-repo release.
#
# The substring `nemotron-3-ultra` matches every form we use:
# - `litellm_proxy/nemotron-3-ultra-550b-a55b`
# - `openrouter/nvidia/nemotron-3-ultra-550b-a55b`
# - `deepinfra/nvidia/Nemotron-3-Ultra-550B-A55B` (case-insensitive)
"nemotron-3-ultra",
# Do NOT add Gemini: explicit cache_control markers freeze its cache at the
# static prefix and disable Google's implicit caching on the growing body
# (~6-14x cost). Gemini uses implicit prefix caching instead.
Expand Down
14 changes: 14 additions & 0 deletions tests/sdk/llm/test_model_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,20 @@ def test_extended_thinking_support(model, expected_extended_thinking):
("anthropic.claude-3-5-sonnet-20241022", True),
("anthropic.claude-3-haiku-20240307", True),
("anthropic.claude-3-opus-20240229", True),
# NVIDIA Nemotron-3 Ultra — every routing form we actually deploy.
# See the comment block on PROMPT_CACHE_MODELS for why this is
# allowlisted even before the infra route honors the markers.
("litellm_proxy/nemotron-3-ultra-550b-a55b", True),
("openrouter/nvidia/nemotron-3-ultra-550b-a55b", True),
# Case-insensitive: DeepInfra uses MixedCase in their model IDs.
("deepinfra/nvidia/Nemotron-3-Ultra-550B-A55B", True),
# Negative — other Nemotron family members. We deliberately do
# NOT bulk-allowlist "nemotron"; each variant should be added
# only after its caching story has been verified separately.
# If the substring were just "nemotron", these would
# accidentally start emitting markers too.
("nvidia/llama-nemotron-rerank-vl-1b", False),
("nvidia/nemotron-3.5-content-safety", False),
# Gemini must NOT use explicit cache_control markers: they freeze the
# cache at the static prefix and disable Google's implicit caching.
("gemini-2.5-pro", False),
Expand Down
Loading