diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py index 4870fc387b..d9126b7807 100644 --- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py +++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py @@ -146,6 +146,30 @@ def _supports_reasoning_effort(model: str | None) -> bool: "claude-opus-4-7", "claude-opus-4-8", "claude-sonnet-4-6", + # NVIDIA Nemotron-3 Ultra (550B MoE). + # + # The markers only take effect if the upstream actually honors + # `cache_control: {"type": "ephemeral"}` and reports cache hits in + # `usage.prompt_tokens_details.cached_tokens`. The OpenRouter route via + # DeepInfra does NOT honor this today (see OpenRouter prompt-caching + # docs: NVIDIA / DeepInfra are not in the supported-provider list; + # dashboard for the model shows ~0.2% cache rate across all traffic). + # + # We allowlist the SDK side anyway because: + # 1. Sending the markers on a route that ignores them is a no-op — + # not an error — so this change is safe to ship before any infra + # work lands. + # 2. The companion infra work routes Nemotron to a provider that + # DOES honor these markers (DeepInfra direct, NVIDIA NIM direct, + # or self-hosted vLLM). On that route the SDK must be sending + # markers for caching to register, so we want this in place + # first to avoid a coordinated cross-repo release. + # + # The substring `nemotron-3-ultra` matches every form we use: + # - `litellm_proxy/nemotron-3-ultra-550b-a55b` + # - `openrouter/nvidia/nemotron-3-ultra-550b-a55b` + # - `deepinfra/nvidia/Nemotron-3-Ultra-550B-A55B` (case-insensitive) + "nemotron-3-ultra", # Do NOT add Gemini: explicit cache_control markers freeze its cache at the # static prefix and disable Google's implicit caching on the growing body # (~6-14x cost). Gemini uses implicit prefix caching instead.
diff --git a/tests/sdk/llm/test_model_features.py b/tests/sdk/llm/test_model_features.py index 139417d635..35634b072a 100644 --- a/tests/sdk/llm/test_model_features.py +++ b/tests/sdk/llm/test_model_features.py @@ -130,6 +130,20 @@ def test_extended_thinking_support(model, expected_extended_thinking): ("anthropic.claude-3-5-sonnet-20241022", True), ("anthropic.claude-3-haiku-20240307", True), ("anthropic.claude-3-opus-20240229", True), + # NVIDIA Nemotron-3 Ultra — every routing form we actually deploy. + # See the comment block on PROMPT_CACHE_MODELS for why this is + # allowlisted even before the infra route honors the markers. + ("litellm_proxy/nemotron-3-ultra-550b-a55b", True), + ("openrouter/nvidia/nemotron-3-ultra-550b-a55b", True), + # Case-insensitive: DeepInfra uses MixedCase in their model IDs. + ("deepinfra/nvidia/Nemotron-3-Ultra-550B-A55B", True), + # Negative — other Nemotron family members. We deliberately do + # NOT bulk-allowlist "nemotron"; each variant should be added + # only after its caching story has been verified separately. + # If the substring were just "nemotron", these would + # accidentally start emitting markers too. + ("nvidia/llama-nemotron-rerank-vl-1b", False), + ("nvidia/nemotron-3.5-content-safety", False), # Gemini must NOT use explicit cache_control markers: they freeze the # cache at the static prefix and disable Google's implicit caching. ("gemini-2.5-pro", False),