OpenHands · juanmichelini · Jun 11, 2026
diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py
@@ -146,6 +146,30 @@ def _supports_reasoning_effort(model: str | None) -> bool:
     "claude-opus-4-7",
     "claude-opus-4-8",
     "claude-sonnet-4-6",
+    # NVIDIA Nemotron-3 Ultra (550B MoE).
+    #
+    # Markers only take effect when the upstream provider actually honors
+    # `cache_control: {"type": "ephemeral"}` and reports cache hits in
+    # `usage.prompt_tokens_details.cached_tokens`. The OpenRouter route via
+    # DeepInfra does NOT honor this today (see OpenRouter prompt-caching
+    # docs: NVIDIA / DeepInfra are not in the supported-provider list;
+    # dashboard for the model shows ~0.2% cache rate across all traffic).
+    #
+    # We allowlist the model on the SDK side anyway because:
+    #   1. Sending the markers on a route that ignores them is a no-op —
+    #      not an error — so this change is safe to ship before any infra
+    #      work lands.
+    #   2. The companion infra work will route Nemotron to a provider that
+    #      DOES honor these markers (DeepInfra direct, NVIDIA NIM direct,
+    #      or self-hosted vLLM). On that route the SDK must be sending
+    #      markers for caching to register, so we want this in place
+    #      first to avoid a coordinated cross-repo release.
+    #
+    # The substring `nemotron-3-ultra` matches every form we use:
+    #   - `litellm_proxy/nemotron-3-ultra-550b-a55b`
+    #   - `openrouter/nvidia/nemotron-3-ultra-550b-a55b`
+    #   - `deepinfra/nvidia/Nemotron-3-Ultra-550B-A55B` (matched case-insensitively)
+    "nemotron-3-ultra",
     # Do NOT add Gemini: explicit cache_control markers freeze its cache at the
     # static prefix and disable Google's implicit caching on the growing body
     # (~6-14x cost). Gemini uses implicit prefix caching instead.

diff --git a/tests/sdk/llm/test_model_features.py b/tests/sdk/llm/test_model_features.py
@@ -130,6 +130,20 @@ def test_extended_thinking_support(model, expected_extended_thinking):
         ("anthropic.claude-3-5-sonnet-20241022", True),
         ("anthropic.claude-3-haiku-20240307", True),
         ("anthropic.claude-3-opus-20240229", True),
+        # NVIDIA Nemotron-3 Ultra — every routing form we actually deploy.
+        # See the comment block on PROMPT_CACHE_MODELS for why this is
+        # allowlisted even before the infra route honors the markers.
+        ("litellm_proxy/nemotron-3-ultra-550b-a55b", True),
+        ("openrouter/nvidia/nemotron-3-ultra-550b-a55b", True),
+        # Case-insensitive: DeepInfra uses MixedCase in their model IDs.
+        ("deepinfra/nvidia/Nemotron-3-Ultra-550B-A55B", True),
+        # Negative — other Nemotron family members. We deliberately do
+        # NOT bulk-allowlist "nemotron"; each variant should be added
+        # only after its caching story has been verified separately.
+        # If the substring were just "nemotron", these would
+        # accidentally start emitting markers too.
+        ("nvidia/llama-nemotron-rerank-vl-1b", False),
+        ("nvidia/nemotron-3.5-content-safety", False),
         # Gemini must NOT use explicit cache_control markers: they freeze the
         # cache at the static prefix and disable Google's implicit caching.
         ("gemini-2.5-pro", False),