From a53f05306b32b5aa177c9f25e7e8a2b4debf64ce Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Thu, 11 Jun 2026 19:38:24 +0000
Subject: [PATCH] feat(llm): allowlist nemotron-3-ultra for prompt caching
 markers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add NVIDIA Nemotron-3 Ultra (550B MoE) to PROMPT_CACHE_MODELS so that
`get_features(model).supports_prompt_cache` returns True for every
deployed routing form:

  - litellm_proxy/nemotron-3-ultra-550b-a55b   (current eval route)
  - openrouter/nvidia/nemotron-3-ultra-550b-a55b
  - deepinfra/nvidia/Nemotron-3-Ultra-550B-A55B  (planned)

When this gate is True, the SDK starts attaching
`cache_control: {"type": "ephemeral"}` markers to the long stable
prefix (system prompt + tool definitions + last user/tool turn), the
same scheme that gives Sonnet its ~3x cost reduction on long agent
conversations.

## Why this is safe to ship before the infra route changes

The current OpenRouter route to Nemotron is served by DeepInfra, and on
that route cache_control markers are NOT honored (verified against
OpenRouter's official prompt-caching docs at
https://openrouter.ai/docs/guides/best-practices/prompt-caching —
NVIDIA / DeepInfra are not in the supported-provider list; the model
dashboard shows a ~0.2% global cache hit rate). On that route, sending
the markers is a silent no-op: providers that don't recognize the
field ignore it, so there are no 400s and no behavior change.

This PR ships SDK-side now so that when the companion infra change
lands (routing Nemotron through a provider that DOES honor caching —
DeepInfra direct, NVIDIA NIM direct, or self-hosted vLLM ≥0.6.5),
caching activates immediately without requiring a coordinated
two-repo release.

## Why not just match "nemotron"

Each Nemotron-family model has its own caching story and deployment
path. Bulk-matching "nemotron" would silently enable markers for
Nemotron 3.5 Content Safety, Llama Nemotron Rerank, and any future
NVIDIA-family entries — none of which have been verified. The test
suite pins this: two negative cases (`nvidia/llama-nemotron-rerank-vl-1b`,
`nvidia/nemotron-3.5-content-safety`) fail loudly if anyone broadens
the substring without re-verifying.

## Tests

`tests/sdk/llm/test_model_features.py::test_prompt_cache_support` —
5 new parametrized cases (3 positive routing-form variants, 2 negative
Nemotron-family false-positive guards). All 154 existing
test_model_features cases still pass.

Lint + pyright clean.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../openhands/sdk/llm/utils/model_features.py | 24 +++++++++++++++++++
 tests/sdk/llm/test_model_features.py          | 14 +++++++++++
 2 files changed, 38 insertions(+)

diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py
index 4870fc387b..d9126b7807 100644
--- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py
+++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py
@@ -146,6 +146,30 @@ def _supports_reasoning_effort(model: str | None) -> bool:
     "claude-opus-4-7",
     "claude-opus-4-8",
     "claude-sonnet-4-6",
+    # NVIDIA Nemotron-3 Ultra (550B MoE).
+    #
+    # The markers only take effect when the upstream actually honors
+    # `cache_control: {"type": "ephemeral"}` and reports cache hits in
+    # `usage.prompt_tokens_details.cached_tokens`. The OpenRouter route via
+    # DeepInfra does NOT honor this today (see OpenRouter prompt-caching
+    # docs: NVIDIA / DeepInfra are not in the supported-provider list;
+    # dashboard for the model shows ~0.2% cache rate across all traffic).
+    #
+    # We allowlist the SDK side anyway because:
+    #   1. Sending the markers on a route that ignores them is a no-op —
+    #      not an error — so this change is safe to ship before any infra
+    #      work lands.
+    #   2. The companion infra work routes Nemotron to a provider that
+    #      DOES honor these markers (DeepInfra direct, NVIDIA NIM direct,
+    #      or self-hosted vLLM). On that route the SDK must be sending
+    #      markers for caching to register, so we want this in place
+    #      first to avoid a coordinated cross-repo release.
+    #
+    # The substring `nemotron-3-ultra` matches every form we use:
+    #   - `litellm_proxy/nemotron-3-ultra-550b-a55b`
+    #   - `openrouter/nvidia/nemotron-3-ultra-550b-a55b`
+    #   - `deepinfra/nvidia/Nemotron-3-Ultra-550B-A55B` (case-insensitive)
+    "nemotron-3-ultra",
     # Do NOT add Gemini: explicit cache_control markers freeze its cache at the
     # static prefix and disable Google's implicit caching on the growing body
     # (~6-14x cost). Gemini uses implicit prefix caching instead.
diff --git a/tests/sdk/llm/test_model_features.py b/tests/sdk/llm/test_model_features.py
index 139417d635..35634b072a 100644
--- a/tests/sdk/llm/test_model_features.py
+++ b/tests/sdk/llm/test_model_features.py
@@ -130,6 +130,20 @@ def test_extended_thinking_support(model, expected_extended_thinking):
         ("anthropic.claude-3-5-sonnet-20241022", True),
         ("anthropic.claude-3-haiku-20240307", True),
         ("anthropic.claude-3-opus-20240229", True),
+        # NVIDIA Nemotron-3 Ultra — every routing form we actually deploy.
+        # See the comment block on PROMPT_CACHE_MODELS for why this is
+        # allowlisted even before the infra route honors the markers.
+        ("litellm_proxy/nemotron-3-ultra-550b-a55b", True),
+        ("openrouter/nvidia/nemotron-3-ultra-550b-a55b", True),
+        # Case-insensitive: DeepInfra uses MixedCase in their model IDs.
+        ("deepinfra/nvidia/Nemotron-3-Ultra-550B-A55B", True),
+        # Negative — other Nemotron family members. We deliberately do
+        # NOT bulk-allowlist "nemotron"; each variant should be added
+        # only after its caching story has been verified separately.
+        # If the substring were just "nemotron", these would
+        # accidentally start emitting markers too.
+        ("nvidia/llama-nemotron-rerank-vl-1b", False),
+        ("nvidia/nemotron-3.5-content-safety", False),
         # Gemini must NOT use explicit cache_control markers: they freeze the
         # cache at the static prefix and disable Google's implicit caching.
         ("gemini-2.5-pro", False),