From a53f05306b32b5aa177c9f25e7e8a2b4debf64ce Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 11 Jun 2026 19:38:24 +0000 Subject: [PATCH] feat(llm): allowlist nemotron-3-ultra for prompt caching markers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add NVIDIA Nemotron-3 Ultra (550B MoE) to PROMPT_CACHE_MODELS so that `get_features(model).supports_prompt_cache` returns True for every deployed routing form: - litellm_proxy/nemotron-3-ultra-550b-a55b (current eval route) - openrouter/nvidia/nemotron-3-ultra-550b-a55b - deepinfra/nvidia/Nemotron-3-Ultra-550B-A55B (planned) When this gate is True, the SDK starts attaching `cache_control: {"type": "ephemeral"}` markers to the long stable prefix (system prompt + tool definitions + last user/tool turn), the same scheme that gives Sonnet its ~3x cost reduction on long agent conversations. ## Why this is safe to ship before the infra route changes The current OpenRouter route to Nemotron goes through DeepInfra, which does NOT honor cache_control markers (verified against OpenRouter's official prompt-caching docs at https://openrouter.ai/docs/guides/best-practices/prompt-caching — NVIDIA / DeepInfra are not in the supported-provider list; the model dashboard shows 0.2% global cache hit rate). On that route, sending the markers is a silent no-op: providers that don't recognize the field ignore it; no 400s, no behavior change. This PR ships SDK-side now so that when the companion infra change lands (routing Nemotron through a provider that DOES honor caching — DeepInfra direct, NVIDIA NIM direct, or self-hosted vLLM ≥0.6.5), caching activates immediately without requiring a coordinated two-repo release. ## Why not just match "nemotron" Each Nemotron-family model has its own caching story and deployment path. 
Bulk-matching "nemotron" would silently enable markers for Nemotron 3.5 Content Safety, Llama Nemotron Rerank, and any future NVIDIA-family entries — none of which have been verified. The test suite pins this: two negative cases (`nvidia/llama-nemotron-rerank-vl-1b`, `nvidia/nemotron-3.5-content-safety`) fail loudly if anyone broadens the substring without re-verifying. ## Tests `tests/sdk/llm/test_model_features.py::test_prompt_cache_support` — 5 new parametrized cases (3 positive routing-form variants, 2 negative Nemotron-family false-positive guards). All 154 existing test_model_features cases still pass. Lint + pyright clean. Co-authored-by: openhands --- .../openhands/sdk/llm/utils/model_features.py | 24 +++++++++++++++++++ tests/sdk/llm/test_model_features.py | 14 +++++++++++ 2 files changed, 38 insertions(+) diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py index 4870fc387b..d9126b7807 100644 --- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py +++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py @@ -146,6 +146,30 @@ def _supports_reasoning_effort(model: str | None) -> bool: "claude-opus-4-7", "claude-opus-4-8", "claude-sonnet-4-6", + # NVIDIA Nemotron-3 Ultra (550B MoE). + # + # Markers only have an effect when the upstream actually honors + # `cache_control: {"type": "ephemeral"}` and reports cache hits in + # `usage.prompt_tokens_details.cached_tokens`. The OpenRouter route via + # DeepInfra does NOT honor this today (see OpenRouter prompt-caching + # docs: NVIDIA / DeepInfra are not in the supported-provider list; + # dashboard for the model shows ~0.2% cache hit rate across all traffic). + # + # We allowlist the SDK side anyway because: + # 1. Sending the markers on a route that ignores them is a no-op — + # not an error — so this change is safe to ship before any infra + # work lands. + # 2.
The companion infra work routes Nemotron to a provider that + # DOES honor these markers (DeepInfra direct, NVIDIA NIM direct, + # or self-hosted vLLM). On that route the SDK must be sending + # markers for caching to register, so we want this in place + # first to avoid a coordinated cross-repo release. + # + # The substring `nemotron-3-ultra` matches every form we use: + # - `litellm_proxy/nemotron-3-ultra-550b-a55b` + # - `openrouter/nvidia/nemotron-3-ultra-550b-a55b` + # - `deepinfra/nvidia/Nemotron-3-Ultra-550B-A55B` (case-insensitive) + "nemotron-3-ultra", # Do NOT add Gemini: explicit cache_control markers freeze its cache at the # static prefix and disable Google's implicit caching on the growing body # (~6-14x cost). Gemini uses implicit prefix caching instead. diff --git a/tests/sdk/llm/test_model_features.py b/tests/sdk/llm/test_model_features.py index 139417d635..35634b072a 100644 --- a/tests/sdk/llm/test_model_features.py +++ b/tests/sdk/llm/test_model_features.py @@ -130,6 +130,20 @@ def test_extended_thinking_support(model, expected_extended_thinking): ("anthropic.claude-3-5-sonnet-20241022", True), ("anthropic.claude-3-haiku-20240307", True), ("anthropic.claude-3-opus-20240229", True), + # NVIDIA Nemotron-3 Ultra — every routing form we actually deploy. + # See the comment block on PROMPT_CACHE_MODELS for why this is + # allowlisted even before the infra route honors the markers. + ("litellm_proxy/nemotron-3-ultra-550b-a55b", True), + ("openrouter/nvidia/nemotron-3-ultra-550b-a55b", True), + # Case-insensitive: DeepInfra uses MixedCase in their model IDs. + ("deepinfra/nvidia/Nemotron-3-Ultra-550B-A55B", True), + # Negative — other Nemotron family members. We deliberately do + # NOT bulk-allowlist "nemotron"; each variant should be added + # only after its caching story has been verified separately. + # If the substring were just "nemotron", these would + # accidentally start emitting markers too. 
+ ("nvidia/llama-nemotron-rerank-vl-1b", False), + ("nvidia/nemotron-3.5-content-safety", False), # Gemini must NOT use explicit cache_control markers: they freeze the # cache at the static prefix and disable Google's implicit caching. ("gemini-2.5-pro", False),