diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py
index 6321ae78a1..0c3f7abb21 100644
--- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py
+++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py
@@ -102,6 +102,12 @@ def _normalized_supported_openai_params(model: str | None) -> frozenset[str]:
 REASONING_EFFORT_MODELS: list[str] = [
     # https://www.anthropic.com/news/claude-fable-5
     "claude-fable-5",
+    # LiteLLM recognizes the first-party "anthropic/claude-opus-4-8" id, but not
+    # the Bedrock cross-region inference ids (e.g.
+    # "bedrock/us.anthropic.claude-opus-4-8-v1:0"), which fall through to the
+    # non-reasoning branch and leak temperature/top_p. List explicitly until
+    # LiteLLM ships Bedrock metadata for this model.
+    "claude-opus-4-8",
 ]


diff --git a/tests/sdk/llm/test_chat_options.py b/tests/sdk/llm/test_chat_options.py
index 4f63e95270..091910ab3f 100644
--- a/tests/sdk/llm/test_chat_options.py
+++ b/tests/sdk/llm/test_chat_options.py
@@ -147,6 +147,32 @@ def test_claude_sonnet_4_6_strips_temp_and_top_p():
     assert "top_p" not in out


+def test_bedrock_opus_4_8_strips_temp_top_p_without_thinking_block():
+    """Bedrock cross-region claude-opus-4-8 routes through the reasoning path.
+
+    LiteLLM does not (yet) recognize the Bedrock cross-region inference id as a
+    reasoning model, so the SDK-side override must mark it as one. It must take
+    the reasoning_effort path (which strips temperature/top_p) and NOT the
+    extended-thinking path, which would inject the legacy
+    ``thinking.type=enabled`` block + ``interleaved-thinking`` header that
+    Anthropic now rejects for this model (see reverted #3427 / revert #3441).
+    """
+    llm = DummyLLM(
+        model="bedrock/us.anthropic.claude-opus-4-8-v1:0",
+        top_p=1.0,  # SDK default
+        temperature=0.0,  # Often overridden by benchmarks (e.g. SWE-bench)
+        reasoning_effort="high",
+    )
+    out = select_chat_options(llm, user_kwargs={}, has_tools=True)
+
+    assert "temperature" not in out
+    assert "top_p" not in out
+    assert out.get("reasoning_effort") == "high"
+    # Must NOT take the legacy extended-thinking path.
+    assert "thinking" not in out
+    assert "anthropic-beta" not in out.get("extra_headers", {})
+
+
 def test_extended_thinking_budget_clamped_below_max_tokens():
     """Test that thinking.budget_tokens is clamped to max_output_tokens - 1."""
     # Case 1: extended_thinking_budget exceeds max_output_tokens
diff --git a/tests/sdk/llm/test_model_features.py b/tests/sdk/llm/test_model_features.py
index ce69058f53..0aae7e6339 100644
--- a/tests/sdk/llm/test_model_features.py
+++ b/tests/sdk/llm/test_model_features.py
@@ -68,6 +68,16 @@ def test_model_matches(name, pattern, expected):
         ("claude-fable-5", True),
         ("anthropic/claude-fable-5", True),
         ("litellm_proxy/anthropic/claude-fable-5", True),
+        # claude-opus-4-8: LiteLLM recognizes the first-party id, but not the
+        # Bedrock cross-region inference ids, which must be caught by the
+        # SDK-side override so temperature/top_p are stripped before the request
+        # reaches Anthropic (which rejects temperature for this model).
+        ("claude-opus-4-8", True),
+        ("anthropic/claude-opus-4-8", True),
+        ("bedrock/us.anthropic.claude-opus-4-8-v1:0", True),
+        ("bedrock/eu.anthropic.claude-opus-4-8-v1:0", True),
+        ("bedrock/apac.anthropic.claude-opus-4-8-v1:0", True),
+        ("bedrock/global.anthropic.claude-opus-4-8-v1:0", True),
         # LiteLLM proxy with deployment path prefixes (prod/, dev/, staging/, test/)
         ("litellm_proxy/prod/claude-opus-4-5-20251101", True),
         ("litellm_proxy/dev/claude-opus-4-5", True),