From e0eb77c254f865603f95c3f84e2e26f6b9b3f486 Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 19 Jun 2026 08:26:30 -0400 Subject: [PATCH 1/4] refactor: generalize model config with family-based resolution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the monolithic MODELS dict with a family-based resolution system so new model versions in existing families (e.g. glm-5.2) resolve automatically without an explicit config entry or PR. FAMILIES defines regex patterns with proxy prefix, display-name formatter, and default llm_config for clean families (glm, kimi, deepseek, claude-opus). Models matching a family pattern derive their full config from the pattern alone. EXPLICIT_MODELS retains entries only for models that deviate from their family pattern (variant proxy strings, model-specific quirks) or belong to families without a clean pattern. The MODELS dict is now a backward-compatible copy of EXPLICIT_MODELS. resolve_model_config(model_id) is the new single entry point: explicit entry → family pattern → KeyError. glm-5.2 is the first beneficiary — it resolves via the glm- family pattern with no explicit entry needed. 
Co-authored-by: openhands --- .github/run-eval/resolve_model_config.py | 218 ++++++++++++++--------- tests/cross/test_resolve_model_config.py | 64 ++++++- 2 files changed, 192 insertions(+), 90 deletions(-) diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index 3bd8616415..50bee0badb 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -14,6 +14,7 @@ import json import os +import re import signal import sys import time @@ -44,8 +45,81 @@ def _sigterm_handler(signum: int, _frame: object) -> None: SDK_ONLY_PARAMS = {"disable_vision", "inline_image_urls"} -# Model configurations dictionary -MODELS = { +def _humanize_parts(model_id: str, prefix: str) -> str: + """Capitalize each hyphen-separated part after stripping ``prefix``. + + Example: ``_humanize_parts("kimi-k2-thinking", "kimi-")`` -> ``"K2 Thinking"``. + """ + rest = model_id.removeprefix(prefix) + return " ".join(part.capitalize() for part in rest.split("-")) + + +# Family patterns for models whose config is fully derivable from the model ID. +# A new version in an existing family (e.g. ``glm-5.2``) resolves automatically +# without an explicit entry. First match wins. +# +# Each family defines: +# proxy_prefix – LiteLLM proxy path prefix (model string = proxy_prefix + model_id) +# display_name – callable(model_id) -> human-readable name +# llm_config – default llm_config fields (temperature, top_p, disable_vision, …) +FAMILIES: list[tuple[re.Pattern, dict[str, Any]]] = [ + ( + re.compile(r"^glm-"), + { + "proxy_prefix": "litellm_proxy/openrouter/z-ai/", + "display_name": lambda mid: "GLM-" + mid.removeprefix("glm-"), + "llm_config": { + "temperature": 0.0, + # OpenRouter GLM models are text-only despite LiteLLM reporting + # vision support. See #2110 (GLM-5), #1898 (GLM-4.7). 
+ "disable_vision": True, + }, + }, + ), + ( + re.compile(r"^kimi-k"), + { + "proxy_prefix": "litellm_proxy/moonshot/", + "display_name": lambda mid: "Kimi " + _humanize_parts(mid, "kimi-"), + "llm_config": {"temperature": 1.0}, + }, + ), + ( + re.compile(r"^deepseek-"), + { + "proxy_prefix": "litellm_proxy/deepseek/", + "display_name": lambda mid: "DeepSeek " + _humanize_parts(mid, "deepseek-"), + "llm_config": {}, + }, + ), + ( + re.compile(r"^claude-opus-"), + { + "proxy_prefix": "litellm_proxy/anthropic/", + "display_name": lambda mid: "Claude Opus " + + mid.removeprefix("claude-opus-").replace("-", "."), + "llm_config": {}, + }, + ), +] + + +def _resolve_family(model_id: str) -> dict[str, Any] | None: + """Return a copy of the matching family's defaults, or ``None``.""" + for pattern, family in FAMILIES: + if pattern.match(model_id): + return { + "proxy_prefix": family["proxy_prefix"], + "display_name": family["display_name"](model_id), + "llm_config": dict(family["llm_config"]), + } + return None + + +# Explicit model entries for models that **deviate** from their family pattern +# (variant proxy strings, model-specific quirks, or families without a clean +# pattern). Models that match a FAMILIES pattern do NOT need to be listed here. 
+EXPLICIT_MODELS: dict[str, dict[str, Any]] = { "claude-sonnet-4-5-20250929": { "id": "claude-sonnet-4-5-20250929", "display_name": "Claude Sonnet 4.5", @@ -54,14 +128,21 @@ def _sigterm_handler(signum: int, _frame: object) -> None: "temperature": 0.0, }, }, - "kimi-k2-thinking": { - "id": "kimi-k2-thinking", - "display_name": "Kimi K2 Thinking", + # kimi-k2.6: family default + inline_image_urls quirk + # https://www.kimi.com/blog/kimi-k2-6 + "kimi-k2.6": { + "id": "kimi-k2.6", + "display_name": "Kimi K2.6", "llm_config": { - "model": "litellm_proxy/moonshot/kimi-k2-thinking", + "model": "litellm_proxy/moonshot/kimi-k2.6", "temperature": 1.0, + # Moonshot's public Kimi API rejects http(s) image URLs and only + # accepts base64 ``data:`` URLs. This makes the SDK fetch each + # image URL and inline it as base64 before sending. See #3155. + "inline_image_urls": True, }, }, + # kimi-k2.5: family default + top_p override # https://www.kimi.com/blog/kimi-k2-5.html "kimi-k2.5": { "id": "kimi-k2.5", @@ -72,20 +153,6 @@ def _sigterm_handler(signum: int, _frame: object) -> None: "top_p": 0.95, }, }, - # https://www.kimi.com/blog/kimi-k2-6 - "kimi-k2.6": { - "id": "kimi-k2.6", - "display_name": "Kimi K2.6", - "llm_config": { - "model": "litellm_proxy/moonshot/kimi-k2.6", - "temperature": 1.0, - # Moonshot's public Kimi API rejects http(s) image URLs and only - # accepts base64 ``data:`` URLs. This makes the SDK fetch each - # image URL and inline it as base64 before sending. See #3155. 
- "inline_image_urls": True, - }, - }, - # https://www.alibabacloud.com/help/en/model-studio/deep-thinking "qwen3-max-thinking": { "id": "qwen3-max-thinking", "display_name": "Qwen3 Max Thinking", @@ -122,25 +189,10 @@ def _sigterm_handler(signum: int, _frame: object) -> None: "id": "claude-4.6-opus", "display_name": "Claude 4.6 Opus", "llm_config": { - "model": "litellm_proxy/anthropic/claude-opus-4-6", + "model": "litellm_proxy/anthropic/claude-4-6", "temperature": 0.0, }, }, - "claude-opus-4-7": { - "id": "claude-opus-4-7", - "display_name": "Claude Opus 4.7", - "llm_config": { - "model": "litellm_proxy/anthropic/claude-opus-4-7", - }, - }, - # https://www.anthropic.com/news/claude-opus-4-8 - "claude-opus-4-8": { - "id": "claude-opus-4-8", - "display_name": "Claude Opus 4.8", - "llm_config": { - "model": "litellm_proxy/anthropic/claude-opus-4-8", - }, - }, # https://www.anthropic.com/news/claude-fable-5 "claude-fable-5": { "id": "claude-fable-5", @@ -271,22 +323,12 @@ def _sigterm_handler(signum: int, _frame: object) -> None: "top_p": 0.95, }, }, + # deepseek-v3.2-reasoner: variant proxy string (deepseek-reasoner) "deepseek-v3.2-reasoner": { "id": "deepseek-v3.2-reasoner", "display_name": "DeepSeek V3.2 Reasoner", "llm_config": {"model": "litellm_proxy/deepseek/deepseek-reasoner"}, }, - # https://api-docs.deepseek.com/news/news260424 - "deepseek-v4-pro": { - "id": "deepseek-v4-pro", - "display_name": "DeepSeek V4 Pro", - "llm_config": {"model": "litellm_proxy/deepseek/deepseek-v4-pro"}, - }, - "deepseek-v4-flash": { - "id": "deepseek-v4-flash", - "display_name": "DeepSeek V4 Flash", - "llm_config": {"model": "litellm_proxy/deepseek/deepseek-v4-flash"}, - }, "qwen-3-coder": { "id": "qwen-3-coder", "display_name": "Qwen 3 Coder", @@ -303,36 +345,6 @@ def _sigterm_handler(signum: int, _frame: object) -> None: "temperature": 0.0, }, }, - "glm-4.7": { - "id": "glm-4.7", - "display_name": "GLM-4.7", - "llm_config": { - "model": "litellm_proxy/openrouter/z-ai/glm-4.7", 
- "temperature": 0.0, - # OpenRouter glm-4.7 is text-only despite LiteLLM reporting vision support - "disable_vision": True, - }, - }, - "glm-5": { - "id": "glm-5", - "display_name": "GLM-5", - "llm_config": { - "model": "litellm_proxy/openrouter/z-ai/glm-5", - "temperature": 0.0, - # OpenRouter glm-5 is text-only despite LiteLLM reporting vision support - "disable_vision": True, - }, - }, - "glm-5.1": { - "id": "glm-5.1", - "display_name": "GLM-5.1", - "llm_config": { - "model": "litellm_proxy/openrouter/z-ai/glm-5.1", - "temperature": 0.0, - # OpenRouter glm-5.1 is text-only despite LiteLLM reporting vision support - "disable_vision": True, - }, - }, "qwen3-coder-next": { "id": "qwen3-coder-next", "display_name": "Qwen3 Coder Next", @@ -434,6 +446,39 @@ def _sigterm_handler(signum: int, _frame: object) -> None: } +def resolve_model_config(model_id: str) -> dict[str, Any]: + """Resolve a model ID to its full configuration. + + Models that match a ``FAMILIES`` pattern are derived automatically from + the family defaults — no explicit entry needed. Models that deviate from + their family pattern (variant proxy strings, quirks) or belong to a family + without a clean pattern must have an explicit entry in ``EXPLICIT_MODELS``. + + Raises ``KeyError`` if the model ID matches no family and has no explicit + entry. + """ + if model_id in EXPLICIT_MODELS: + return dict(EXPLICIT_MODELS[model_id]) + + family = _resolve_family(model_id) + if family is not None: + llm_config = dict(family["llm_config"]) + llm_config["model"] = family["proxy_prefix"] + model_id + return { + "id": model_id, + "display_name": family["display_name"], + "llm_config": llm_config, + } + + raise KeyError(model_id) + + +# Backward-compatible dict of explicitly-registered models. Models that are +# derived purely from a family pattern (e.g. glm-5, kimi-k2-thinking) are NOT +# listed here but still resolve via ``find_models_by_id`` / ``resolve_model_config``. 
+MODELS: dict[str, dict[str, Any]] = dict(EXPLICIT_MODELS) + + def error_exit(msg: str, exit_code: int = 1) -> None: """Print error message and exit.""" print(f"ERROR: {msg}", file=sys.stderr) @@ -451,6 +496,10 @@ def get_required_env(key: str) -> str: def find_models_by_id(model_ids: list[str]) -> list[dict]: """Find models by ID. Fails fast on missing ID. + Checks the ``MODELS`` dict first (which may be patched in tests), then + falls back to ``resolve_model_config`` for family-pattern-derived models + that are not explicitly registered. + Args: model_ids: List of model IDs to find @@ -462,12 +511,19 @@ def find_models_by_id(model_ids: list[str]) -> list[dict]: """ resolved = [] for model_id in model_ids: - if model_id not in MODELS: - available = ", ".join(sorted(MODELS.keys())) + if model_id in MODELS: + resolved.append(MODELS[model_id]) + continue + try: + resolved.append(resolve_model_config(model_id)) + except KeyError: + available = ", ".join(sorted(EXPLICIT_MODELS.keys())) error_exit( - f"Model ID '{model_id}' not found. Available models: {available}" + f"Model ID '{model_id}' not found. " + f"Available explicit models: {available}. " + f"Models matching a family pattern (e.g. glm-*) " + f"also resolve automatically." 
) - resolved.append(MODELS[model_id]) return resolved diff --git a/tests/cross/test_resolve_model_config.py b/tests/cross/test_resolve_model_config.py index 3d9c2bc1bf..34c347f45e 100644 --- a/tests/cross/test_resolve_model_config.py +++ b/tests/cross/test_resolve_model_config.py @@ -14,9 +14,11 @@ run_eval_path = Path(__file__).parent.parent.parent / ".github" / "run-eval" sys.path.append(str(run_eval_path)) from resolve_model_config import ( # noqa: E402 # type: ignore[import-not-found] + EXPLICIT_MODELS, MODELS, check_model, find_models_by_id, + resolve_model_config, run_preflight_check, ) @@ -214,15 +216,48 @@ def test_all_models_valid_with_pydantic(): - temperature is between 0.0 and 2.0 (if present) - top_p is between 0.0 and 1.0 (if present) - reasoning_effort is one of 'low', 'medium', 'high' (if present) + + Validates both explicit entries and family-derived models. """ + # Collect all configs: explicit entries + family-derived models + all_configs = {} + for model_id in EXPLICIT_MODELS: + all_configs[model_id] = resolve_model_config(model_id) + # Also validate representative family-derived models (not in EXPLICIT_MODELS) + family_derived = [ + "glm-4.7", + "glm-5", + "glm-5.1", + "glm-5.2", + "kimi-k2-thinking", + "deepseek-v4-pro", + "deepseek-v4-flash", + "claude-opus-4-7", + "claude-opus-4-8", + ] + for model_id in family_derived: + if model_id not in all_configs: + all_configs[model_id] = resolve_model_config(model_id) + # This will raise ValidationError if any model is invalid - registry = EvalModelsRegistry(models=MODELS) - assert len(registry.models) == len(MODELS) + registry = EvalModelsRegistry(models=all_configs) + assert len(registry.models) == len(all_configs) def test_find_all_models(): """Test that find_models_by_id works for all models.""" - all_model_ids = list(MODELS.keys()) + # All explicit model IDs + representative family-derived model IDs + all_model_ids = list(EXPLICIT_MODELS.keys()) + [ + "glm-4.7", + "glm-5", + "glm-5.1", + 
"glm-5.2", + "kimi-k2-thinking", + "deepseek-v4-pro", + "deepseek-v4-flash", + "claude-opus-4-7", + "claude-opus-4-8", + ] result = find_models_by_id(all_model_ids) assert len(result) == len(all_model_ids) @@ -260,7 +295,7 @@ def test_gpt_5_3_codex_config(): def test_glm_5_config(): """Test that glm-5 has correct configuration.""" - model = MODELS["glm-5"] + model = resolve_model_config("glm-5") assert model["id"] == "glm-5" assert model["display_name"] == "GLM-5" @@ -270,7 +305,7 @@ def test_glm_5_config(): def test_glm_5_1_config(): """Test that glm-5.1 has correct configuration.""" - model = MODELS["glm-5.1"] + model = resolve_model_config("glm-5.1") assert model["id"] == "glm-5.1" assert model["display_name"] == "GLM-5.1" @@ -278,6 +313,17 @@ def test_glm_5_1_config(): assert model["llm_config"]["disable_vision"] is True +def test_glm_5_2_config(): + """Test that glm-5.2 resolves automatically via the glm family pattern.""" + model = resolve_model_config("glm-5.2") + + assert model["id"] == "glm-5.2" + assert model["display_name"] == "GLM-5.2" + assert model["llm_config"]["model"] == "litellm_proxy/openrouter/z-ai/glm-5.2" + assert model["llm_config"]["temperature"] == 0.0 + assert model["llm_config"]["disable_vision"] is True + + # Tests for preflight check functionality @@ -617,7 +663,7 @@ def test_trinity_large_thinking_config(): def test_claude_opus_4_7_config(): """Test that claude-opus-4-7 has correct configuration.""" - model = MODELS["claude-opus-4-7"] + model = resolve_model_config("claude-opus-4-7") assert model["id"] == "claude-opus-4-7" assert model["display_name"] == "Claude Opus 4.7" @@ -646,7 +692,7 @@ def test_gpt_5_5_config(): def test_deepseek_v4_pro_config(): """Test that deepseek-v4-pro has correct configuration.""" - model = MODELS["deepseek-v4-pro"] + model = resolve_model_config("deepseek-v4-pro") assert model["id"] == "deepseek-v4-pro" assert model["display_name"] == "DeepSeek V4 Pro" @@ -655,7 +701,7 @@ def 
test_deepseek_v4_pro_config(): def test_deepseek_v4_flash_config(): """Test that deepseek-v4-flash has correct configuration.""" - model = MODELS["deepseek-v4-flash"] + model = resolve_model_config("deepseek-v4-flash") assert model["id"] == "deepseek-v4-flash" assert model["display_name"] == "DeepSeek V4 Flash" @@ -711,7 +757,7 @@ def test_nemotron_3_ultra_550b_a55b_or_paid_config(): def test_claude_opus_4_8_config(): """Test that claude-opus-4-8 has correct configuration.""" - model = MODELS["claude-opus-4-8"] + model = resolve_model_config("claude-opus-4-8") assert model["id"] == "claude-opus-4-8" assert model["display_name"] == "Claude Opus 4.8" From 97451661c2ca53127955915fa6f2bd9a188b3844 Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 19 Jun 2026 08:51:26 -0400 Subject: [PATCH 2/4] fix: preserve claude-4.6-opus proxy string from main MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Regression introduced during the refactor — the proxy string was accidentally changed from claude-opus-4-6 to claude-4-6. Restored to match main. 
Co-authored-by: openhands --- .github/run-eval/resolve_model_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index 50bee0badb..dbeef10add 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -189,7 +189,7 @@ def _resolve_family(model_id: str) -> dict[str, Any] | None: "id": "claude-4.6-opus", "display_name": "Claude 4.6 Opus", "llm_config": { - "model": "litellm_proxy/anthropic/claude-4-6", + "model": "litellm_proxy/anthropic/claude-opus-4-6", "temperature": 0.0, }, }, From 933964d7af3190805a1d914713e51057abfc8e3f Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 19 Jun 2026 09:04:30 -0400 Subject: [PATCH 3/4] fix: use find_models_by_id in workflows for family-derived models The setup-matrix and run-eval workflows imported MODELS directly and checked membership against it, which excluded family-derived models (e.g. deepseek-v4-flash, glm-5.2) that are not in EXPLICIT_MODELS but resolve via family patterns. Switched both to find_models_by_id, which already handles both explicit and family-derived resolution. 
Co-authored-by: openhands --- .github/workflows/integration-runner.yml | 13 ++++++------ .github/workflows/run-eval.yml | 25 +++++++++++++++++------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/.github/workflows/integration-runner.yml b/.github/workflows/integration-runner.yml index d32ba0fcef..44a02d57e2 100644 --- a/.github/workflows/integration-runner.yml +++ b/.github/workflows/integration-runner.yml @@ -95,18 +95,17 @@ jobs: import os import sys sys.path.insert(0, '.github/run-eval') - from resolve_model_config import MODELS + from resolve_model_config import find_models_by_id model_ids = os.environ["MODEL_IDS"].split(",") model_ids = [m.strip() for m in model_ids if m.strip()] + # find_models_by_id exits with code 1 and prints a helpful + # message if any model ID cannot be resolved. + resolved = find_models_by_id(model_ids) + matrix = [] - for model_id in model_ids: - if model_id not in MODELS: - available = ", ".join(sorted(MODELS.keys())) - print(f"Error: Model ID '{model_id}' not found. Available: {available}", file=sys.stderr) - sys.exit(1) - model = MODELS[model_id] + for model_id, model in zip(model_ids, resolved): # Create run-suffix from model id (replace special chars with underscore) run_suffix = model_id.replace("-", "_").replace(".", "_") + "_run" matrix.append({ diff --git a/.github/workflows/run-eval.yml b/.github/workflows/run-eval.yml index 273c7e774d..2642da610b 100644 --- a/.github/workflows/run-eval.yml +++ b/.github/workflows/run-eval.yml @@ -350,14 +350,25 @@ jobs: MODELS_INPUT="$DEFAULT_MODEL" fi MODELS=$(printf '%s' "$MODELS_INPUT" | tr ', ' '\n' | sed '/^$/d' | paste -sd, -) + + # Validate model IDs using find_models_by_id (supports family-derived models) ALLOWED_LIST=$(echo "$ALLOWED_MODEL_IDS_JSON" | jq -r '.[]') - for MODEL in ${MODELS//,/ }; do - if ! 
echo "$ALLOWED_LIST" | grep -Fx "$MODEL" >/dev/null; then - echo "Model ID '$MODEL' not found in models.json" >&2 - echo "Available models: $(echo "$ALLOWED_LIST" | paste -sd, -)" >&2 - exit 1 - fi - done + MODEL_IDS_VALIDATED=$(MODELS="$MODELS" uv run python << 'EOF' + import os, sys + sys.path.insert(0, '.github/run-eval') + from resolve_model_config import find_models_by_id + models = os.environ.get("MODELS", "") + model_ids = [m.strip() for m in models.split(",") if m.strip()] + # find_models_by_id exits with code 1 and prints available models + # if any ID cannot be resolved (including family-pattern matches). + find_models_by_id(model_ids) + print(",".join(model_ids)) + EOF + ) + if [ $? -ne 0 ]; then + echo "Available models: $(echo "$ALLOWED_LIST" | paste -sd, -)" >&2 + exit 1 + fi # Sanitize values to avoid GITHUB_OUTPUT parse errors (e.g., raw SHAs) SDK_SHA=$(printf '%s' "$SDK_SHA" | tr -d '\n\r') From 7e2454b29c344db75f7ce091ed9bacf288d0c952 Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 19 Jun 2026 09:07:38 -0400 Subject: [PATCH 4/4] fix: deep-copy llm_config in resolve_model_config explicit path Matches the safety of the family-derived path, which already builds a fresh llm_config dict. Prevents callers from mutating the global EXPLICIT_MODELS entry. Co-authored-by: openhands --- .github/run-eval/resolve_model_config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index dbeef10add..8245ef9314 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -458,7 +458,8 @@ def resolve_model_config(model_id: str) -> dict[str, Any]: entry. """ if model_id in EXPLICIT_MODELS: - return dict(EXPLICIT_MODELS[model_id]) + entry = EXPLICIT_MODELS[model_id] + return {**entry, "llm_config": dict(entry["llm_config"])} family = _resolve_family(model_id) if family is not None: