Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .github/run-eval/ADDINGMODEL.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,33 @@

This file (`resolve_model_config.py`) defines models available for evaluation. Models must be added here before they can be used in integration tests or evaluations.

## Two kinds of MODELS entries

`MODELS` accepts two shapes of `llm_config`:

1. **Plain LLM config** — a dict with a top-level `"model": "litellm_proxy/..."`,
plus optional parameters (`temperature`, `top_p`, `reasoning_effort`, …).
This is the overwhelmingly common case and what the rest of this guide is
about.
2. **Intelligent-router config** — a dict with `"kind": "intelligent-router-v0"`
and a `tiers` map (`model_id -> per-tier plain llm_config`). Used to dispatch
per-instance model selection at eval time (see
[`OpenHands/benchmarks#742`](https://github.com/OpenHands/benchmarks/pull/742)
and the `router-classified-3tier` entry below for the canonical example).

The preflight check (`check_model`) detects router entries via
`is_router_config(llm_config)` and recurses into each tier sub-model. So a
router entry's "model is reachable" test is automatically the AND of all its
tier sub-models' tests. If you add a new router entry, every tier sub-model
must already be a working `litellm_proxy/...` config — you do not need to add
anything else.

The serialization to `models_json` is identical for both shapes (the whole
`llm_config` dict is passed through as-is); the downstream
`OpenHands/evaluation/eval-job/scripts/build_matrix.py` and the benchmark
loader in `OpenHands/benchmarks` are responsible for handling the router
shape end-to-end.

## Critical Rules

**ONLY ADD NEW CONTENT - DO NOT MODIFY EXISTING CODE**
Expand Down
125 changes: 124 additions & 1 deletion .github/run-eval/resolve_model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,9 +431,81 @@ def _sigterm_handler(signum: int, _frame: object) -> None:
"retry_max_wait": 120,
},
},
# Intelligent per-instance routing.
#
# This is NOT a single model. The ``llm_config`` here is an
# ``intelligent-router-v0`` payload understood by ``benchmarks.utils.llm_config``
# (added in OpenHands/benchmarks#742). When the benchmark loads this config
# via ``--llm-config-path``, each instance is classified once by
# ``classifier_model_id`` and the agent conversation is routed to the matching
# tier model.
#
# Default routing table (matches the iter5 classifier prompt and
# OpenHands/benchmarks sample config):
# Frontend -> kimi-k2.6
# Issue Resolution (other) -> minimax-m2.7
# Greenfield / Testing /
# Information Gathering -> gpt-5.5
#
# Each tier sub-model is independently validated by the recursive preflight
# in ``check_model`` below; the slug-derivation step in
# ``OpenHands/evaluation/eval-job/scripts/build_matrix.py`` falls back to
# the entry's ``id`` because there is no top-level ``model`` field.
"router-classified-3tier": {
"id": "router-classified-3tier",
"display_name": "Router (3-tier, classifier=minimax-m2.7)",
"llm_config": {
"kind": "intelligent-router-v0",
"classifier_model_id": "minimax-m2.7",
"fallback_model_id": "gpt-5.5",
"tiers": {
"kimi-k2.6": {
"model": "litellm_proxy/moonshot/kimi-k2.6",

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟠 Important: I exercised the actual resolver CLI with MODEL_IDS=router-classified-3tier against the live default proxy using the available LLM credentials. Preflight recursed correctly, but this tier failed with Invalid model name passed in model=moonshot/kimi-k2.6; running the plain MODEL_IDS=kimi-k2.6 entry failed the same way. Until the proxy/model name is provisioned or this tier is changed to a reachable model, the new router model aborts before writing GITHUB_OUTPUT, so the dispatching end is not fully ready.

Automated QA finding generated by an AI agent (OpenHands) on behalf of the requester.

"temperature": 1.0,
# See ``kimi-k2.6`` entry above for why this is needed.
"inline_image_urls": True,
},
"minimax-m2.7": {
"model": "litellm_proxy/minimax/MiniMax-M2.7",
"temperature": 1.0,
"top_p": 0.95,
},
"gpt-5.5": {
"model": "litellm_proxy/openai/gpt-5.5",
"reasoning_effort": "high",
},
},
"routing": {
"Frontend": "kimi-k2.6",
"Issue Resolution (other)": "minimax-m2.7",
"Greenfield": "gpt-5.5",
"Testing": "gpt-5.5",
"Information Gathering": "gpt-5.5",
},
"vision_capable_model_ids": ["kimi-k2.6", "gpt-5.5"],
},
},
}


# Discriminator string used by ``benchmarks.utils.intelligent_routing`` to tag
# router configs. Keep in sync with that module.
ROUTER_CONFIG_KIND = "intelligent-router-v0"


def is_router_config(llm_config: dict[str, Any]) -> bool:
"""Return True if ``llm_config`` is an intelligent-router-v0 payload.

Router payloads have no top-level ``model`` field; instead they carry a
``kind`` discriminator and a ``tiers`` map of model_id -> per-tier llm_config.
"""
return (
isinstance(llm_config, dict)
and llm_config.get("kind") == ROUTER_CONFIG_KIND
and isinstance(llm_config.get("tiers"), dict)
)


def error_exit(msg: str, exit_code: int = 1) -> None:
"""Print error message and exit."""
print(f"ERROR: {msg}", file=sys.stderr)
Expand Down Expand Up @@ -479,6 +551,11 @@ def check_model(
) -> tuple[bool, str]:
"""Check a single model with a simple completion request using litellm.

Router entries (``llm_config.kind == "intelligent-router-v0"``) have no
single downstream model. For those we recurse into each tier sub-model
and report aggregate success/failure; this catches per-tier configuration
or proxy-provisioning issues before the actual eval run.

Args:
model_config: Model configuration dict with 'llm_config' key
api_key: API key for authentication
Expand All @@ -491,8 +568,13 @@ def check_model(
import litellm

llm_config = model_config.get("llm_config", {})
display_name = model_config.get("display_name", llm_config.get("model", "unknown"))

# Router entries: recursively validate each tier sub-model.
if is_router_config(llm_config):
return _check_router_tiers(model_config, api_key, base_url, timeout)

model_name = llm_config.get("model", "unknown")
display_name = model_config.get("display_name", model_name)

try:
# Build kwargs from llm_config, excluding 'model' and SDK-specific params
Expand Down Expand Up @@ -556,6 +638,47 @@ def check_model(
test_model = check_model


def _check_router_tiers(
model_config: dict[str, Any],
api_key: str,
base_url: str,
timeout: int,
) -> tuple[bool, str]:
"""Recursively validate every tier sub-model of a router entry.

Aggregates per-tier results into a single (success, multi-line message)
return value so that the surrounding ``run_preflight_check`` loop can keep
its one-line-per-entry output format.
"""
llm_config = model_config["llm_config"]
display_name = model_config.get("display_name", "router")
tiers = llm_config.get("tiers", {})

if not tiers:
return False, f"✗ {display_name}: router has no tiers to validate"

lines = [f" {display_name}: validating {len(tiers)} tier model(s)..."]
all_ok = True
for tier_id, tier_cfg in tiers.items():
sub_config = {
"id": tier_id,
"display_name": f"{display_name} :: {tier_id}",
"llm_config": tier_cfg,
}
ok, sub_message = check_model(sub_config, api_key, base_url, timeout)
lines.append(f" {sub_message}")
if not ok:
all_ok = False

summary = (
f"✓ {display_name}: OK ({len(tiers)} tier(s))"
if all_ok
else f"✗ {display_name}: one or more tiers failed"
)
lines.append(summary)
return all_ok, "\n".join(lines)


def _check_proxy_reachable(
base_url: str, api_key: str | None = None, timeout: int = 10
) -> tuple[bool, str]:
Expand Down
Loading
Loading