From e86b8446165ec67cdc7ecff4ae33c20a5f4e5acf Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 10 Jun 2026 17:02:24 +0000 Subject: [PATCH] feat(run-eval): add router-classified-3tier MODELS entry + recursive preflight Companion change to OpenHands/benchmarks#742 (intelligent per-instance model routing). With this PR the SDK can dispatch a router-shaped llm_config to the evaluation pipeline; the benchmarks side already understands the intelligent-router-v0 shape and will classify each instance and route to the matching tier model. Changes: - New MODELS entry 'router-classified-3tier' (classifier=minimax-m2.7, tiers={kimi-k2.6, minimax-m2.7, gpt-5.5}, default iter5 routing). - New helpers ROUTER_CONFIG_KIND and is_router_config(). - check_model() now detects router entries and recurses into each tier sub-model, aggregating success/failure. - Pydantic validator in tests learns about RouterLLMConfig and the registry's llm_config is now 'RouterLLMConfig | LLMConfig'. - 14 new tests covering the new entry, is_router_config, and recursive preflight. Note: the matching OpenHands/evaluation change to eval-job/scripts/build_matrix.py (handle no-top-level-model router entries when deriving the GCS slug) is required for end-to-end dispatch and will be opened separately. Co-authored-by: openhands --- .github/run-eval/ADDINGMODEL.md | 27 +++ .github/run-eval/resolve_model_config.py | 125 ++++++++++- tests/cross/test_resolve_model_config.py | 263 ++++++++++++++++++++++- 3 files changed, 412 insertions(+), 3 deletions(-) diff --git a/.github/run-eval/ADDINGMODEL.md b/.github/run-eval/ADDINGMODEL.md index 7aa6ac0ab1..a3a62c6009 100644 --- a/.github/run-eval/ADDINGMODEL.md +++ b/.github/run-eval/ADDINGMODEL.md @@ -4,6 +4,33 @@ This file (`resolve_model_config.py`) defines models available for evaluation. Models must be added here before they can be used in integration tests or evaluations. +## Two kinds of MODELS entries + +`MODELS` accepts two shapes of `llm_config`: + +1. **Plain LLM config** — a dict with a top-level `"model": "litellm_proxy/..."`, + plus optional parameters (`temperature`, `top_p`, `reasoning_effort`, …). + This is the overwhelmingly common case and what the rest of this guide is + about. +2. **Intelligent-router config** — a dict with `"kind": "intelligent-router-v0"` + and a `tiers` map (`model_id -> per-tier plain llm_config`). Used to dispatch + per-instance model selection at eval time (see + [`OpenHands/benchmarks#742`](https://github.com/OpenHands/benchmarks/pull/742) + and the `router-classified-3tier` entry below for the canonical example). + +The preflight check (`check_model`) detects router entries via +`is_router_config(llm_config)` and recurses into each tier sub-model. So a +router entry's "model is reachable" test is automatically the AND of all its +tier sub-models' tests. If you add a new router entry, every tier sub-model +must already be a working `litellm_proxy/...` config — you do not need to add +anything else. + +The serialization to `models_json` is identical for both shapes (the whole +`llm_config` dict is passed through as-is); the downstream +`OpenHands/evaluation/eval-job/scripts/build_matrix.py` and the benchmark +loader in `OpenHands/benchmarks` are responsible for handling the router +shape end-to-end. + ## Critical Rules **ONLY ADD NEW CONTENT - DO NOT MODIFY EXISTING CODE** diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py index 3bd8616415..0f254dcf81 100755 --- a/.github/run-eval/resolve_model_config.py +++ b/.github/run-eval/resolve_model_config.py @@ -431,9 +431,81 @@ def _sigterm_handler(signum: int, _frame: object) -> None: "retry_max_wait": 120, }, }, + # Intelligent per-instance routing. + # + # This is NOT a single model. The ``llm_config`` here is an + # ``intelligent-router-v0`` payload understood by ``benchmarks.utils.llm_config`` + # (added in OpenHands/benchmarks#742). When the benchmark loads this config + # via ``--llm-config-path``, each instance is classified once by + # ``classifier_model_id`` and the agent conversation is routed to the matching + # tier model. + # + # Default routing table (matches the iter5 classifier prompt and + # OpenHands/benchmarks sample config): + # Frontend -> kimi-k2.6 + # Issue Resolution (other) -> minimax-m2.7 + # Greenfield / Testing / + # Information Gathering -> gpt-5.5 + # + # Each tier sub-model is independently validated by the recursive preflight + # in ``check_model`` below; the slug-derivation step in + # ``OpenHands/evaluation/eval-job/scripts/build_matrix.py`` falls back to + # the entry's ``id`` because there is no top-level ``model`` field. + "router-classified-3tier": { + "id": "router-classified-3tier", + "display_name": "Router (3-tier, classifier=minimax-m2.7)", + "llm_config": { + "kind": "intelligent-router-v0", + "classifier_model_id": "minimax-m2.7", + "fallback_model_id": "gpt-5.5", + "tiers": { + "kimi-k2.6": { + "model": "litellm_proxy/moonshot/kimi-k2.6", + "temperature": 1.0, + # See ``kimi-k2.6`` entry above for why this is needed. + "inline_image_urls": True, + }, + "minimax-m2.7": { + "model": "litellm_proxy/minimax/MiniMax-M2.7", + "temperature": 1.0, + "top_p": 0.95, + }, + "gpt-5.5": { + "model": "litellm_proxy/openai/gpt-5.5", + "reasoning_effort": "high", + }, + }, + "routing": { + "Frontend": "kimi-k2.6", + "Issue Resolution (other)": "minimax-m2.7", + "Greenfield": "gpt-5.5", + "Testing": "gpt-5.5", + "Information Gathering": "gpt-5.5", + }, + "vision_capable_model_ids": ["kimi-k2.6", "gpt-5.5"], + }, + }, } +# Discriminator string used by ``benchmarks.utils.intelligent_routing`` to tag +# router configs. Keep in sync with that module. +ROUTER_CONFIG_KIND = "intelligent-router-v0" + + +def is_router_config(llm_config: dict[str, Any]) -> bool: + """Return True if ``llm_config`` is an intelligent-router-v0 payload. + + Router payloads have no top-level ``model`` field; instead they carry a + ``kind`` discriminator and a ``tiers`` map of model_id -> per-tier llm_config. + """ + return ( + isinstance(llm_config, dict) + and llm_config.get("kind") == ROUTER_CONFIG_KIND + and isinstance(llm_config.get("tiers"), dict) + ) + + def error_exit(msg: str, exit_code: int = 1) -> None: """Print error message and exit.""" print(f"ERROR: {msg}", file=sys.stderr) @@ -479,6 +551,11 @@ def check_model( ) -> tuple[bool, str]: """Check a single model with a simple completion request using litellm. + Router entries (``llm_config.kind == "intelligent-router-v0"``) have no + single downstream model. For those we recurse into each tier sub-model + and report aggregate success/failure; this catches per-tier configuration + or proxy-provisioning issues before the actual eval run. + Args: model_config: Model configuration dict with 'llm_config' key api_key: API key for authentication @@ -491,8 +568,13 @@ def check_model( import litellm llm_config = model_config.get("llm_config", {}) + display_name = model_config.get("display_name", llm_config.get("model", "unknown")) + + # Router entries: recursively validate each tier sub-model. + if is_router_config(llm_config): + return _check_router_tiers(model_config, api_key, base_url, timeout) + model_name = llm_config.get("model", "unknown") - display_name = model_config.get("display_name", model_name) try: # Build kwargs from llm_config, excluding 'model' and SDK-specific params @@ -556,6 +638,47 @@ def check_model( test_model = check_model +def _check_router_tiers( + model_config: dict[str, Any], + api_key: str, + base_url: str, + timeout: int, +) -> tuple[bool, str]: + """Recursively validate every tier sub-model of a router entry. + + Aggregates per-tier results into a single (success, multi-line message) + return value so that the surrounding ``run_preflight_check`` loop can keep + its one-line-per-entry output format. + """ + llm_config = model_config["llm_config"] + display_name = model_config.get("display_name", "router") + tiers = llm_config.get("tiers", {}) + + if not tiers: + return False, f"✗ {display_name}: router has no tiers to validate" + + lines = [f" {display_name}: validating {len(tiers)} tier model(s)..."] + all_ok = True + for tier_id, tier_cfg in tiers.items(): + sub_config = { + "id": tier_id, + "display_name": f"{display_name} :: {tier_id}", + "llm_config": tier_cfg, + } + ok, sub_message = check_model(sub_config, api_key, base_url, timeout) + lines.append(f" {sub_message}") + if not ok: + all_ok = False + + summary = ( + f"✓ {display_name}: OK ({len(tiers)} tier(s))" + if all_ok + else f"✗ {display_name}: one or more tiers failed" + ) + lines.append(summary) + return all_ok, "\n".join(lines) + + def _check_proxy_reachable( base_url: str, api_key: str | None = None, timeout: int = 10 ) -> tuple[bool, str]: diff --git a/tests/cross/test_resolve_model_config.py b/tests/cross/test_resolve_model_config.py index 3d9c2bc1bf..ccd52bb7e9 100644 --- a/tests/cross/test_resolve_model_config.py +++ b/tests/cross/test_resolve_model_config.py @@ -15,8 +15,10 @@ sys.path.append(str(run_eval_path)) from resolve_model_config import ( # noqa: E402 # type: ignore[import-not-found] MODELS, + ROUTER_CONFIG_KIND, check_model, find_models_by_id, + is_router_config, run_preflight_check, ) @@ -64,12 +66,64 @@ def reasoning_effort_valid(cls, v: str | None) -> str | None: return v +class RouterLLMConfig(BaseModel): + """Pydantic model for ``intelligent-router-v0`` configuration validation. + + Router configs have no top-level ``model`` field; they carry a ``kind`` + discriminator and a ``tiers`` map. Each tier value must itself satisfy + :class:`LLMConfig` (i.e. be a real downstream model config). + """ + + kind: str + classifier_model_id: str + fallback_model_id: str + tiers: dict[str, LLMConfig] + routing: dict[str, str] + vision_capable_model_ids: list[str] | None = None + + @field_validator("kind") + @classmethod + def kind_must_be_router_v0(cls, v: str) -> str: + if v != ROUTER_CONFIG_KIND: + raise ValueError(f"router kind must be '{ROUTER_CONFIG_KIND}', got '{v}'") + return v + + @model_validator(mode="after") + def references_are_consistent(self) -> "RouterLLMConfig": + if self.classifier_model_id not in self.tiers: + raise ValueError( + f"classifier_model_id '{self.classifier_model_id}' " + "must be a key in tiers" + ) + if self.fallback_model_id not in self.tiers: + raise ValueError( + f"fallback_model_id '{self.fallback_model_id}' must be a key in tiers" + ) + unknown = {v for v in self.routing.values() if v not in self.tiers} + if unknown: + raise ValueError(f"routing targets not present in tiers: {sorted(unknown)}") + if self.vision_capable_model_ids is not None: + stray = [ + mid for mid in self.vision_capable_model_ids if mid not in self.tiers + ] + if stray: + raise ValueError( + f"vision_capable_model_ids not present in tiers: {stray}" + ) + return self + + class EvalModelConfig(BaseModel): - """Pydantic model for evaluation model configuration validation.""" + """Pydantic model for evaluation model configuration validation. + + ``llm_config`` is either a plain :class:`LLMConfig` or, when the entry + represents intelligent routing (``kind: intelligent-router-v0``), a + :class:`RouterLLMConfig`. + """ id: str display_name: str - llm_config: LLMConfig + llm_config: RouterLLMConfig | LLMConfig @field_validator("id") @classmethod @@ -754,3 +808,208 @@ def test_step_3_7_flash_config(): assert model["llm_config"]["num_retries"] >= 10 assert model["llm_config"]["retry_min_wait"] >= 15 assert model["llm_config"]["retry_max_wait"] >= 90 + + +# --------------------------------------------------------------------------- +# Tests for the intelligent-router-v0 entry (``router-classified-3tier``). +# --------------------------------------------------------------------------- + + +class TestRouterClassified3Tier: + """Tests for the ``router-classified-3tier`` MODELS entry.""" + + def test_entry_is_router_shaped(self): + model = MODELS["router-classified-3tier"] + + assert model["id"] == "router-classified-3tier" + # is_router_config is the canonical predicate consumed by check_model + # and (via re-export) by build_matrix in OpenHands/evaluation. + assert is_router_config(model["llm_config"]) + # Router payloads have NO top-level "model" — they would otherwise be + # mistaken for a regular LLM config by downstream tooling. + assert "model" not in model["llm_config"] + + def test_router_references_are_internally_consistent(self): + """classifier/fallback IDs and all routing targets must exist in tiers.""" + cfg = MODELS["router-classified-3tier"]["llm_config"] + tiers = cfg["tiers"] + + assert cfg["classifier_model_id"] in tiers + assert cfg["fallback_model_id"] in tiers + for category, target in cfg["routing"].items(): + assert target in tiers, ( + f"routing target for '{category}' missing from tiers: {target}" + ) + for vision_mid in cfg.get("vision_capable_model_ids", []): + assert vision_mid in tiers, ( + f"vision_capable model id missing from tiers: {vision_mid}" + ) + + def test_tier_configs_are_real_litellm_proxy_models(self): + """Each tier value must itself be a valid plain LLM config.""" + cfg = MODELS["router-classified-3tier"]["llm_config"] + for tier_id, tier_cfg in cfg["tiers"].items(): + assert tier_cfg["model"].startswith("litellm_proxy/"), ( + f"tier '{tier_id}' model must use litellm_proxy/: {tier_cfg['model']}" + ) + # Pydantic LLMConfig acceptance is enforced by the registry-wide + # ``test_all_models_valid_with_pydantic`` test, but we additionally + # exercise it directly here to give a focused failure message. + LLMConfig.model_validate(tier_cfg) + + def test_routing_table_matches_iter5_classifier_categories(self): + """The 5 categories from the iter5 classifier prompt must all be mapped.""" + cfg = MODELS["router-classified-3tier"]["llm_config"] + expected_categories = { + "Frontend", + "Issue Resolution (other)", + "Greenfield", + "Testing", + "Information Gathering", + } + assert set(cfg["routing"].keys()) == expected_categories + + def test_router_config_validates_with_pydantic(self): + """The router entry must satisfy the RouterLLMConfig validator.""" + # This is also covered transitively by ``test_all_models_valid_with_pydantic`` + # via ``EvalModelConfig.llm_config: RouterLLMConfig | LLMConfig``, but a + # direct call keeps the failure mode obvious if the union ordering ever + # changes. + cfg = MODELS["router-classified-3tier"]["llm_config"] + RouterLLMConfig.model_validate(cfg) + + +class TestIsRouterConfig: + """Tests for the ``is_router_config`` predicate.""" + + def test_plain_llm_config_is_not_router(self): + assert not is_router_config({"model": "litellm_proxy/foo"}) + + def test_missing_kind_is_not_router(self): + assert not is_router_config({"tiers": {"a": {"model": "litellm_proxy/x"}}}) + + def test_missing_tiers_is_not_router(self): + assert not is_router_config({"kind": ROUTER_CONFIG_KIND}) + + def test_wrong_kind_is_not_router(self): + assert not is_router_config( + {"kind": "some-other-kind", "tiers": {"a": {"model": "litellm_proxy/x"}}} + ) + + def test_canonical_router_payload_is_router(self): + assert is_router_config( + { + "kind": ROUTER_CONFIG_KIND, + "tiers": {"a": {"model": "litellm_proxy/x"}}, + } + ) + + def test_non_dict_inputs_are_not_router(self): + # The signature is typed as dict, but downstream code in + # build_matrix.py may pass arbitrary JSON values; we guard regardless. + assert not is_router_config(None) # type: ignore[arg-type] + assert not is_router_config([]) # type: ignore[arg-type] + assert not is_router_config("string") # type: ignore[arg-type] + + +class TestCheckModelRouterRecursion: + """Tests for ``check_model``'s recursion into router tiers. + + These tests exercise the real ``check_model`` code path with + ``litellm.completion`` mocked, the same pattern as the existing + ``TestTestModel`` class above. + """ + + def _router_entry(self) -> dict[str, Any]: + return { + "id": "router-test", + "display_name": "Router Test", + "llm_config": { + "kind": ROUTER_CONFIG_KIND, + "classifier_model_id": "a", + "fallback_model_id": "b", + "tiers": { + "a": {"model": "litellm_proxy/provider/a"}, + "b": {"model": "litellm_proxy/provider/b"}, + }, + "routing": {"Frontend": "a", "Issue Resolution (other)": "b"}, + }, + } + + def test_all_tiers_succeed(self): + """When every tier sub-model passes, the router entry passes.""" + mock_response = MagicMock() + mock_response.choices = [MagicMock(message=MagicMock(content="OK"))] + + with patch("litellm.completion", return_value=mock_response) as mock_completion: + success, message = check_model( + self._router_entry(), "k", "https://t", timeout=5 + ) + + assert success is True + assert "Router Test" in message + assert "2 tier(s)" in message + # Each tier triggered exactly one litellm completion call. + assert mock_completion.call_count == 2 + called_models = sorted( + call.kwargs["model"] for call in mock_completion.call_args_list + ) + assert called_models == [ + "litellm_proxy/provider/a", + "litellm_proxy/provider/b", + ] + + def test_one_tier_failure_fails_router(self): + """If a single tier fails, the router entry as a whole fails.""" + import litellm + + ok_response = MagicMock() + ok_response.choices = [MagicMock(message=MagicMock(content="OK"))] + + def side_effect(*_args, model: str, **_kwargs): + if model.endswith("/b"): + raise litellm.exceptions.NotFoundError( + "no such model", llm_provider="x", model=model + ) + return ok_response + + with patch("litellm.completion", side_effect=side_effect): + success, message = check_model( + self._router_entry(), "k", "https://t", timeout=5 + ) + + assert success is False + assert "one or more tiers failed" in message + # Diagnostic still surfaces the per-tier line. + assert "litellm_proxy/provider/b" in message or "not found" in message + + def test_empty_tiers_fails_loudly(self): + """A router with an empty tiers map should fail preflight, not crash.""" + entry = self._router_entry() + entry["llm_config"]["tiers"] = {} + # ``check_model`` short-circuits before calling litellm at all. + with patch("litellm.completion") as mock_completion: + success, message = check_model(entry, "k", "https://t", timeout=5) + assert success is False + assert "no tiers" in message + mock_completion.assert_not_called() + + def test_tier_config_params_are_forwarded(self): + """Tier-level temperature/top_p/etc. must reach litellm.completion.""" + entry = self._router_entry() + entry["llm_config"]["tiers"]["a"]["temperature"] = 0.5 + entry["llm_config"]["tiers"]["a"]["top_p"] = 0.9 + + mock_response = MagicMock() + mock_response.choices = [MagicMock(message=MagicMock(content="OK"))] + with patch("litellm.completion", return_value=mock_response) as mock_completion: + check_model(entry, "k", "https://t", timeout=5) + + # Find the call for tier 'a' specifically. + a_call = next( + call + for call in mock_completion.call_args_list + if call.kwargs["model"].endswith("/a") + ) + assert a_call.kwargs["temperature"] == 0.5 + assert a_call.kwargs["top_p"] == 0.9