From e86b8446165ec67cdc7ecff4ae33c20a5f4e5acf Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 10 Jun 2026 17:02:24 +0000
Subject: [PATCH] feat(run-eval): add router-classified-3tier MODELS entry +
 recursive preflight

Companion change to OpenHands/benchmarks#742 (intelligent per-instance model
routing). With this PR the SDK can dispatch a router-shaped llm_config to the
evaluation pipeline; the benchmarks side already understands the
intelligent-router-v0 shape and will classify each instance and route to the
matching tier model.

Changes:
- New MODELS entry 'router-classified-3tier' (classifier=minimax-m2.7,
  tiers={kimi-k2.6, minimax-m2.7, gpt-5.5}, default iter5 routing).
- New helpers ROUTER_CONFIG_KIND and is_router_config().
- check_model() now detects router entries and recurses into each tier
  sub-model, aggregating success/failure.
- Pydantic validator in tests learns about RouterLLMConfig and the registry's
  llm_config is now 'RouterLLMConfig | LLMConfig'.
- 14 new tests covering the new entry, is_router_config, and recursive preflight.

Note: the matching OpenHands/evaluation change to
eval-job/scripts/build_matrix.py (handle no-top-level-model router entries
when deriving the GCS slug) is required for end-to-end dispatch and will be
opened separately.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/run-eval/ADDINGMODEL.md          |  27 +++
 .github/run-eval/resolve_model_config.py | 125 ++++++++++-
 tests/cross/test_resolve_model_config.py | 263 ++++++++++++++++++++++-
 3 files changed, 412 insertions(+), 3 deletions(-)

diff --git a/.github/run-eval/ADDINGMODEL.md b/.github/run-eval/ADDINGMODEL.md
index 7aa6ac0ab1..a3a62c6009 100644
--- a/.github/run-eval/ADDINGMODEL.md
+++ b/.github/run-eval/ADDINGMODEL.md
@@ -4,6 +4,33 @@
 
 This file (`resolve_model_config.py`) defines models available for evaluation. Models must be added here before they can be used in integration tests or evaluations.
 
+## Two kinds of MODELS entries
+
+`MODELS` accepts two shapes of `llm_config`:
+
+1. **Plain LLM config** — a dict with a top-level `"model": "litellm_proxy/..."`,
+   plus optional parameters (`temperature`, `top_p`, `reasoning_effort`, …).
+   This is the overwhelmingly common case and what the rest of this guide is
+   about.
+2. **Intelligent-router config** — a dict with `"kind": "intelligent-router-v0"`
+   and a `tiers` map (`model_id -> per-tier plain llm_config`). Used to dispatch
+   per-instance model selection at eval time (see
+   [`OpenHands/benchmarks#742`](https://github.com/OpenHands/benchmarks/pull/742)
+   and the `router-classified-3tier` entry below for the canonical example).
+
+The preflight check (`check_model`) detects router entries via
+`is_router_config(llm_config)` and recurses into each tier sub-model. So a
+router entry's "model is reachable" test is automatically the AND of all its
+tier sub-models' tests. If you add a new router entry, every tier sub-model
+must already be a working `litellm_proxy/...` config — you do not need to add
+anything else.
+
+The serialization to `models_json` is identical for both shapes (the whole
+`llm_config` dict is passed through as-is); the downstream
+`OpenHands/evaluation/eval-job/scripts/build_matrix.py` and the benchmark
+loader in `OpenHands/benchmarks` are responsible for handling the router
+shape end-to-end.
+
 ## Critical Rules
 
 **ONLY ADD NEW CONTENT - DO NOT MODIFY EXISTING CODE**
diff --git a/.github/run-eval/resolve_model_config.py b/.github/run-eval/resolve_model_config.py
index 3bd8616415..0f254dcf81 100755
--- a/.github/run-eval/resolve_model_config.py
+++ b/.github/run-eval/resolve_model_config.py
@@ -431,9 +431,81 @@ def _sigterm_handler(signum: int, _frame: object) -> None:
             "retry_max_wait": 120,
         },
     },
+    # Intelligent per-instance routing.
+    #
+    # This is NOT a single model. The ``llm_config`` here is an
+    # ``intelligent-router-v0`` payload understood by ``benchmarks.utils.llm_config``
+    # (added in OpenHands/benchmarks#742). When the benchmark loads this config
+    # via ``--llm-config-path``, each instance is classified once by
+    # ``classifier_model_id`` and the agent conversation is routed to the matching
+    # tier model.
+    #
+    # Default routing table (matches the iter5 classifier prompt and
+    # OpenHands/benchmarks sample config):
+    #   Frontend                    -> kimi-k2.6
+    #   Issue Resolution (other)    -> minimax-m2.7
+    #   Greenfield / Testing /
+    #     Information Gathering     -> gpt-5.5
+    #
+    # Each tier sub-model is independently validated by the recursive preflight
+    # in ``check_model`` below; the slug-derivation step in
+    # ``OpenHands/evaluation/eval-job/scripts/build_matrix.py`` falls back to
+    # the entry's ``id`` because there is no top-level ``model`` field.
+    "router-classified-3tier": {
+        "id": "router-classified-3tier",
+        "display_name": "Router (3-tier, classifier=minimax-m2.7)",
+        "llm_config": {
+            "kind": "intelligent-router-v0",
+            "classifier_model_id": "minimax-m2.7",
+            "fallback_model_id": "gpt-5.5",
+            "tiers": {
+                "kimi-k2.6": {
+                    "model": "litellm_proxy/moonshot/kimi-k2.6",
+                    "temperature": 1.0,
+                    # See ``kimi-k2.6`` entry above for why this is needed.
+                    "inline_image_urls": True,
+                },
+                "minimax-m2.7": {
+                    "model": "litellm_proxy/minimax/MiniMax-M2.7",
+                    "temperature": 1.0,
+                    "top_p": 0.95,
+                },
+                "gpt-5.5": {
+                    "model": "litellm_proxy/openai/gpt-5.5",
+                    "reasoning_effort": "high",
+                },
+            },
+            "routing": {
+                "Frontend": "kimi-k2.6",
+                "Issue Resolution (other)": "minimax-m2.7",
+                "Greenfield": "gpt-5.5",
+                "Testing": "gpt-5.5",
+                "Information Gathering": "gpt-5.5",
+            },
+            "vision_capable_model_ids": ["kimi-k2.6", "gpt-5.5"],
+        },
+    },
 }
 
 
+# Discriminator string used by ``benchmarks.utils.intelligent_routing`` to tag
+# router configs. Keep in sync with that module.
+ROUTER_CONFIG_KIND = "intelligent-router-v0"
+
+
+def is_router_config(llm_config: dict[str, Any]) -> bool:
+    """Return True if ``llm_config`` is an intelligent-router-v0 payload.
+
+    Router payloads have no top-level ``model`` field; instead they carry a
+    ``kind`` discriminator and a ``tiers`` map of model_id -> per-tier llm_config.
+    """
+    return (
+        isinstance(llm_config, dict)
+        and llm_config.get("kind") == ROUTER_CONFIG_KIND
+        and isinstance(llm_config.get("tiers"), dict)
+    )
+
+
 def error_exit(msg: str, exit_code: int = 1) -> None:
     """Print error message and exit."""
     print(f"ERROR: {msg}", file=sys.stderr)
@@ -479,6 +551,11 @@ def check_model(
 ) -> tuple[bool, str]:
     """Check a single model with a simple completion request using litellm.
 
+    Router entries (``llm_config.kind == "intelligent-router-v0"``) have no
+    single downstream model. For those we recurse into each tier sub-model
+    and report aggregate success/failure; this catches per-tier configuration
+    or proxy-provisioning issues before the actual eval run.
+
     Args:
         model_config: Model configuration dict with 'llm_config' key
         api_key: API key for authentication
@@ -491,8 +568,13 @@ def check_model(
     import litellm
 
     llm_config = model_config.get("llm_config", {})
+    display_name = model_config.get("display_name", llm_config.get("model", "unknown"))
+
+    # Router entries: recursively validate each tier sub-model.
+    if is_router_config(llm_config):
+        return _check_router_tiers(model_config, api_key, base_url, timeout)
+
     model_name = llm_config.get("model", "unknown")
-    display_name = model_config.get("display_name", model_name)
 
     try:
         # Build kwargs from llm_config, excluding 'model' and SDK-specific params
@@ -556,6 +638,47 @@ def check_model(
 test_model = check_model
 
 
+def _check_router_tiers(
+    model_config: dict[str, Any],
+    api_key: str,
+    base_url: str,
+    timeout: int,
+) -> tuple[bool, str]:
+    """Recursively validate every tier sub-model of a router entry.
+
+    Aggregates per-tier results into a single (success, multi-line message)
+    return value so that the surrounding ``run_preflight_check`` loop can keep
+    its one-line-per-entry output format.
+    """
+    llm_config = model_config["llm_config"]
+    display_name = model_config.get("display_name", "router")
+    tiers = llm_config.get("tiers", {})
+
+    if not tiers:
+        return False, f"✗ {display_name}: router has no tiers to validate"
+
+    lines = [f"  {display_name}: validating {len(tiers)} tier model(s)..."]
+    all_ok = True
+    for tier_id, tier_cfg in tiers.items():
+        sub_config = {
+            "id": tier_id,
+            "display_name": f"{display_name} :: {tier_id}",
+            "llm_config": tier_cfg,
+        }
+        ok, sub_message = check_model(sub_config, api_key, base_url, timeout)
+        lines.append(f"    {sub_message}")
+        if not ok:
+            all_ok = False
+
+    summary = (
+        f"✓ {display_name}: OK ({len(tiers)} tier(s))"
+        if all_ok
+        else f"✗ {display_name}: one or more tiers failed"
+    )
+    lines.append(summary)
+    return all_ok, "\n".join(lines)
+
+
 def _check_proxy_reachable(
     base_url: str, api_key: str | None = None, timeout: int = 10
 ) -> tuple[bool, str]:
diff --git a/tests/cross/test_resolve_model_config.py b/tests/cross/test_resolve_model_config.py
index 3d9c2bc1bf..ccd52bb7e9 100644
--- a/tests/cross/test_resolve_model_config.py
+++ b/tests/cross/test_resolve_model_config.py
@@ -15,8 +15,10 @@
 sys.path.append(str(run_eval_path))
 from resolve_model_config import (  # noqa: E402  # type: ignore[import-not-found]
     MODELS,
+    ROUTER_CONFIG_KIND,
     check_model,
     find_models_by_id,
+    is_router_config,
     run_preflight_check,
 )
 
@@ -64,12 +66,64 @@ def reasoning_effort_valid(cls, v: str | None) -> str | None:
         return v
 
 
+class RouterLLMConfig(BaseModel):
+    """Pydantic model for ``intelligent-router-v0`` configuration validation.
+
+    Router configs have no top-level ``model`` field; they carry a ``kind``
+    discriminator and a ``tiers`` map. Each tier value must itself satisfy
+    :class:`LLMConfig` (i.e. be a real downstream model config).
+    """
+
+    kind: str
+    classifier_model_id: str
+    fallback_model_id: str
+    tiers: dict[str, LLMConfig]
+    routing: dict[str, str]
+    vision_capable_model_ids: list[str] | None = None
+
+    @field_validator("kind")
+    @classmethod
+    def kind_must_be_router_v0(cls, v: str) -> str:
+        if v != ROUTER_CONFIG_KIND:
+            raise ValueError(f"router kind must be '{ROUTER_CONFIG_KIND}', got '{v}'")
+        return v
+
+    @model_validator(mode="after")
+    def references_are_consistent(self) -> "RouterLLMConfig":
+        if self.classifier_model_id not in self.tiers:
+            raise ValueError(
+                f"classifier_model_id '{self.classifier_model_id}' "
+                "must be a key in tiers"
+            )
+        if self.fallback_model_id not in self.tiers:
+            raise ValueError(
+                f"fallback_model_id '{self.fallback_model_id}' must be a key in tiers"
+            )
+        unknown = {v for v in self.routing.values() if v not in self.tiers}
+        if unknown:
+            raise ValueError(f"routing targets not present in tiers: {sorted(unknown)}")
+        if self.vision_capable_model_ids is not None:
+            stray = [
+                mid for mid in self.vision_capable_model_ids if mid not in self.tiers
+            ]
+            if stray:
+                raise ValueError(
+                    f"vision_capable_model_ids not present in tiers: {stray}"
+                )
+        return self
+
+
 class EvalModelConfig(BaseModel):
-    """Pydantic model for evaluation model configuration validation."""
+    """Pydantic model for evaluation model configuration validation.
+
+    ``llm_config`` is either a plain :class:`LLMConfig` or, when the entry
+    represents intelligent routing (``kind: intelligent-router-v0``), a
+    :class:`RouterLLMConfig`.
+    """
 
     id: str
     display_name: str
-    llm_config: LLMConfig
+    llm_config: RouterLLMConfig | LLMConfig
 
     @field_validator("id")
     @classmethod
@@ -754,3 +808,208 @@ def test_step_3_7_flash_config():
     assert model["llm_config"]["num_retries"] >= 10
     assert model["llm_config"]["retry_min_wait"] >= 15
     assert model["llm_config"]["retry_max_wait"] >= 90
+
+
+# ---------------------------------------------------------------------------
+# Tests for the intelligent-router-v0 entry (``router-classified-3tier``).
+# ---------------------------------------------------------------------------
+
+
+class TestRouterClassified3Tier:
+    """Tests for the ``router-classified-3tier`` MODELS entry."""
+
+    def test_entry_is_router_shaped(self):
+        model = MODELS["router-classified-3tier"]
+
+        assert model["id"] == "router-classified-3tier"
+        # is_router_config is the canonical predicate consumed by check_model
+        # and (via re-export) by build_matrix in OpenHands/evaluation.
+        assert is_router_config(model["llm_config"])
+        # Router payloads have NO top-level "model" — they would otherwise be
+        # mistaken for a regular LLM config by downstream tooling.
+        assert "model" not in model["llm_config"]
+
+    def test_router_references_are_internally_consistent(self):
+        """classifier/fallback IDs and all routing targets must exist in tiers."""
+        cfg = MODELS["router-classified-3tier"]["llm_config"]
+        tiers = cfg["tiers"]
+
+        assert cfg["classifier_model_id"] in tiers
+        assert cfg["fallback_model_id"] in tiers
+        for category, target in cfg["routing"].items():
+            assert target in tiers, (
+                f"routing target for '{category}' missing from tiers: {target}"
+            )
+        for vision_mid in cfg.get("vision_capable_model_ids", []):
+            assert vision_mid in tiers, (
+                f"vision_capable model id missing from tiers: {vision_mid}"
+            )
+
+    def test_tier_configs_are_real_litellm_proxy_models(self):
+        """Each tier value must itself be a valid plain LLM config."""
+        cfg = MODELS["router-classified-3tier"]["llm_config"]
+        for tier_id, tier_cfg in cfg["tiers"].items():
+            assert tier_cfg["model"].startswith("litellm_proxy/"), (
+                f"tier '{tier_id}' model must use litellm_proxy/: {tier_cfg['model']}"
+            )
+            # Pydantic LLMConfig acceptance is enforced by the registry-wide
+            # ``test_all_models_valid_with_pydantic`` test, but we additionally
+            # exercise it directly here to give a focused failure message.
+            LLMConfig.model_validate(tier_cfg)
+
+    def test_routing_table_matches_iter5_classifier_categories(self):
+        """The 5 categories from the iter5 classifier prompt must all be mapped."""
+        cfg = MODELS["router-classified-3tier"]["llm_config"]
+        expected_categories = {
+            "Frontend",
+            "Issue Resolution (other)",
+            "Greenfield",
+            "Testing",
+            "Information Gathering",
+        }
+        assert set(cfg["routing"].keys()) == expected_categories
+
+    def test_router_config_validates_with_pydantic(self):
+        """The router entry must satisfy the RouterLLMConfig validator."""
+        # This is also covered transitively by ``test_all_models_valid_with_pydantic``
+        # via ``EvalModelConfig.llm_config: RouterLLMConfig | LLMConfig``, but a
+        # direct call keeps the failure mode obvious if the union ordering ever
+        # changes.
+        cfg = MODELS["router-classified-3tier"]["llm_config"]
+        RouterLLMConfig.model_validate(cfg)
+
+
+class TestIsRouterConfig:
+    """Tests for the ``is_router_config`` predicate."""
+
+    def test_plain_llm_config_is_not_router(self):
+        assert not is_router_config({"model": "litellm_proxy/foo"})
+
+    def test_missing_kind_is_not_router(self):
+        assert not is_router_config({"tiers": {"a": {"model": "litellm_proxy/x"}}})
+
+    def test_missing_tiers_is_not_router(self):
+        assert not is_router_config({"kind": ROUTER_CONFIG_KIND})
+
+    def test_wrong_kind_is_not_router(self):
+        assert not is_router_config(
+            {"kind": "some-other-kind", "tiers": {"a": {"model": "litellm_proxy/x"}}}
+        )
+
+    def test_canonical_router_payload_is_router(self):
+        assert is_router_config(
+            {
+                "kind": ROUTER_CONFIG_KIND,
+                "tiers": {"a": {"model": "litellm_proxy/x"}},
+            }
+        )
+
+    def test_non_dict_inputs_are_not_router(self):
+        # The signature is typed as dict, but downstream code in
+        # build_matrix.py may pass arbitrary JSON values; we guard regardless.
+        assert not is_router_config(None)  # type: ignore[arg-type]
+        assert not is_router_config([])  # type: ignore[arg-type]
+        assert not is_router_config("string")  # type: ignore[arg-type]
+
+
+class TestCheckModelRouterRecursion:
+    """Tests for ``check_model``'s recursion into router tiers.
+
+    These tests exercise the real ``check_model`` code path with
+    ``litellm.completion`` mocked, the same pattern as the existing
+    ``TestTestModel`` class above.
+    """
+
+    def _router_entry(self) -> dict[str, Any]:
+        return {
+            "id": "router-test",
+            "display_name": "Router Test",
+            "llm_config": {
+                "kind": ROUTER_CONFIG_KIND,
+                "classifier_model_id": "a",
+                "fallback_model_id": "b",
+                "tiers": {
+                    "a": {"model": "litellm_proxy/provider/a"},
+                    "b": {"model": "litellm_proxy/provider/b"},
+                },
+                "routing": {"Frontend": "a", "Issue Resolution (other)": "b"},
+            },
+        }
+
+    def test_all_tiers_succeed(self):
+        """When every tier sub-model passes, the router entry passes."""
+        mock_response = MagicMock()
+        mock_response.choices = [MagicMock(message=MagicMock(content="OK"))]
+
+        with patch("litellm.completion", return_value=mock_response) as mock_completion:
+            success, message = check_model(
+                self._router_entry(), "k", "https://t", timeout=5
+            )
+
+        assert success is True
+        assert "Router Test" in message
+        assert "2 tier(s)" in message
+        # Each tier triggered exactly one litellm completion call.
+        assert mock_completion.call_count == 2
+        called_models = sorted(
+            call.kwargs["model"] for call in mock_completion.call_args_list
+        )
+        assert called_models == [
+            "litellm_proxy/provider/a",
+            "litellm_proxy/provider/b",
+        ]
+
+    def test_one_tier_failure_fails_router(self):
+        """If a single tier fails, the router entry as a whole fails."""
+        import litellm
+
+        ok_response = MagicMock()
+        ok_response.choices = [MagicMock(message=MagicMock(content="OK"))]
+
+        def side_effect(*_args, model: str, **_kwargs):
+            if model.endswith("/b"):
+                raise litellm.exceptions.NotFoundError(
+                    "no such model", llm_provider="x", model=model
+                )
+            return ok_response
+
+        with patch("litellm.completion", side_effect=side_effect):
+            success, message = check_model(
+                self._router_entry(), "k", "https://t", timeout=5
+            )
+
+        assert success is False
+        assert "one or more tiers failed" in message
+        # Diagnostic still surfaces the per-tier line.
+        assert "litellm_proxy/provider/b" in message or "not found" in message
+
+    def test_empty_tiers_fails_loudly(self):
+        """A router with an empty tiers map should fail preflight, not crash."""
+        entry = self._router_entry()
+        entry["llm_config"]["tiers"] = {}
+        # ``check_model`` short-circuits before calling litellm at all.
+        with patch("litellm.completion") as mock_completion:
+            success, message = check_model(entry, "k", "https://t", timeout=5)
+        assert success is False
+        assert "no tiers" in message
+        mock_completion.assert_not_called()
+
+    def test_tier_config_params_are_forwarded(self):
+        """Tier-level temperature/top_p/etc. must reach litellm.completion."""
+        entry = self._router_entry()
+        entry["llm_config"]["tiers"]["a"]["temperature"] = 0.5
+        entry["llm_config"]["tiers"]["a"]["top_p"] = 0.9
+
+        mock_response = MagicMock()
+        mock_response.choices = [MagicMock(message=MagicMock(content="OK"))]
+        with patch("litellm.completion", return_value=mock_response) as mock_completion:
+            check_model(entry, "k", "https://t", timeout=5)
+
+        # Find the call for tier 'a' specifically.
+        a_call = next(
+            call
+            for call in mock_completion.call_args_list
+            if call.kwargs["model"].endswith("/a")
+        )
+        assert a_call.kwargs["temperature"] == 0.5
+        assert a_call.kwargs["top_p"] == 0.9