OpenHands · tofarr · Jun 17, 2026 · May 17, 2026 · May 24, 2026 · May 25, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -132,6 +132,7 @@ When reviewing code, provide constructive feedback:
 - Agent-server Docker publish tags are defined centrally in `openhands-agent-server/openhands/agent_server/docker/build.py`; keep `server.yml` manifest publication derived from the emitted per-arch tags so SHA/branch/git-tag aliases stay in sync, while preserving the legacy `latest-<variant>` alias used by workspace defaults.
 - The published agent-server Docker images in `.github/workflows/server.yml` must pass `OPENHANDS_BUILD_GIT_SHA` and `OPENHANDS_BUILD_GIT_REF` as explicit `docker/build-push-action` build args; the workflow only uses `docker/build.py` for context/tag generation, so those runtime env vars are otherwise left at the Dockerfile `unknown` defaults.
 - The PyInstaller agent-server binary should copy OpenHands distribution metadata (`openhands-agent-server`, `openhands-sdk`, `openhands-tools`, `openhands-workspace`) in `agent-server.spec`, otherwise `/server_info` version lookups via `importlib.metadata` can fall back to `unknown` inside published binary images.
+- Agent-server deferred init (warm-pool / dormant mode) is driven by `Config.deferred_init` (env `OH_DEFERRED_INIT`). The `InitService` in `openhands-agent-server/openhands/agent_server/init_router.py` owns the dormant→initializing→ready transition and is registered on `app.state.init_service` only when `deferred_init=True`; the `require_initialized` dependency, added to the `/api/*` router, returns 503 while not `ready`. Bootstrap auth for `POST /api/init` uses the existing `secret_key` (`X-Init-API-Key` header) — the orchestrator already holds this key for encryption, and it is overwritten when the per-user runtime config arrives in the init body. The agent-server's 5xx exception handler rewrites `detail` on 503s, so warm-pool orchestrators should rely on the HTTP status code (not the body) when probing dormant state.
 
 
 - Auto-title generation should not re-read `ConversationState.events` from a background task triggered by a freshly received `MessageEvent`; extract message text synchronously from the incoming event and then reuse shared title helpers (`extract_message_text`, `generate_title_from_message`) to avoid persistence-order races.

diff --git a/examples/02_remote_agent_server/16_deferred_init.py b/examples/02_remote_agent_server/16_deferred_init.py
@@ -0,0 +1,192 @@
+"""Example demonstrating deferred-init (warm-pool) mode for the agent server.
+
+In warm-pool deployments, server pods are pre-warmed before a user is matched
+to one. The pod boots with ``OH_DEFERRED_INIT=true``: stateless services
+(VSCode, tool preload, etc.) start as normal, but all ``/api/*`` routes return
+503 until ``POST /api/init`` delivers the runtime configuration (credentials,
+workspace paths, session keys).
+
+The orchestrator authenticates the init call with the server's bootstrap secret
+key (``OH_SECRET_KEY`` / ``X-Init-API-Key``), which it already holds for
+encryption purposes.
+
+Lifecycle demonstrated here (numbers match the step comments and logs below):
+  0. Server starts in dormant mode.
+  1. ``GET /api/init`` reports state=dormant.
+  2. ``GET /api/conversations`` returns 503 (dormant gate is active).
+  3. ``POST /api/init`` delivers runtime config → server transitions to ready.
+  4. ``GET /api/init`` reports state=ready.
+  5. A conversation runs normally on the now-ready server.
+"""
+
+import os
+import tempfile
+import time
+from uuid import UUID
+
+import httpx
+from scripts.utils import ManagedAPIServer
+
+from openhands.sdk import get_logger
+
+
+logger = get_logger(__name__)
+
+# ── LLM config ──────────────────────────────────────────────────────────────
+
+api_key = os.getenv("LLM_API_KEY")
+assert api_key is not None, "LLM_API_KEY environment variable is not set."
+llm_model = os.getenv("LLM_MODEL", "gpt-5.5")
+llm_base_url = os.getenv("LLM_BASE_URL")
+
+# The orchestrator knows this key before the pod is matched to a user.
+# It's used to authenticate POST /api/init and as the encryption secret.
+BOOTSTRAP_SECRET_KEY = "demo-warm-pool-bootstrap-key-32b!"
+
+# ── Server lifecycle ─────────────────────────────────────────────────────────
+
+with ManagedAPIServer(
+    port=8003,
+    extra_env={
+        "OH_DEFERRED_INIT": "true",
+        "OH_SECRET_KEY": BOOTSTRAP_SECRET_KEY,
+        "TMUX_TMPDIR": "/tmp/oh-tmux-deferred",
+    },
+) as server:
+    client = httpx.Client(base_url=server.base_url, timeout=120.0)
+
+    try:
+        # ── 1. Confirm dormant state ─────────────────────────────────────────
+        logger.info("\n" + "=" * 60)
+        logger.info("📊 Step 1: checking initial (dormant) state")
+        logger.info("=" * 60)
+
+        resp = client.get("/api/init")
+        assert resp.status_code == 200, f"GET /api/init failed: {resp.text}"
+        init_status = resp.json()
+        assert init_status["state"] == "dormant", (
+            f"Expected dormant, got: {init_status['state']}"
+        )
+        logger.info(f"✅ Server is dormant — {init_status}")
+
+        # ── 2. Verify the dormant gate blocks /api/* ─────────────────────────
+        logger.info("\n" + "=" * 60)
+        logger.info("🚧 Step 2: dormant gate returns 503 on /api/conversations")
+        logger.info("=" * 60)
+
+        resp = client.get("/api/conversations")
+        assert resp.status_code == 503, (
+            f"Expected 503 from dormant gate, got {resp.status_code}"
+        )
+        logger.info("✅ /api/conversations correctly returns 503 while dormant")
+
+        # ── 3. Activate via POST /api/init ───────────────────────────────────
+        logger.info("\n" + "=" * 60)
+        logger.info("🚀 Step 3: activating server via POST /api/init")
+        logger.info("=" * 60)
+
+        temp_workspace_dir = tempfile.mkdtemp(prefix="deferred_init_demo_")
+
+        # In a real warm-pool deployment, credentials that the server shouldn't
+        # have at cold-start (e.g., the user's LLM API key) would arrive here.
+        llm_env: dict[str, str] = {"LLM_API_KEY": api_key}
+        if llm_base_url:
+            llm_env["LLM_BASE_URL"] = llm_base_url
+
+        init_body: dict = {
+            # Pass user credentials into the server's environment.
+            "env": llm_env,
+        }
+
+        resp = client.post(
+            "/api/init",
+            json=init_body,
+            headers={"X-Init-API-Key": BOOTSTRAP_SECRET_KEY},
+        )
+        assert resp.status_code == 200, f"POST /api/init failed: {resp.text}"
+        init_status = resp.json()
+        assert init_status["state"] == "ready", (
+            f"Expected ready after init, got: {init_status['state']}"
+        )
+        logger.info(f"✅ Server is now ready — {init_status}")
+
+        # ── 4. Confirm ready via GET /api/init ───────────────────────────────
+        resp = client.get("/api/init")
+        assert resp.status_code == 200
+        assert resp.json()["state"] == "ready"
+        logger.info("✅ GET /api/init confirms ready state")
+
+        # ── 5. Run a conversation on the now-ready server ────────────────────
+        logger.info("\n" + "=" * 60)
+        logger.info("🤖 Step 5: running a conversation on the ready server")
+        logger.info("=" * 60)
+
+        llm_config: dict[str, str] = {"model": llm_model, "api_key": api_key}
+        if llm_base_url:
+            llm_config["base_url"] = llm_base_url
+
+        start_request: dict = {
+            "agent": {
+                "kind": "Agent",
+                "llm": llm_config,
+                "tools": [],
+            },
+            "workspace": {"working_dir": temp_workspace_dir},
+            "initial_message": {
+                "role": "user",
+                "content": [{"type": "text", "text": "Reply with just the number 42."}],
+                "run": True,
+            },
+        }
+
+        resp = client.post("/api/conversations", json=start_request)
+        assert resp.status_code == 201, f"Start conversation failed: {resp.text}"
+        conversation_id = UUID(resp.json()["id"])
+        logger.info(f"✅ Conversation started: {conversation_id}")
+
+        # Poll until the agent finishes.
+        max_wait = 120
+        elapsed = 0
+        execution_status = "unknown"
+        while elapsed < max_wait:
+            resp = client.get(f"/api/conversations/{conversation_id}")
+            assert resp.status_code == 200
+            data = resp.json()
+            execution_status = data.get("execution_status", "unknown")
+            if execution_status in ("stopped", "paused", "error"):
+                break
+            logger.info(f"   status: {execution_status} ({elapsed}s elapsed)")
+            time.sleep(2)
+            elapsed += 2
+
+        logger.info(f"✅ Conversation finished — status: {execution_status}")
+        assert execution_status in ("stopped", "paused"), (
+            f"Unexpected final status: {execution_status}"
+        )
+
+        resp = client.get(f"/api/conversations/{conversation_id}/agent_final_response")
+        if resp.status_code == 200:
+            agent_response = resp.json().get("response", "")
+            logger.info(f"   Agent response: {agent_response!r}")
+
+        # Collect cost metrics.
+        accumulated_cost = 0.0
+        resp = client.get(f"/api/conversations/{conversation_id}")
+        if resp.status_code == 200:
+            stats = resp.json().get("stats") or {}
+            usage_to_metrics = stats.get("usage_to_metrics") or {}
+            accumulated_cost = sum(
+                m.get("accumulated_cost", 0.0) for m in usage_to_metrics.values()
+            )
+
+        client.delete(f"/api/conversations/{conversation_id}")
+        logger.info("   Conversation deleted")
+
+        logger.info("\n" + "=" * 60)
+        logger.info("🎉 Deferred-init example completed successfully!")
+        logger.info("=" * 60)
+
+        print(f"EXAMPLE_COST: {accumulated_cost}")
+
+    finally:
+        client.close()
diff --git a/openhands-agent-server/openhands/agent_server/api.py b/openhands-agent-server/openhands/agent_server/api.py
@@ -37,6 +37,11 @@
 from openhands.agent_server.file_router import file_router
 from openhands.agent_server.git_router import git_router
 from openhands.agent_server.hooks_router import hooks_router
+from openhands.agent_server.init_router import (
+    InitService,
+    init_router,
+    require_initialized,
+)
 from openhands.agent_server.llm_router import llm_router
 from openhands.agent_server.mcp_router import mcp_router
 from openhands.agent_server.middleware import CORSDispatcher
@@ -123,7 +128,8 @@ async def api_lifespan(api: FastAPI) -> AsyncIterator[None]:
         # Clean up stale tmux sessions from previous server runs
         _cleanup_stale_tmux_sessions()
 
-        service = get_default_conversation_service()
+        config: Config = api.state.config
+        deferred = config.deferred_init
         vscode_service = get_vscode_service()
         desktop_service = get_desktop_service()
         tool_preload_service = get_tool_preload_service()
@@ -184,13 +190,50 @@ async def start_tool_preload_service():
                 f"Server initialization failed with {len(exceptions)} exception(s)"
             ) from exceptions[0]
 
-        # Mark initialization as complete - now the /ready endpoint will return 200
-        # and Kubernetes readiness probes will pass
+        async def stop_stateless_services():
+            async def stop_vscode_service():
+                if vscode_service is not None:
+                    await vscode_service.stop()
+
+            async def stop_desktop_service():
+                if desktop_service is not None:
+                    await desktop_service.stop()
+
+            async def stop_tool_preload_service():
+                if tool_preload_service is not None:
+                    await tool_preload_service.stop()
+
+            await asyncio.gather(
+                stop_vscode_service(),
+                stop_desktop_service(),
+                stop_tool_preload_service(),
+                return_exceptions=True,
+            )
+
+        # In deferred-init mode the conversation service is *not* entered
+        # here — that happens later, when POST /api/init delivers the runtime
+        # config. We still mark the /ready endpoint as ready so a warm-pool
+        # orchestrator can tell the pod has finished booting and is
+        # available to receive its /api/init payload.
+        if deferred:
+            init_service = InitService(api, base_config=config)
+            api.state.init_service = init_service
+            mark_initialization_complete()
+            logger.info("Server started in deferred-init mode; awaiting POST /api/init")
+            try:
+                yield
+            finally:
+                await init_service.teardown()
+                await stop_stateless_services()
+            return
+
+        # Non-deferred (legacy) path: build and enter the conversation
+        # service as part of the lifespan, exactly as before.
+        service = get_default_conversation_service()
         mark_initialization_complete()
         logger.info("Server initialization complete - ready to serve requests")
 
         async with service:
-            # Store the initialized service in app state for dependency injection
             api.state.conversation_service = service
 
             config = api.state.config
@@ -214,26 +257,7 @@ async def start_tool_preload_service():
                     with suppress(asyncio.CancelledError):
                         await retention_task
 
-                # Define async functions for stopping each service
-                async def stop_vscode_service():
-                    if vscode_service is not None:
-                        await vscode_service.stop()
-
-                async def stop_desktop_service():
-                    if desktop_service is not None:
-                        await desktop_service.stop()
-
-                async def stop_tool_preload_service():
-                    if tool_preload_service is not None:
-                        await tool_preload_service.stop()
-
-                # Stop all services concurrently
-                await asyncio.gather(
-                    stop_vscode_service(),
-                    stop_desktop_service(),
-                    stop_tool_preload_service(),
-                    return_exceptions=True,
-                )
+                await stop_stateless_services()
     finally:
         if tmux_tmpdir_was_defaulted and os.environ.get("TMUX_TMPDIR") == str(
             tmux_tmpdir
@@ -293,12 +317,24 @@ def _add_api_routes(app: FastAPI, config: Config) -> None:
     """
     app.include_router(server_details_router)
 
+    # The /api/init endpoint bypasses both the session-key auth and the
+    # dormant gate. It has its own X-Init-API-Key auth. When
+    # ``deferred_init`` is False the endpoints are still mounted but return
+    # 404 because no InitService is registered on app.state — see
+    # ``get_init_service``.
+    init_api_router = APIRouter(prefix="/api")
+    init_api_router.include_router(init_router)
+    app.include_router(init_api_router)
+
     # Header-only auth: applied to every /api/* route EXCEPT the workspace
     # static-file routes (handled separately below). Cookies are NOT honored
     # here so that we don't expand the CSRF surface across the whole API.
     dependencies = []
     if config.session_api_keys:
         dependencies.append(Depends(create_session_api_key_dependency(config)))
+    # Dormant gate: when ``deferred_init`` is True this 503s every /api/*
+    # route until POST /api/init completes. No-op for non-deferred deployments.
+    dependencies.append(Depends(require_initialized))
 
     api_router = APIRouter(prefix="/api", dependencies=dependencies)
     api_router.include_router(event_router)

diff --git a/openhands-agent-server/openhands/agent_server/config.py b/openhands-agent-server/openhands/agent_server/config.py
@@ -236,6 +236,17 @@ class Config(BaseModel):
             "The URL where this agent server instance is available externally"
         ),
     )
+    deferred_init: bool = Field(
+        default=False,
+        description=(
+            "When True, the server starts in dormant mode. Stateless services "
+            "(VSCode, tool preload, etc.) start as usual, but the conversation, "
+            "event, and bash routers return 503 until POST /api/init is called with "
+            "the runtime configuration. This is intended for warm-pool deployments "
+            "where pods are pre-warmed before a user is matched and per-user "
+            "configuration is delivered later."
+        ),
+    )
     model_config: ClassVar[ConfigDict] = {"frozen": True}
 
     @property