diff --git a/.github/scripts/sync_code_blocks.py b/.github/scripts/sync_code_blocks.py index 2ba2e7593..93befc545 100755 --- a/.github/scripts/sync_code_blocks.py +++ b/.github/scripts/sync_code_blocks.py @@ -37,8 +37,8 @@ def extract_code_blocks(content: str) -> list[tuple[str, str, int, int]]: ``` """ - # Captures examples/...*.py after the first line, then the body up to ``` - pattern = r'```python[^\n]*\s+(examples/[^\s]+\.py)\n(.*?)```' + # Captures ...*.py after the first line, then the body up to ``` + pattern = r'```python[^\n]*\s+([^\s]+\.py)\n(.*?)```' matches: list[tuple[str, str, int, int]] = [] for match in re.finditer(pattern, content, re.DOTALL): file_ref = match.group(1) diff --git a/docs.json b/docs.json index 3c37c3dea..00574e7ac 100644 --- a/docs.json +++ b/docs.json @@ -219,10 +219,15 @@ { "group": "Remote Agent Server", "pages": [ + "sdk/guides/agent-server/overview", + "sdk/guides/agent-server/local-server", + "sdk/guides/agent-server/docker-sandbox", + "sdk/guides/agent-server/api-sandbox", { "group": "API Reference", "openapi": { - "source": "/openapi/agent-sdk.json" + "source": "/openapi/agent-sdk.json", + "directory": "sdk/guides/agent-server/api-reference" } } ] diff --git a/sdk/guides/agent-server/api-sandbox.mdx b/sdk/guides/agent-server/api-sandbox.mdx new file mode 100644 index 000000000..438391a64 --- /dev/null +++ b/sdk/guides/agent-server/api-sandbox.mdx @@ -0,0 +1,190 @@ +--- +title: API-based Sandbox +description: Connect to hosted API-based agent server for fully managed infrastructure. +--- + +The API-sandboxed agent server demonstrates how to use `APIRemoteWorkspace` to connect to a [OpenHands runtime API service](https://runtime.all-hands.dev/). This eliminates the need to manage your own infrastructure, providing automatic scaling, monitoring, and secure sandboxed execution. 
+ +## Basic Example + + +This example is available on GitHub: [examples/02_remote_agent_server/04_convo_with_api_sandboxed_server.py](https://github.com/OpenHands/agent-sdk/blob/main/examples/02_remote_agent_server/04_convo_with_api_sandboxed_server.py) + + +This example shows how to connect to a hosted runtime API for fully managed agent execution: + +```python icon="python" expandable examples/02_remote_agent_server/04_convo_with_api_sandboxed_server.py +"""Example: APIRemoteWorkspace with Dynamic Build. + +This example demonstrates building an agent-server image on-the-fly from the SDK +codebase and launching it in a remote sandboxed environment via Runtime API. + +Usage: + uv run examples/24_remote_convo_with_api_sandboxed_server.py + +Requirements: + - LITELLM_API_KEY: API key for LLM access + - RUNTIME_API_KEY: API key for runtime API access +""" + +import os +import time + +from pydantic import SecretStr + +from openhands.sdk import ( + LLM, + Conversation, + RemoteConversation, + get_logger, +) +from openhands.tools.preset.default import get_default_agent +from openhands.workspace import APIRemoteWorkspace + + +logger = get_logger(__name__) + + +api_key = os.getenv("LITELLM_API_KEY") +assert api_key, "LITELLM_API_KEY required" + +llm = LLM( + usage_id="agent", + model="litellm_proxy/anthropic/claude-sonnet-4-5-20250929", + base_url="https://llm-proxy.eval.all-hands.dev", + api_key=SecretStr(api_key), +) + +runtime_api_key = os.getenv("RUNTIME_API_KEY") +if not runtime_api_key: + logger.error("RUNTIME_API_KEY required") + exit(1) + + +with APIRemoteWorkspace( + runtime_api_url="https://runtime.eval.all-hands.dev", + runtime_api_key=runtime_api_key, + server_image="ghcr.io/openhands/agent-server:main-python", +) as workspace: + agent = get_default_agent(llm=llm, cli_mode=True) + received_events: list = [] + last_event_time = {"ts": time.time()} + + def event_callback(event) -> None: + received_events.append(event) + last_event_time["ts"] = time.time() + + 
+ result = workspace.execute_command( + "echo 'Hello from sandboxed environment!' && pwd" + ) + logger.info(f"Command completed: {result.exit_code}, {result.stdout}") + + conversation = Conversation( + agent=agent, workspace=workspace, callbacks=[event_callback], visualize=True + ) + assert isinstance(conversation, RemoteConversation) + + try: + conversation.send_message( + "Read the current repo and write 3 facts about the project into FACTS.txt." + ) + conversation.run() + + while time.time() - last_event_time["ts"] < 2.0: + time.sleep(0.1) + + conversation.send_message("Great! Now delete that file.") + conversation.run() + finally: + conversation.close() +``` + +```bash Running the Example +export LITELLM_API_KEY="your-api-key" +# If using the OpenHands LLM proxy, set its base URL: +export LLM_BASE_URL="https://llm-proxy.eval.all-hands.dev" +export RUNTIME_API_KEY="your-runtime-api-key" +# Set the runtime API URL for the remote sandbox +export RUNTIME_API_URL="https://runtime.eval.all-hands.dev" +cd agent-sdk +uv run python examples/02_remote_agent_server/04_convo_with_api_sandboxed_server.py +``` + +## Key Concepts + +### APIRemoteWorkspace + +The `APIRemoteWorkspace` connects to a hosted runtime API service: + +```python highlight={48-52} +with APIRemoteWorkspace( + runtime_api_url="https://runtime.eval.all-hands.dev", + runtime_api_key=runtime_api_key, + server_image="ghcr.io/openhands/agent-server:main-python", +) as workspace: +``` + +This workspace type: +- Connects to a remote runtime API service +- Automatically provisions sandboxed environments +- Manages container lifecycle through the API +- Handles all infrastructure concerns + +### Runtime API Authentication + +The example requires a runtime API key for authentication: + +```python highlight={42-45} +runtime_api_key = os.getenv("RUNTIME_API_KEY") +if not runtime_api_key: + logger.error("RUNTIME_API_KEY required") + exit(1) +``` + +This key authenticates your requests to the hosted runtime service. 
+ +### Pre-built Image Selection + +You can specify which pre-built agent server image to use: + +```python highlight={51} +APIRemoteWorkspace( + runtime_api_url="https://runtime.eval.all-hands.dev", + runtime_api_key=runtime_api_key, + server_image="ghcr.io/openhands/agent-server:main-python", +) +``` + +The runtime API will pull and run the specified image in a sandboxed environment. + +### Workspace Testing + +Just like with DockerWorkspace, you can test the workspace before running the agent: + +```python highlight={61-64} +result = workspace.execute_command( + "echo 'Hello from sandboxed environment!' && pwd" +) +logger.info(f"Command completed: {result.exit_code}, {result.stdout}") +``` + +This verifies connectivity to the remote runtime and ensures the environment is ready. + +### Automatic RemoteConversation + +The conversation uses WebSocket communication with the remote server: + +```python highlight={66-68} +conversation = Conversation( + agent=agent, workspace=workspace, callbacks=[event_callback], visualize=True +) +assert isinstance(conversation, RemoteConversation) +``` + +All agent execution happens on the remote runtime infrastructure. + +## Next Steps + +- **[Docker Sandboxed Server](/sdk/guides/agent-server/docker-sandbox)** +- **[Local Agent Server](/sdk/guides/agent-server/local-server)** +- **[Agent Server Package Architecture](/sdk/arch/agent-server-package)** diff --git a/sdk/guides/agent-server/docker-sandbox.mdx b/sdk/guides/agent-server/docker-sandbox.mdx new file mode 100644 index 000000000..e07b44daf --- /dev/null +++ b/sdk/guides/agent-server/docker-sandbox.mdx @@ -0,0 +1,580 @@ +--- +title: Docker Sandbox +description: Run agent server in isolated Docker containers for security and reproducibility. +--- + +The Docker sandboxed agent server demonstrates how to run agents in isolated Docker containers using DockerWorkspace. 
+ +This provides complete isolation from the host system, making it ideal for production deployments, testing, and executing untrusted code safely. + +The Docker sandbox image ships with features configured in the [Dockerfile](https://github.com/All-Hands-AI/agent-sdk/blob/main/openhands-agent-server/openhands/agent_server/docker/Dockerfile) (e.g., secure defaults and services like VSCode and VNC exposed behind well-defined ports), which are not available in the local (non-Docker) agent server. + +## 1) Basic Docker Sandbox + + +This example is available on GitHub: [examples/02_remote_agent_server/02_convo_with_docker_sandboxed_server.py](https://github.com/OpenHands/agent-sdk/blob/main/examples/02_remote_agent_server/02_convo_with_docker_sandboxed_server.py) + + +This example shows how to create a DockerWorkspace that automatically manages Docker containers for agent execution: + +```python icon="python" expandable examples/02_remote_agent_server/02_convo_with_docker_sandboxed_server.py +import os +import platform +import time + +from pydantic import SecretStr + +from openhands.sdk import ( + LLM, + Conversation, + RemoteConversation, + get_logger, +) +from openhands.tools.preset.default import get_default_agent +from openhands.workspace import DockerWorkspace + + +logger = get_logger(__name__) + + +# 1) Ensure we have LLM API key +api_key = os.getenv("LLM_API_KEY") +assert api_key is not None, "LLM_API_KEY environment variable is not set." 
+ +llm = LLM( + usage_id="agent", + model="litellm_proxy/anthropic/claude-sonnet-4-5-20250929", + base_url="https://llm-proxy.eval.all-hands.dev", + api_key=SecretStr(api_key), +) + + +def detect_platform(): + """Detects the correct Docker platform string.""" + machine = platform.machine().lower() + if "arm" in machine or "aarch64" in machine: + return "linux/arm64" + return "linux/amd64" + + +# 2) Create a Docker-based remote workspace that will set up and manage +# the Docker container automatically +with DockerWorkspace( + # dynamically build agent-server image + # base_image="nikolaik/python-nodejs:python3.12-nodejs22", + # use pre-built image for faster startup + server_image="ghcr.io/openhands/agent-server:main-python", + host_port=8010, + platform=detect_platform(), +) as workspace: + # 3) Create agent + agent = get_default_agent( + llm=llm, + cli_mode=True, + ) + + # 4) Set up callback collection + received_events: list = [] + last_event_time = {"ts": time.time()} + + def event_callback(event) -> None: + event_type = type(event).__name__ + logger.info(f"๐Ÿ”” Callback received event: {event_type}\n{event}") + received_events.append(event) + last_event_time["ts"] = time.time() + + # 5) Test the workspace with a simple command + result = workspace.execute_command( + "echo 'Hello from sandboxed environment!' && pwd" + ) + logger.info( + f"Command '{result.command}' completed with exit code {result.exit_code}" + ) + logger.info(f"Output: {result.stdout}") + conversation = Conversation( + agent=agent, + workspace=workspace, + callbacks=[event_callback], + visualize=True, + ) + assert isinstance(conversation, RemoteConversation) + + try: + logger.info(f"\n๐Ÿ“‹ Conversation ID: {conversation.state.id}") + + logger.info("๐Ÿ“ Sending first message...") + conversation.send_message( + "Read the current repo and write 3 facts about the project into FACTS.txt." 
+ ) + logger.info("๐Ÿš€ Running conversation...") + conversation.run() + logger.info("โœ… First task completed!") + logger.info(f"Agent status: {conversation.state.agent_status}") + + # Wait for events to settle (no events for 2 seconds) + logger.info("โณ Waiting for events to stop...") + while time.time() - last_event_time["ts"] < 2.0: + time.sleep(0.1) + logger.info("โœ… Events have stopped") + + logger.info("๐Ÿš€ Running conversation again...") + conversation.send_message("Great! Now delete that file.") + conversation.run() + logger.info("โœ… Second task completed!") + finally: + print("\n๐Ÿงน Cleaning up conversation...") + conversation.close() +``` + +```bash Running the Example +export LLM_API_KEY="your-api-key" +cd agent-sdk +uv run python examples/02_remote_agent_server/02_convo_with_docker_sandboxed_server.py +``` + +### Key Concepts + +#### DockerWorkspace Context Manager + +The `DockerWorkspace` uses a context manager to automatically handle container lifecycle: + +```python highlight={42-50} +with DockerWorkspace( + # dynamically build agent-server image + # base_image="nikolaik/python-nodejs:python3.12-nodejs22", + # use pre-built image for faster startup + server_image="ghcr.io/openhands/agent-server:latest-python", + host_port=8010, + platform=detect_platform(), +) as workspace: + # Container is running here + # Work with the workspace + pass +# Container is automatically stopped and cleaned up here +``` + +The workspace automatically: +- Pulls or builds the Docker image +- Starts the container with an agent server +- Waits for the server to be ready +- Cleans up the container when done + +#### Platform Detection + +The example includes platform detection to ensure the correct Docker image is built and used: + +```python highlight={32-37} +def detect_platform(): + """Detects the correct Docker platform string.""" + machine = platform.machine().lower() + if "arm" in machine or "aarch64" in machine: + return "linux/arm64" + return "linux/amd64" +``` + 
+This ensures compatibility across different CPU architectures (Intel/AMD vs ARM/Apple Silicon). + + +#### Testing the Workspace + +Before creating a conversation, the example tests the workspace connection: + +```python highlight={68-74} +result = workspace.execute_command( + "echo 'Hello from sandboxed environment!' && pwd" +) +logger.info( + f"Command '{result.command}' completed with exit code {result.exit_code}" +) +logger.info(f"Output: {result.stdout}") +``` + +This verifies the workspace is properly initialized and can execute commands. + +#### Automatic RemoteConversation + +When you use a DockerWorkspace, the Conversation automatically becomes a RemoteConversation: + +```python highlight={75-81} +conversation = Conversation( + agent=agent, + workspace=workspace, + callbacks=[event_callback], + visualize=True, +) +assert isinstance(conversation, RemoteConversation) +``` + +The SDK detects the remote workspace and uses WebSocket communication for real-time event streaming. + + +#### Pre-built vs Base Images + +```python +# โœ… Fast: Use pre-built image (recommended) +DockerWorkspace( + server_image="ghcr.io/openhands/agent-server:latest-python", + host_port=8010, +) + +# โฑ๏ธ Slower: Build on the fly from base image (more control) +DockerWorkspace( + base_image="nikolaik/python-nodejs:python3.12-nodejs22", + host_port=8010, +) +``` + +Pre-built images start immediately, while base images need to build the agent server first. + +--- + +## 2) VS Code in Docker Sandbox + + +This example is available on GitHub: [examples/02_remote_agent_server/04_vscode_with_docker_sandboxed_server.py](https://github.com/OpenHands/agent-sdk/blob/main/examples/02_remote_agent_server/04_vscode_with_docker_sandboxed_server.py) + + +VS Code with Docker demonstrates how to enable VS Code Web integration in a Docker-sandboxed environment. 
This allows you to access a full VS Code editor running in the container, making it easy to inspect, edit, and manage files that the agent is working with. + +```python icon="python" expandable examples/02_remote_agent_server/04_vscode_with_docker_sandboxed_server.py +import os +import time + +import httpx +from pydantic import SecretStr + +from openhands.sdk import LLM, Conversation, get_logger +from openhands.sdk.conversation.impl.remote_conversation import RemoteConversation +from openhands.tools.preset.default import get_default_agent +from openhands.workspace import DockerWorkspace + + +logger = get_logger(__name__) + + +api_key = os.getenv("LLM_API_KEY") +assert api_key is not None, "LLM_API_KEY environment variable is not set." + +llm = LLM( + usage_id="agent", + model="litellm_proxy/anthropic/claude-sonnet-4-5-20250929", + base_url="https://llm-proxy.eval.all-hands.dev", + api_key=SecretStr(api_key), +) + +# Create a Docker-based remote workspace with extra ports for VSCode access +with DockerWorkspace( + base_image="nikolaik/python-nodejs:python3.12-nodejs22", + host_port=18010, + # TODO: Change this to your platform if not linux/arm64 + platform="linux/arm64", + extra_ports=True, # Expose extra ports for VSCode and VNC +) as workspace: + """Extra ports allows you to access VSCode at localhost:8011""" + + # Create agent + agent = get_default_agent( + llm=llm, + cli_mode=True, + ) + + # Set up callback collection + received_events: list = [] + last_event_time = {"ts": time.time()} + + def event_callback(event) -> None: + event_type = type(event).__name__ + logger.info(f"๐Ÿ”” Callback received event: {event_type}\n{event}") + received_events.append(event) + last_event_time["ts"] = time.time() + + # Create RemoteConversation using the workspace + conversation = Conversation( + agent=agent, + workspace=workspace, + callbacks=[event_callback], + visualize=True, + ) + assert isinstance(conversation, RemoteConversation) + + logger.info(f"\n๐Ÿ“‹ Conversation ID: 
{conversation.state.id}") + logger.info("๐Ÿ“ Sending first message...") + conversation.send_message("Create a simple Python script that prints Hello World") + conversation.run() + + # Get VSCode URL with token + vscode_port = (workspace.host_port or 8010) + 1 + try: + response = httpx.get( + f"{workspace.host}/api/vscode/url", + params={"workspace_dir": workspace.working_dir}, + ) + vscode_data = response.json() + vscode_url = vscode_data.get("url", "").replace( + "localhost:8001", f"localhost:{vscode_port}" + ) + except Exception: + # Fallback if server route not available + folder = ( + f"/{workspace.working_dir}" + if not str(workspace.working_dir).startswith("/") + else str(workspace.working_dir) + ) + vscode_url = f"http://localhost:{vscode_port}/?folder={folder}" + + # Wait for user to explore VSCode + y = None + while y != "y": + y = input( + "\n" + "Because you've enabled extra_ports=True in DockerWorkspace, " + "you can open VSCode Web to see the workspace.\n\n" + f"VSCode URL: {vscode_url}\n\n" + "The VSCode should have the OpenHands settings extension installed:\n" + " - Dark theme enabled\n" + " - Auto-save enabled\n" + " - Telemetry disabled\n" + " - Auto-updates disabled\n\n" + "Press 'y' and Enter to exit and terminate the workspace.\n" + ">> " + ) +``` + +```bash Running the Example +export LLM_API_KEY="your-api-key" +cd agent-sdk +uv run python examples/02_remote_agent_server/04_vscode_with_docker_sandboxed_server.py +``` + +### Key Concepts + +#### VS Code-Enabled DockerWorkspace + +The workspace is configured with extra ports for VS Code access: + +```python highlight={27-34} +with DockerWorkspace( + base_image="nikolaik/python-nodejs:python3.12-nodejs22", + host_port=18010, + platform="linux/arm64", + extra_ports=True, # Expose extra ports for VSCode and VNC +) as workspace: + """Extra ports allows you to access VSCode at localhost:8011""" +``` + +The `extra_ports=True` setting exposes: +- Port `host_port+1`: VS Code Web interface (host_port + 
1) +- Port `host_port+2`: VNC viewer for visual access + + +#### VS Code URL Generation + +The example retrieves the VS Code URL with authentication token: + +```python highlight={68-86} +# Get VSCode URL with token +vscode_port = (workspace.host_port or 8010) + 1 +try: + response = httpx.get( + f"{workspace.host}/api/vscode/url", + params={"workspace_dir": workspace.working_dir}, + ) + vscode_data = response.json() + vscode_url = vscode_data.get("url", "").replace( + "localhost:8001", f"localhost:{vscode_port}" + ) +except Exception: + # Fallback if server route not available + folder = ( + f"/{workspace.working_dir}" + if not str(workspace.working_dir).startswith("/") + else str(workspace.working_dir) + ) + vscode_url = f"http://localhost:{vscode_port}/?folder={folder}" +``` + +This generates a properly authenticated URL with the workspace directory pre-opened. + +Read the API Reference [here](/sdk/guides/agent-server/api-reference/vscode/get-vscode-url) for more information. + +#### VS Code URL Format + +``` +http://localhost:{vscode_port}/?tkn={token}&folder={workspace_dir} +``` + +- vscode_port: Usually host_port + 1 (e.g., 8011) +- tkn: Authentication token for security +- folder: Workspace directory to open + +--- + +## 3) Browser in Docker Sandbox + + +This example is available on GitHub: [examples/02_remote_agent_server/03_browser_use_with_docker_sandboxed_server.py](https://github.com/OpenHands/agent-sdk/blob/main/examples/02_remote_agent_server/03_browser_use_with_docker_sandboxed_server.py) + + +Browser with Docker demonstrates how to enable browser automation capabilities in a Docker-sandboxed environment. This allows agents to browse websites, interact with web content, and perform web automation tasks while maintaining complete isolation from your host system. 
+ +This example shows how to configure DockerWorkspace with browser capabilities and VNC access: + +```python icon="python" expandable examples/02_remote_agent_server/03_browser_use_with_docker_sandboxed_server.py +import os +import platform +import time + +from pydantic import SecretStr + +from openhands.sdk import LLM, Conversation, get_logger +from openhands.sdk.conversation.impl.remote_conversation import RemoteConversation +from openhands.tools.preset.default import get_default_agent +from openhands.workspace import DockerWorkspace + + +logger = get_logger(__name__) + + +api_key = os.getenv("LLM_API_KEY") +assert api_key is not None, "LLM_API_KEY environment variable is not set." + +llm = LLM( + usage_id="agent", + model="litellm_proxy/anthropic/claude-sonnet-4-5-20250929", + base_url="https://llm-proxy.eval.all-hands.dev", + api_key=SecretStr(api_key), +) + + +def detect_platform(): + """Detects the correct Docker platform string.""" + machine = platform.machine().lower() + if "arm" in machine or "aarch64" in machine: + return "linux/arm64" + return "linux/amd64" + + +# Create a Docker-based remote workspace with extra ports for browser access +with DockerWorkspace( + base_image="nikolaik/python-nodejs:python3.12-nodejs22", + host_port=8010, + # TODO: Change this to your platform if not linux/arm64 + platform=detect_platform(), + extra_ports=True, # Expose extra ports for VSCode and VNC +) as workspace: + """Extra ports allows you to check localhost:8012 for VNC""" + + # Create agent with browser tools enabled + agent = get_default_agent( + llm=llm, + cli_mode=False, # CLI mode = False will enable browser tools + ) + + # Set up callback collection + received_events: list = [] + last_event_time = {"ts": time.time()} + + def event_callback(event) -> None: + event_type = type(event).__name__ + logger.info(f"๐Ÿ”” Callback received event: {event_type}\n{event}") + received_events.append(event) + last_event_time["ts"] = time.time() + + # Create RemoteConversation 
using the workspace + conversation = Conversation( + agent=agent, + workspace=workspace, + callbacks=[event_callback], + visualize=True, + ) + assert isinstance(conversation, RemoteConversation) + + logger.info(f"\n๐Ÿ“‹ Conversation ID: {conversation.state.id}") + logger.info("๐Ÿ“ Sending first message...") + conversation.send_message( + "Could you go to https://all-hands.dev/ blog page and summarize main " + "points of the latest blog?" + ) + conversation.run() + + # Wait for user confirm to exit + y = None + while y != "y": + y = input( + "Because you've enabled extra_ports=True in DockerWorkspace, " + "you can open a browser tab to see the *actual* browser OpenHands " + "is interacting with via VNC.\n\n" + "Link: http://localhost:8012/vnc.html?autoconnect=1&resize=remote\n\n" + "Press 'y' and Enter to exit and terminate the workspace.\n" + ">> " + ) +``` + +```bash Running the Example +export LLM_API_KEY="your-api-key" +cd agent-sdk +uv run python examples/02_remote_agent_server/03_browser_use_with_docker_sandboxed_server.py +``` + +### Key Concepts + +#### Browser-Enabled DockerWorkspace + +The workspace is configured with extra ports for browser access: + +```python highlight={36-43} +with DockerWorkspace( + base_image="nikolaik/python-nodejs:python3.12-nodejs22", + host_port=8010, + platform=detect_platform(), + extra_ports=True, # Expose extra ports for VSCode and VNC +) as workspace: + """Extra ports allows you to check localhost:8012 for VNC""" +``` + +The `extra_ports=True` setting exposes additional ports for: +- Port `host_port+1`: VS Code Web interface +- Port `host_port+2`: VNC viewer for browser visualization + +#### Enabling Browser Tools + +Browser tools are enabled by setting `cli_mode=False`: + +```python highlight={46-50} +# Create agent with browser tools enabled +agent = get_default_agent( + llm=llm, + cli_mode=False, # CLI mode = False will enable browser tools +) +``` + +When `cli_mode=False`, the agent gains access to browser automation 
tools for web interaction. + +When VNC is available and `extra_ports=True`, the browser will be opened in the VNC desktop to visualize the agent's work. You can watch the browser in real time via VNC. Demo video: + + +#### VNC Access + +The VNC interface provides real-time visual access to the browser: + +``` +http://localhost:8012/vnc.html?autoconnect=1&resize=remote +``` + +- autoconnect=1: Automatically connect to VNC server +- resize=remote: Automatically adjust resolution + +Read the API Reference [here](/sdk/guides/agent-server/api-reference/desktop/get-desktop-url) for more information. + +--- + +## Next Steps + +- **[Local Agent Server](/sdk/guides/agent-server/local-server)** +- **[Agent Server Overview](/sdk/guides/agent-server/overview)** +- **[API Sandboxed Server](/sdk/guides/agent-server/api-sandbox)** - Connect to hosted API service +- **[Agent Server Package Architecture](/sdk/arch/agent-server-package)** - Architecture and design decisions diff --git a/sdk/guides/agent-server/local-server.mdx b/sdk/guides/agent-server/local-server.mdx new file mode 100644 index 000000000..0c0b3b1c8 --- /dev/null +++ b/sdk/guides/agent-server/local-server.mdx @@ -0,0 +1,358 @@ +--- +title: Local Agent Server +description: Run agents through a local HTTP server with RemoteConversation for client-server architecture. +--- + +The Local Agent Server demonstrates how to run a remote agent server locally and connect to it using RemoteConversation. This pattern is useful for local development, testing, and scenarios where you want to separate the client code from the agent execution environment. 
+ +## Basic Example + + +This example is available on GitHub: [examples/02_remote_agent_server/01_convo_with_local_agent_server.py](https://github.com/OpenHands/agent-sdk/blob/main/examples/02_remote_agent_server/01_convo_with_local_agent_server.py) + + +This example shows how to programmatically start a local agent server and interact with it through a RemoteConversation: + +```python icon="python" expandable examples/02_remote_agent_server/01_convo_with_local_agent_server.py +import os +import subprocess +import sys +import threading +import time + +from pydantic import SecretStr + +from openhands.sdk import LLM, Conversation, RemoteConversation, Workspace, get_logger +from openhands.sdk.event import ConversationStateUpdateEvent +from openhands.tools.preset.default import get_default_agent + + +logger = get_logger(__name__) + + +def _stream_output(stream, prefix, target_stream): + """Stream output from subprocess to target stream with prefix.""" + try: + for line in iter(stream.readline, ""): + if line: + target_stream.write(f"[{prefix}] {line}") + target_stream.flush() + except Exception as e: + print(f"Error streaming {prefix}: {e}", file=sys.stderr) + finally: + stream.close() + + +class ManagedAPIServer: + """Context manager for subprocess-managed OpenHands API server.""" + + def __init__(self, port: int = 8000, host: str = "127.0.0.1"): + self.port: int = port + self.host: str = host + self.process: subprocess.Popen[bytes] | None = None + self.base_url: str = f"http://{host}:{port}" + self.stdout_thread: threading.Thread | None = None + self.stderr_thread: threading.Thread | None = None + + def __enter__(self): + """Start the API server subprocess.""" + print(f"Starting OpenHands API server on {self.base_url}...") + + # Start the server process + self.process = subprocess.Popen( + [ + "python", + "-m", + "openhands.agent_server", + "--port", + str(self.port), + "--host", + self.host, + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + 
env={"LOG_JSON": "true", **os.environ}, + ) + + # Start threads to stream stdout and stderr + self.stdout_thread = threading.Thread( + target=_stream_output, + args=(self.process.stdout, "SERVER", sys.stdout), + daemon=True, + ) + self.stderr_thread = threading.Thread( + target=_stream_output, + args=(self.process.stderr, "SERVER", sys.stderr), + daemon=True, + ) + + self.stdout_thread.start() + self.stderr_thread.start() + + # Wait for server to be ready + max_retries = 30 + for i in range(max_retries): + try: + import httpx + + response = httpx.get(f"{self.base_url}/health", timeout=1.0) + if response.status_code == 200: + print(f"API server is ready at {self.base_url}") + return self + except Exception: + pass + + if self.process.poll() is not None: + # Process has terminated + raise RuntimeError( + "Server process terminated unexpectedly. " + "Check the server logs above for details." + ) + + time.sleep(1) + + raise RuntimeError(f"Server failed to start after {max_retries} seconds") + + def __exit__(self, exc_type, exc_val, exc_tb): + """Stop the API server subprocess.""" + if self.process: + print("Stopping API server...") + self.process.terminate() + try: + self.process.wait(timeout=5) + except subprocess.TimeoutExpired: + print("Force killing API server...") + self.process.kill() + self.process.wait() + + # Wait for streaming threads to finish (they're daemon threads, + # so they'll stop automatically) + # But give them a moment to flush any remaining output + time.sleep(0.5) + print("API server stopped.") + + +api_key = os.getenv("LLM_API_KEY") +assert api_key is not None, "LLM_API_KEY environment variable is not set." 
+ +llm = LLM( + usage_id="agent", + model="litellm_proxy/anthropic/claude-sonnet-4-5-20250929", + base_url="https://llm-proxy.eval.all-hands.dev", + api_key=SecretStr(api_key), +) +title_gen_llm = LLM( + usage_id="title-gen-llm", + model="litellm_proxy/openai/gpt-5-mini", + base_url="https://llm-proxy.eval.all-hands.dev", + api_key=SecretStr(api_key), +) + +# Use managed API server +with ManagedAPIServer(port=8001) as server: + # Create agent + agent = get_default_agent( + llm=llm, + cli_mode=True, # Disable browser tools for simplicity + ) + + # Define callbacks to test the WebSocket functionality + received_events = [] + event_tracker = {"last_event_time": time.time()} + + def event_callback(event): + """Callback to capture events for testing.""" + event_type = type(event).__name__ + logger.info(f"๐Ÿ”” Callback received event: {event_type}\n{event}") + received_events.append(event) + event_tracker["last_event_time"] = time.time() + + # Create RemoteConversation with callbacks + # NOTE: Workspace is required for RemoteConversation + workspace = Workspace(host=server.base_url) + result = workspace.execute_command("pwd") + logger.info( + f"Command '{result.command}' completed with exit code {result.exit_code}" + ) + logger.info(f"Output: {result.stdout}") + + conversation = Conversation( + agent=agent, + workspace=workspace, + callbacks=[event_callback], + visualize=True, + ) + assert isinstance(conversation, RemoteConversation) + + try: + logger.info(f"\n๐Ÿ“‹ Conversation ID: {conversation.state.id}") + + # Send first message and run + logger.info("๐Ÿ“ Sending first message...") + conversation.send_message( + "Read the current repo and write 3 facts about the project into FACTS.txt." 
+ ) + + # Generate title using a specific LLM + title = conversation.generate_title(max_length=60, llm=title_gen_llm) + logger.info(f"Generated conversation title: {title}") + + logger.info("๐Ÿš€ Running conversation...") + conversation.run() + + logger.info("โœ… First task completed!") + logger.info(f"Agent status: {conversation.state.agent_status}") + + # Wait for events to stop coming (no events for 2 seconds) + logger.info("โณ Waiting for events to stop...") + while time.time() - event_tracker["last_event_time"] < 2.0: + time.sleep(0.1) + logger.info("โœ… Events have stopped") + + logger.info("๐Ÿš€ Running conversation again...") + conversation.send_message("Great! Now delete that file.") + conversation.run() + logger.info("โœ… Second task completed!") + + # Demonstrate state.events functionality + logger.info("\n" + "=" * 50) + logger.info("๐Ÿ“Š Demonstrating State Events API") + logger.info("=" * 50) + + # Count total events using state.events + total_events = len(conversation.state.events) + logger.info(f"๐Ÿ“ˆ Total events in conversation: {total_events}") + + # Get recent events (last 5) using state.events + logger.info("\n๐Ÿ” Getting last 5 events using state.events...") + all_events = conversation.state.events + recent_events = all_events[-5:] if len(all_events) >= 5 else all_events + + for i, event in enumerate(recent_events, 1): + event_type = type(event).__name__ + timestamp = getattr(event, "timestamp", "Unknown") + logger.info(f" {i}. 
{event_type} at {timestamp}") + + # Let's see what the actual event types are + logger.info("\n๐Ÿ” Event types found:") + event_types = set() + for event in recent_events: + event_type = type(event).__name__ + event_types.add(event_type) + for event_type in sorted(event_types): + logger.info(f" - {event_type}") + + # Print all ConversationStateUpdateEvent + logger.info("\n๐Ÿ—‚๏ธ ConversationStateUpdateEvent events:") + for event in conversation.state.events: + if isinstance(event, ConversationStateUpdateEvent): + logger.info(f" - {event}") + + finally: + # Clean up + print("\n๐Ÿงน Cleaning up conversation...") + conversation.close() +``` + +```bash Running the Example +export LLM_API_KEY="your-api-key" +cd agent-sdk +uv run python examples/02_remote_agent_server/01_convo_with_local_agent_server.py +``` + +## Key Concepts + +### Managed API Server + +The example includes a `ManagedAPIServer` context manager that handles starting and stopping the server subprocess: + +```python highlight={42-61} +class ManagedAPIServer: + """Context manager for subprocess-managed OpenHands API server.""" + + def __enter__(self): + """Start the API server subprocess.""" + self.process = subprocess.Popen( + [ + "python", + "-m", + "openhands.agent_server", + "--port", + str(self.port), + "--host", + self.host, + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env={"LOG_JSON": "true", **os.environ}, + ) +``` + +The server starts with `python -m openhands.agent_server` and automatically handles health checks to ensure it's ready before proceeding. 
+ +### Remote Workspace + +When connecting to a remote server, you need to provide a `Workspace` that connects to that server: + +```python +workspace = Workspace(host=server.base_url) +result = workspace.execute_command("pwd") +``` + +When `host` is provided, the `Workspace` returns an instance of `RemoteWorkspace` ([source](https://github.com/All-Hands-AI/agent-sdk/blob/main/openhands-sdk/openhands/sdk/workspace/workspace.py)). +The `Workspace` object communicates with the remote server's API to execute commands and manage files. + +### RemoteConversation + +When you pass a remote `Workspace` to `Conversation`, it automatically becomes a `RemoteConversation` ([source](https://github.com/All-Hands-AI/agent-sdk/blob/main/openhands-sdk/openhands/sdk/conversation/conversation.py)): + +```python +conversation = Conversation( + agent=agent, + workspace=workspace, + callbacks=[event_callback], + visualize=True, +) +assert isinstance(conversation, RemoteConversation) +``` + +RemoteConversation handles communication with the remote agent server over WebSocket for real-time event streaming. + +### Event Callbacks + +Callbacks receive events in real-time as they happen on the remote server: + +```python +def event_callback(event): + """Callback to capture events for testing.""" + event_type = type(event).__name__ + logger.info(f"๐Ÿ”” Callback received event: {event_type}\n{event}") + received_events.append(event) + event_tracker["last_event_time"] = time.time() +``` + +This enables monitoring agent activity, tracking progress, and implementing custom event handling logic. 
+ +### Conversation State + +The conversation state provides access to all events and status: + +```python +# Count total events using state.events +total_events = len(conversation.state.events) +logger.info(f"๐Ÿ“ˆ Total events in conversation: {total_events}") + +# Get recent events (last 5) using state.events +all_events = conversation.state.events +recent_events = all_events[-5:] if len(all_events) >= 5 else all_events +``` + +This allows you to inspect the conversation history, analyze agent behavior, and build custom monitoring tools. + +## Next Steps + +- **[Docker Sandboxed Server](/sdk/guides/agent-server/docker-sandbox)** - Run the server in Docker for isolation +- **[API Sandboxed Server](/sdk/guides/agent-server/api-sandbox)** - Connect to a hosted API service +- **[Agent Server Package Architecture](/sdk/arch/agent-server-package)** - Architecture and design decisions diff --git a/sdk/guides/agent-server/overview.mdx b/sdk/guides/agent-server/overview.mdx new file mode 100644 index 000000000..87525cdc0 --- /dev/null +++ b/sdk/guides/agent-server/overview.mdx @@ -0,0 +1,157 @@ +--- +title: Overview +description: Run agents on remote servers with isolated workspaces for production deployments. +--- + +Remote Agent Servers package the Agent SDK into containers you can deploy anywhere (Kubernetes, VMs, on‑prem, any cloud) with strong isolation. The remote path uses the exact same SDK API as local—switching is just changing the workspace argument; your Conversation code stays the same.
+ + +For example, switching from a local workspace to a Docker‑based remote agent server: + +```python lines +# Local → Docker +conversation = Conversation(agent=agent, workspace=os.getcwd()) # [!code --] +from openhands.workspace import DockerWorkspace # [!code ++] +with DockerWorkspace( # [!code ++] + server_image="ghcr.io/openhands/agent-server:latest-python", # [!code ++] +) as workspace: # [!code ++] + conversation = Conversation(agent=agent, workspace=workspace) # [!code ++] +``` + +Or switching to an API‑based remote workspace (via [OpenHands Runtime API](https://runtime.all-hands.dev/)): + +```python lines +# Local → Remote API +conversation = Conversation(agent=agent, workspace=os.getcwd()) # [!code --] +from openhands.workspace import APIRemoteWorkspace # [!code ++] +with APIRemoteWorkspace( # [!code ++] + runtime_api_url="https://runtime.eval.all-hands.dev", # [!code ++] + runtime_api_key="YOUR_API_KEY", # [!code ++] + server_image="ghcr.io/openhands/agent-server:latest-python", # [!code ++] +) as workspace: # [!code ++] + conversation = Conversation(agent=agent, workspace=workspace) # [!code ++] +``` + + +## What is a Remote Agent Server? + +A Remote Agent Server is an HTTP/WebSocket server that: +- **Packages the Agent SDK into containers** you can deploy on your own infrastructure (Kubernetes, VMs, on-prem, or cloud) +- **Runs agents** on dedicated infrastructure +- **Manages workspaces** (Docker containers or remote sandboxes) +- **Streams events** to clients via WebSocket +- **Handles command and file operations** (execute command, upload, download); see the [base class](https://github.com/All-Hands-AI/agent-sdk/blob/main/openhands-sdk/openhands/sdk/workspace/base.py) for more details +- **Provides isolation** between different agent executions + +Think of it as the "backend" for your agent, while your Python code acts as the "frontend" client.
+ +{/* +Same interfaces as local: +[BaseConversation](https://github.com/All-Hands-AI/agent-sdk/blob/main/openhands-sdk/openhands/sdk/conversation/base.py), +[ConversationStateProtocol](https://github.com/All-Hands-AI/agent-sdk/blob/main/openhands-sdk/openhands/sdk/conversation/base.py), +[EventsListBase](https://github.com/All-Hands-AI/agent-sdk/blob/main/openhands-sdk/openhands/sdk/conversation/events_list_base.py). Server-backed impl: +[RemoteConversation](https://github.com/All-Hands-AI/agent-sdk/blob/main/openhands-sdk/openhands/sdk/conversation/impl/remote_conversation.py). + */} + + +## Architecture Overview + +Remote Agent Servers follow a simple three-part architecture: + +```mermaid +graph TD + Client[Client Code] -->|HTTP / WebSocket| Server[Agent Server] + Server --> Workspace[Workspace] + + subgraph Workspace Types + Workspace --> Local[Local Folder] + Workspace --> Docker[Docker Container] + Workspace --> API[Remote Sandbox via API] + end + + Local --> Files[File System] + Docker --> Container[Isolated Runtime] + API --> Cloud[Cloud Infrastructure] + + style Client fill:#e1f5fe + style Server fill:#fff3e0 + style Workspace fill:#e8f5e8 +``` + +1. **Client (Python SDK)** — Your application creates and controls conversations using the SDK. +2. **Agent Server** — A lightweight HTTP/WebSocket service that runs the agent and manages workspace execution. +3. **Workspace** — An isolated environment (local, Docker, or remote VM) where the agent code runs. + +The same SDK API works across all three workspace types—you just switch which workspace the conversation connects to. + +## How Remote Conversations Work + +Each step in the diagram maps directly to how the SDK and server interact: + +### 1.
Workspace Connection → *(Client → Server)* + +When you create a conversation with a remote workspace (e.g., `DockerWorkspace` or `APIRemoteWorkspace`), the SDK automatically starts or connects to an agent server inside that workspace: + +```python +with DockerWorkspace(server_image="ghcr.io/openhands/agent-server:latest") as workspace: + conversation = Conversation(agent=agent, workspace=workspace) +``` + +This turns the local `Conversation` into a **[RemoteConversation](https://github.com/All-Hands-AI/agent-sdk/blob/main/openhands-sdk/openhands/sdk/conversation/impl/remote_conversation.py)** that speaks to the agent server over HTTP/WebSocket. + + +### 2. Server Initialization → *(Server → Workspace)* + +Once the workspace starts: +- It launches the agent server process. +- It waits for the server to be ready. +- It shares the server URL with the SDK client. + +You don’t need to manage this manually—the workspace context handles startup and teardown automatically. + +### 3. Event Streaming → *(Bidirectional WebSocket)* + +The client and agent server maintain a live WebSocket connection for streaming events: + +```python +def on_event(event): + print(f"Received: {type(event).__name__}") + +conversation = Conversation(agent=agent, workspace=workspace, callbacks=[on_event]) +``` + +This allows you to see real-time updates from the running agent as it executes tasks inside the workspace. + +### 4.
Workspace Supports File and Command Operations → *(Server ↔ Workspace)* + +The workspace supports file and command operations via the agent server API ([base class](https://github.com/All-Hands-AI/agent-sdk/blob/main/openhands-sdk/openhands/sdk/workspace/base.py)), ensuring isolation and consistent behavior: + +```python +workspace.file_upload(local_path, remote_path) +workspace.file_download(remote_path, local_path) +result = workspace.execute_command("ls -la") +print(result.stdout) +``` + +These commands are proxied through the agent server, whether it’s a Docker container or a remote VM, keeping your client code environment-agnostic. + +### Summary + +The architecture makes remote execution seamless: +- Your **client code** stays the same. +- The **agent server** manages execution and streaming. +- The **workspace** provides secure, isolated runtime environments. + +Switching from local to remote is just a matter of swapping the workspace class—no code rewrites needed. + +## Next Steps + +Explore different deployment options: + +- **[Local Agent Server](/sdk/guides/agent-server/local-server)** +- **[Docker Sandboxed Server](/sdk/guides/agent-server/docker-sandbox)** +- **[API Sandboxed Server](/sdk/guides/agent-server/api-sandbox)** + +For architectural details: +- **[Agent Server Package Architecture](/sdk/arch/agent-server-package)** - Remote execution architecture and deployment +- **[Workspace Package Architecture](/sdk/arch/workspace-package)** - Execution environments and isolation diff --git a/sdk/guides/custom-tools.mdx b/sdk/guides/custom-tools.mdx index 12b39b927..8426c10bd 100644 --- a/sdk/guides/custom-tools.mdx +++ b/sdk/guides/custom-tools.mdx @@ -17,7 +17,7 @@ tools = get_default_tools() agent = Agent(llm=llm, tools=tools) ``` -See [Tools Overview](/sdk/arch/tools/overview) for the complete list of available tools.
+See [Tools Package Architecture](/sdk/arch/tools-package) for the complete list of available tools and design philosophy. ## Understanding the Tool System @@ -27,7 +27,7 @@ The SDK's tool system is built around three core components: 2. **Observation** - Defines output data (what the tool returns) 3. **Executor** - Implements the tool's logic (what the tool does) -These components are tied together by a **ToolDefinition** that registers the tool with the agent. For architectural details and advanced usage patterns, see [Tool System Architecture](/sdk/arch/sdk/tool). +These components are tied together by a **ToolDefinition** that registers the tool with the agent. For architectural details and design principles, see [SDK Package Architecture - Tool System](/sdk/arch/sdk-package#4-tool-system---typed-capabilities). ## Creating a Custom Tool @@ -308,5 +308,6 @@ Create custom tools when you need to: ## Next Steps -- **[Tool System Architecture](/sdk/arch/sdk/tool)** - Deep dive into the tool system +- **[SDK Package Architecture](/sdk/arch/sdk-package)** - Deep dive into the tool system and other SDK components +- **[Tools Package Architecture](/sdk/arch/tools-package)** - Built-in tools design philosophy - **[Model Context Protocol (MCP) Integration](/sdk/guides/mcp)** - Use Model Context Protocol servers diff --git a/sdk/guides/hello-world.mdx b/sdk/guides/hello-world.mdx index a644dd3fc..d37adf50b 100644 --- a/sdk/guides/hello-world.mdx +++ b/sdk/guides/hello-world.mdx @@ -66,7 +66,7 @@ Use the preset agent with common built-in tools: agent = get_default_agent(llm=llm, cli_mode=True) ``` -The default agent includes BashTool, FileEditorTool, etc. See [Tools Overview](/sdk/arch/tools/overview) for the complete list of available tools. +The default agent includes BashTool, FileEditorTool, etc. See [Tools Package Architecture](/sdk/arch/tools-package) for the complete list of available tools. 
### Conversation Start a conversation to manage the agent's lifecycle: diff --git a/sdk/guides/llm-routing.mdx b/sdk/guides/llm-routing.mdx index 17a78ed23..b76c392f8 100644 --- a/sdk/guides/llm-routing.mdx +++ b/sdk/guides/llm-routing.mdx @@ -11,7 +11,7 @@ This example is available on GitHub: [examples/01_standalone_sdk/19_llm_routing. Automatically route requests to different LLMs based on task characteristics to optimize cost and performance: -```python icon="python" examples/01_standalone_sdk/19_llm_routing.py +```python icon="python" expandable examples/01_standalone_sdk/19_llm_routing.py import os from pydantic import SecretStr diff --git a/sdk/guides/mcp.mdx b/sdk/guides/mcp.mdx index 5d0d91e8d..1063dd191 100644 --- a/sdk/guides/mcp.mdx +++ b/sdk/guides/mcp.mdx @@ -242,6 +242,6 @@ mcp_config = { ## Next Steps -- **[MCP Architecture](/sdk/arch/sdk/mcp)** - Technical details and internals +- **[SDK Package Architecture - MCP](/sdk/arch/sdk-package#8-mcp---model-context-protocol)** - Technical details and design decisions - **[Custom Tools](/sdk/guides/custom-tools)** - Creating native SDK tools - **[Security Analyzer](/sdk/guides/security)** - Securing tool usage diff --git a/sdk/index.mdx b/sdk/index.mdx index 3eeef3d08..987791690 100644 --- a/sdk/index.mdx +++ b/sdk/index.mdx @@ -1,5 +1,5 @@ --- -title: OpenHands SDK +title: Agent SDK description: Build AI agents that write software. A clean, modular SDK with production-ready tools. icon: code mode: wide