secondorderai · kinwo · May 17, 2026 · May 17, 2026 · May 17, 2026 · May 17, 2026
diff --git a/benchmarks/terminal-bench/README.md b/benchmarks/terminal-bench/README.md
@@ -0,0 +1,135 @@
+# Ouroboros Terminal-Bench 2.0 Pilot Harness
+
+This directory contains a local pilot harness for running the Ouroboros CLI on
+Terminal-Bench 2.0 through Harbor. It is meant to validate that Ouroboros can be
+installed, built, and invoked inside Harbor task containers.
+
+This is not a leaderboard submission package. It does not add audited
+leaderboard metadata or a full ATIF trajectory converter.
+
+## Files
+
+- `ouroboros_tbench_agent.py`: Harbor `BaseInstalledAgent` adapter for
+  Ouroboros.
+- `run-pilot.sh`: local convenience script for a one-concurrency pilot run.
+
+The directory name contains a hyphen, so it cannot be imported as a Python
+package. Instead, `run-pilot.sh` prepends this directory to `PYTHONPATH` and
+passes the adapter to Harbor via `--agent-import-path` as:
+
+```text
+ouroboros_tbench_agent:OuroborosInstalledAgent
+```
+
+## Prerequisites
+
+- Docker Desktop installed and running.
+- `uv` installed.
+- Network access from task containers for installing Bun and calling the model
+  provider.
+- `OPENAI_API_KEY` exported in the shell.
+
+Optional environment variables:
+
+- `OUROBOROS_TBENCH_MODEL`, default `openai/gpt-5.5`
+- `OUROBOROS_TBENCH_REASONING`, default `medium`
+- `OUROBOROS_TBENCH_MAX_STEPS`, default `50`
+- `OUROBOROS_TBENCH_N_CONCURRENT`, default `1`
+- `OUROBOROS_TBENCH_TIMEOUT_SEC`, default `3600`
+- `OUROBOROS_TBENCH_JOBS_DIR`, default `/private/tmp/ouroboros-tbench/jobs`
+
+## Developer Execution Plan
+
+1. Start Docker Desktop and verify the daemon is reachable:
+
+   ```bash
+   docker info
+   ```
+
+2. Export credentials:
+
+   ```bash
+   export OPENAI_API_KEY=...
+   ```
+
+3. Verify Harbor is available through `uv`:
+
+   ```bash
+   uv tool run harbor --help
+   ```
+
+4. Run the Harbor oracle sanity check:
+
+   ```bash
+   uv tool run harbor run --dataset terminal-bench@2.0 --agent oracle --n-concurrent 1
+   ```
+
+5. Run the Ouroboros pilot:
+
+   ```bash
+   benchmarks/terminal-bench/run-pilot.sh
+   ```
+
+6. Inspect results:
+
+   ```bash
+   ls -la /private/tmp/ouroboros-tbench/jobs
+   ```
+
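+   Open the latest Harbor job directory and inspect the trial `agent/`,
+   `verifier/`, `result.json`, and `trial.log` files. Ouroboros logs are written
+   as `agent/ouroboros.txt` (stdout followed by stderr),
+   `agent/ouroboros-stdout.txt`, and `agent/ouroboros-stderr.txt`. If you set
+   `OUROBOROS_TBENCH_JOBS_DIR`, look in that directory instead of the default
+   path shown above.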
+
+## Verification Without Running The Benchmark
+
+Check shell syntax:
+
+```bash
+bash -n benchmarks/terminal-bench/run-pilot.sh
+```
+
+Validate the adapter import:
+
+```bash
+PYTHONPATH=benchmarks/terminal-bench \
+  uv tool run --with harbor python -c "from ouroboros_tbench_agent import OuroborosInstalledAgent; print(OuroborosInstalledAgent.name())"
+```
+
+Run the repo verification suite:
+
+```bash
+bun run verify
+```
+
+## Troubleshooting
+
+### Docker daemon is down
+
+If `docker info` fails, start Docker Desktop and wait until it reports that the
+engine is running.
+
+### `OPENAI_API_KEY` is missing
+
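+`run-pilot.sh` exits early when `OPENAI_API_KEY` is unset or empty. The check
+is unconditional: the default model is `openai/gpt-5.5`, and
+`OPENAI_API_KEY` is the only provider credential the adapter forwards into
+the task container.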
+
+### Harbor is missing
+
+Run `uv tool run harbor --help` (the same invocation `run-pilot.sh` falls back to when `harbor` is not on `PATH`). If `uv` cannot resolve Harbor, install it with:
+
+```bash
+uv tool install harbor
+```
+
+### Container setup fails while installing Bun
+
+Confirm the task container has outbound network access and can reach
+`https://bun.sh`. Some Terminal-Bench tasks may intentionally restrict internet
+access; those tasks are not suitable for this pilot adapter without pre-baking
+Ouroboros and Bun into the agent image.
+
+### Ouroboros build fails in the task container
+
+Inspect `agent/ouroboros-stderr.txt` and `trial.log` in the latest Harbor job
+directory. The adapter uploads a filtered copy of the current repo and runs
+`bun install` followed by `bun run --filter @ouroboros/cli build`.
diff --git a/benchmarks/terminal-bench/ouroboros_tbench_agent.py b/benchmarks/terminal-bench/ouroboros_tbench_agent.py
@@ -0,0 +1,169 @@
+import os
+import shlex
+import shutil
+from pathlib import Path
+
+from harbor.agents.installed.base import BaseInstalledAgent, with_prompt_template
+from harbor.environments.base import BaseEnvironment
+from harbor.models.agent.context import AgentContext
+
+
+# Repository root on the host. This file lives two directories below it
+# (benchmarks/terminal-bench/), hence parents[2].
+REPO_ROOT = Path(__file__).resolve().parents[2]
+# Directory inside the task container that receives the uploaded repo copy.
+CONTAINER_REPO_DIR = "/installed-agent/ouroboros"
+# File names written under /logs/agent by the run command: the combined log
+# (stdout followed by stderr), then the raw stdout and stderr captures.
+AGENT_LOG_NAME = "ouroboros.txt"
+AGENT_STDOUT_NAME = "ouroboros-stdout.txt"
+AGENT_STDERR_NAME = "ouroboros-stderr.txt"
+
+
+def _env_or_default(name: str, default: str) -> str:
+    value = os.environ.get(name)
+    return value if value and value.strip() else default
+
+
+class OuroborosInstalledAgent(BaseInstalledAgent):
+    """Harbor installed-agent adapter for running Ouroboros CLI on TB 2.0."""
+
+    SUPPORTS_ATIF = False
+
+    @staticmethod
+    def name() -> str:
+        return "ouroboros"
+
+    def get_version_command(self) -> str | None:
+        return (
+            f"cd {shlex.quote(CONTAINER_REPO_DIR)} && "
+            "./packages/cli/dist/ouroboros --version"
+        )
+
+    async def install(self, environment: BaseEnvironment) -> None:
+        await self.exec_as_root(
+            environment,
+            command=(
+                "if command -v apk >/dev/null 2>&1; then "
+                "apk add --no-cache bash curl unzip ca-certificates tar; "
+                "elif command -v apt-get >/dev/null 2>&1; then "
+                "apt-get update && apt-get install -y "
+                "bash curl unzip ca-certificates tar; "
+                "elif command -v yum >/dev/null 2>&1; then "
+                "yum install -y bash curl unzip ca-certificates tar; "
+                "else "
+                "echo 'No supported package manager found; assuming prerequisites exist' >&2; "
+                "fi"
+            ),
+            env={"DEBIAN_FRONTEND": "noninteractive"},
+            timeout_sec=300,
+        )
+
+        await self.exec_as_agent(
+            environment,
+            command=(
+                "if ! command -v bun >/dev/null 2>&1; then "
+                "curl -fsSL https://bun.sh/install | bash; "
+                "fi"
+            ),
+            timeout_sec=300,
+        )
+
+        upload_dir = self._prepare_repo_upload()
+        await environment.upload_dir(upload_dir, CONTAINER_REPO_DIR)
+
+        await self.exec_as_agent(
+            environment,
+            command=(
+                'export BUN_INSTALL="$HOME/.bun"; '
+                'export PATH="$BUN_INSTALL/bin:$PATH"; '
+                "bun install && bun run --filter @ouroboros/cli build"
+            ),
+            cwd=CONTAINER_REPO_DIR,
+            timeout_sec=900,
+        )
+
+    @with_prompt_template
+    async def run(
+        self, instruction: str, environment: BaseEnvironment, context: AgentContext
+    ) -> None:
+        model = _env_or_default("OUROBOROS_TBENCH_MODEL", "openai/gpt-5.5")
+        reasoning = _env_or_default("OUROBOROS_TBENCH_REASONING", "medium")
+        max_steps = _env_or_default("OUROBOROS_TBENCH_MAX_STEPS", "50")
+
+        env = {
+            "OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY", ""),
+            "OUROBOROS_TBENCH_MODEL": model,
+            "OUROBOROS_TBENCH_REASONING": reasoning,
+            "OUROBOROS_TBENCH_MAX_STEPS": max_steps,
+        }
+        env = {key: value for key, value in env.items() if value}
+
+        command = (
+            "mkdir -p /logs/agent && "
+            'export BUN_INSTALL="$HOME/.bun"; '
+            'export PATH="$BUN_INSTALL/bin:$PATH"; '
+            f"{shlex.quote(CONTAINER_REPO_DIR)}/packages/cli/dist/ouroboros "
+            f"--model {shlex.quote(model)} "
+            f"--reasoning-effort {shlex.quote(reasoning)} "
+            "--no-stream --no-rsi "
+            f"--max-steps {shlex.quote(max_steps)} "
+            f"-m {shlex.quote(instruction)} "
+            f"> /logs/agent/{AGENT_STDOUT_NAME} "
+            f"2> /logs/agent/{AGENT_STDERR_NAME}; "
+            "status=$?; "
+            f"cat /logs/agent/{AGENT_STDOUT_NAME} "
+            f"/logs/agent/{AGENT_STDERR_NAME} > /logs/agent/{AGENT_LOG_NAME}; "
+            "exit $status"
+        )
+
+        await self.exec_as_agent(
+            environment,
+            command=command,
+            env=env,
+            timeout_sec=int(_env_or_default("OUROBOROS_TBENCH_TIMEOUT_SEC", "3600")),
+        )
+
+    def populate_context_post_run(self, context: AgentContext) -> None:
+        log_path = self.logs_dir / AGENT_LOG_NAME
+        stdout_path = self.logs_dir / AGENT_STDOUT_NAME
+        stderr_path = self.logs_dir / AGENT_STDERR_NAME
+
+        context.metadata = {
+            "agent": self.name(),
+            "model": _env_or_default("OUROBOROS_TBENCH_MODEL", "openai/gpt-5.5"),
+            "reasoning_effort": _env_or_default("OUROBOROS_TBENCH_REASONING", "medium"),
+            "max_steps": _env_or_default("OUROBOROS_TBENCH_MAX_STEPS", "50"),
+            "log_path": str(log_path),
+            "stdout_path": str(stdout_path),
+            "stderr_path": str(stderr_path),
+            "log_excerpt": self._read_excerpt(log_path),
+            "stdout_excerpt": self._read_excerpt(stdout_path),
+            "stderr_excerpt": self._read_excerpt(stderr_path),
+        }
+
+    def _prepare_repo_upload(self) -> Path:
+        target = self.logs_dir / "repo-upload"
+        if target.exists():
+            shutil.rmtree(target)
+
+        ignore = shutil.ignore_patterns(
+            ".git",
+            ".DS_Store",
+            "node_modules",
+            "dist",
+            "out",
+            "coverage",
+            ".cache",
+            ".turbo",
+            "tmp",
+            "logs",
+            "*.log",
+            ".ouroboros-transcripts.db",
+        )
+        shutil.copytree(REPO_ROOT, target, ignore=ignore)
+        return target
+
+    def _read_excerpt(self, path: Path, limit: int = 4000) -> str | None:
+        if not path.exists():
+            return None
+
+        text = path.read_text(errors="replace")
+        if len(text) <= limit:
+            return text
+        return text[:limit] + "\n...[truncated]"
diff --git a/benchmarks/terminal-bench/run-pilot.sh b/benchmarks/terminal-bench/run-pilot.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+JOBS_DIR="${OUROBOROS_TBENCH_JOBS_DIR:-/private/tmp/ouroboros-tbench/jobs}"
+N_CONCURRENT="${OUROBOROS_TBENCH_N_CONCURRENT:-1}"
+
+export OUROBOROS_TBENCH_MODEL="${OUROBOROS_TBENCH_MODEL:-openai/gpt-5.5}"
+export OUROBOROS_TBENCH_REASONING="${OUROBOROS_TBENCH_REASONING:-medium}"
+export OUROBOROS_TBENCH_MAX_STEPS="${OUROBOROS_TBENCH_MAX_STEPS:-50}"
+
+if ! command -v uv >/dev/null 2>&1; then
+  echo "error: uv is required. Install it from https://docs.astral.sh/uv/." >&2
+  exit 1
+fi
+
+if ! command -v docker >/dev/null 2>&1; then
+  echo "error: docker is required. Install Docker Desktop and start it." >&2
+  exit 1
+fi
+
+if ! docker info >/dev/null 2>&1; then
+  echo "error: Docker daemon is not reachable. Start Docker Desktop, then retry." >&2
+  exit 1
+fi
+
+if [[ -z "${OPENAI_API_KEY:-}" ]]; then
+  echo "error: OPENAI_API_KEY is required for the default openai/gpt-5.5 run." >&2
+  exit 1
+fi
+
+if command -v harbor >/dev/null 2>&1; then
+  HARBOR_CMD=(harbor)
+else
+  HARBOR_CMD=(uv tool run harbor)
+fi
+
+mkdir -p "$JOBS_DIR"
+
+echo "Running Ouroboros Terminal-Bench 2.0 pilot"
+echo "repo: $REPO_ROOT"
+echo "jobs: $JOBS_DIR"
+echo "model: $OUROBOROS_TBENCH_MODEL"
+echo "reasoning: $OUROBOROS_TBENCH_REASONING"
+echo "max steps: $OUROBOROS_TBENCH_MAX_STEPS"
+echo "concurrency: $N_CONCURRENT"
+
+cd "$REPO_ROOT"
+
+PYTHONPATH="$SCRIPT_DIR${PYTHONPATH:+:$PYTHONPATH}" \
+  "${HARBOR_CMD[@]}" run \
+    --dataset terminal-bench@2.0 \
+    --agent-import-path ouroboros_tbench_agent:OuroborosInstalledAgent \
+    --n-concurrent "$N_CONCURRENT" \
+    --jobs-dir "$JOBS_DIR" \
+    "$@"