OpenHands · VascoSch92 · Jun 18, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/examples/01_standalone_sdk/54_goal_completion_loop.py b/examples/01_standalone_sdk/54_goal_completion_loop.py
@@ -0,0 +1,74 @@
+"""The /goal command: pursue an objective until a judge LLM confirms it is done.
+
+A plain ``conversation.run()`` stops as soon as the agent *thinks* it is
+finished. The ``/goal`` loop is stricter: after each run it asks a second
+"judge" LLM to audit the transcript for authoritative evidence -- file
+contents, command output, test results -- that the objective is *provably*
+complete. If something is still missing, it re-prompts the agent with the
+judge's feedback and runs again, until the goal is genuinely done or a hard
+iteration cap is reached.
+
+That makes it a good fit for verifiable objectives like "make the tests pass":
+the agent cannot finish just by claiming success; the judge has to see green
+output first.
+
+Key concepts demonstrated:
+1. ``run_goal(conversation, objective, judge_llm, max_iterations=...)`` drives
+   the conversation from the outside, re-prompting until the judge is satisfied.
+2. A second, independent "judge" LLM grades completion -- separate from the
+   agent that does the work.
+3. The returned ``GoalOutcome`` reports whether the loop ended ``"complete"`` or
+   ``"capped"``, how many audit rounds it took, and the judge's final verdict.
+
+Because ``run_goal`` drives the conversation you pass in (it does not fork or
+spin up a sidecar), every turn -- objective, agent work, judge-driven followups
+-- lands in the same ``conversation.state.events`` history. It therefore
+composes with whatever agent, tools, or critic you already have.
+"""
+
+import os
+import tempfile
+
+from openhands.sdk import LLM, Agent, Conversation, Tool
+from openhands.sdk.conversation.goal import run_goal
+from openhands.tools.file_editor import FileEditorTool
+from openhands.tools.terminal import TerminalTool
+
+
+# Agent and judge share one model but are separate LLM instances: the agent
+# does the work, the judge grades it, and distinct usage_ids keep costs apart.
+base_url = os.getenv("LLM_BASE_URL")
+api_key = os.getenv("LLM_API_KEY")
+model = os.getenv("LLM_MODEL", "gpt-5.5")
+agent_llm = LLM(usage_id="agent", model=model, api_key=api_key, base_url=base_url)
+judge_llm = LLM(usage_id="goal-judge", model=model, api_key=api_key, base_url=base_url)
+
+tools = [Tool(name=TerminalTool.name), Tool(name=FileEditorTool.name)]
+agent = Agent(llm=agent_llm, tools=tools)
+
+workspace = tempfile.mkdtemp(prefix="goal_demo_")
+conversation = Conversation(agent=agent, workspace=workspace)
+
+# The objective is verifiable on purpose: the judge should only accept it
+# after seeing pytest pass, not on the agent's word alone.
+objective = (
+    "Create mathx.py with an add(a, b) function and test_mathx.py with a pytest "
+    "test for it. The goal is complete only when `python -m pytest -q` passes."
+)
+
+# Run, judge, and re-prompt until done or three audit rounds have passed.
+outcome = run_goal(conversation, objective, judge_llm, max_iterations=3)
+verdict = outcome.verdict
+
+banner = "=" * 70
+print("\n" + banner)
+print(f"Goal {outcome.status} after {outcome.iterations} audit round(s).")
+print(f"Judge score: {verdict.score:.2f}")
+if verdict.missing:
+    print(f"Still missing: {verdict.missing}")
+print(f"Workspace: {workspace}")
+print(banner)
+
+# Report cost (agent work + judge audits).
+total_cost = sum(llm.metrics.accumulated_cost for llm in (agent_llm, judge_llm))
+print(f"EXAMPLE_COST: {total_cost}")
diff --git a/openhands-sdk/openhands/sdk/conversation/goal/__init__.py b/openhands-sdk/openhands/sdk/conversation/goal/__init__.py
@@ -0,0 +1,42 @@
+"""The ``/goal`` command: judge-driven, self-continuing goal completion.
+
+A conversation-level command (not a critic) that drives the agent toward an
+objective: it sends the objective, runs the agent, judges completion with a
+second LLM, and re-prompts until the goal is done or a cap is reached.
+
+The decision logic lives in :class:`GoalController` (transport-agnostic, no
+I/O); :func:`run_goal` is a thin synchronous driver over it. An async
+agent-server task can reuse the same controller with its own I/O loop.
+
+Usage::
+
+    from openhands.sdk.conversation.goal import run_goal
+
+    outcome = run_goal(conversation, "make pytest pass for mathx.py", judge_llm)
+"""
+
+from openhands.sdk.conversation.goal.controller import (
+    GoalContinue,
+    GoalController,
+    GoalDone,
+    GoalOutcome,
+    GoalStatus,
+    GoalStatusName,
+    GoalStep,
+)
+from openhands.sdk.conversation.goal.judge import GoalVerdict, judge_goal
+from openhands.sdk.conversation.goal.runner import run_goal
+
+
+__all__ = [  # public API of the goal package, re-exported from its submodules
+    "GoalContinue",
+    "GoalController",
+    "GoalDone",
+    "GoalOutcome",
+    "GoalStatus",
+    "GoalStatusName",
+    "GoalStep",
+    "GoalVerdict",
+    "judge_goal",
+    "run_goal",
+]
diff --git a/openhands-sdk/openhands/sdk/conversation/goal/controller.py b/openhands-sdk/openhands/sdk/conversation/goal/controller.py
@@ -0,0 +1,130 @@
+"""The transport-agnostic brain of the ``/goal`` loop.
+
+``GoalController`` decides -- after each agent run finishes -- whether to
+continue (with a followup message) or stop (with a ``GoalOutcome``). It performs
+NO I/O: a *driver* (the sync ``run_goal``, or an async agent-server task) owns
+sending messages and running the agent; the controller only judges and decides.
+That split lets the sync and async drivers share identical decision logic.
+"""
+
+from collections.abc import Sequence
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+from openhands.sdk.conversation.goal.judge import GoalVerdict, judge_goal
+from openhands.sdk.conversation.goal.prompts import FOLLOWUP_PROMPT
+from openhands.sdk.event import Event
+from openhands.sdk.llm import LLM
+from openhands.sdk.logger import get_logger
+
+
+logger = get_logger(__name__)
+
+
+class GoalOutcome(BaseModel):
+    """Terminal result of a ``/goal`` loop.
+
+    ``status`` distinguishes genuine completion from hitting the iteration cap,
+    so a driver never has to guess whether a silent finish meant success.
+    """
+
+    status: Literal["complete", "capped"]  # "capped" = cap hit while incomplete
+    iterations: int = Field(ge=1, description="Number of audit rounds performed.")
+    verdict: GoalVerdict  # the judge's verdict from the final audit round
+
+
+GoalStatusName = Literal["running", "complete", "capped", "interrupted"]
+"""Lifecycle state of a ``/goal`` loop."""
+
+
+class GoalStatus(BaseModel):
+    """Live status of a ``/goal`` loop, for a UI progress chip.
+
+    The agent server publishes this as the ``value`` of a
+    ``ConversationStateUpdateEvent`` with ``key="goal"`` at each lifecycle point
+    (start, each round, and the terminal/interrupted state).
+    """
+
+    active: bool = Field(description="Whether the goal loop is still running.")
+    status: GoalStatusName  # one of: running, complete, capped, interrupted
+    iteration: int = Field(ge=0, description="Audit rounds completed so far.")
+    max_iterations: int = Field(ge=1)  # same lower bound GoalController enforces
+    objective: str  # presumably the objective text being pursued -- confirm
+    verdict: GoalVerdict | None = Field(
+        default=None, description="Last judge verdict; set once the loop ends."
+    )
+
+
+class GoalContinue(BaseModel):
+    """Decision to keep going: send ``followup`` before the next run."""
+
+    followup: str  # GoalController renders this from FOLLOWUP_PROMPT
+
+
+class GoalDone(BaseModel):
+    """Decision to stop: the loop finished with ``outcome``."""
+
+    outcome: GoalOutcome  # its status is either "complete" or "capped"
+
+
+GoalStep = GoalContinue | GoalDone  # keep going with a followup, or stop
+"""One decision returned by :meth:`GoalController.on_run_finished`."""
+
+
+class GoalController:
+    """Decision-maker for the ``/goal`` loop; never sends or runs anything.
+
+    The driver sends whatever :meth:`start` returns, then reports each finished
+    agent run through :meth:`on_run_finished` and acts on the returned step.
+    Round counting and the ``max_iterations`` cap are tracked here rather than
+    in the driver.
+    """
+
+    def __init__(
+        self, objective: str, judge_llm: LLM, *, max_iterations: int = 10
+    ) -> None:
+        if not objective.strip():
+            raise ValueError("Goal objective must not be empty.")
+        if max_iterations < 1:
+            raise ValueError("max_iterations must be >= 1.")
+        self.iteration = 0
+        self.max_iterations = max_iterations
+        self.judge_llm = judge_llm
+        self.objective = objective
+
+    def start(self) -> str:
+        """Return the opening message for the driver: the objective itself."""
+        return self.objective
+
+    def on_run_finished(self, events: Sequence[Event]) -> GoalStep:
+        """Audit a finished run and choose between continuing and stopping.
+
+        Bumps the round counter, has the judge LLM grade ``events``, and yields
+        a terminal :class:`GoalDone` when the goal is complete or the cap is
+        reached, otherwise a :class:`GoalContinue` carrying the followup.
+        """
+        self.iteration += 1
+        round_no, cap = self.iteration, self.max_iterations
+        verdict = judge_goal(self.judge_llm, self.objective, events)
+        logger.info(
+            "Goal audit %d/%d: score=%.2f complete=%s",
+            round_no,
+            cap,
+            verdict.score,
+            verdict.complete,
+        )
+
+        if verdict.complete or round_no >= cap:
+            # Completion takes precedence over the cap when both hold.
+            status: Literal["complete", "capped"] = (
+                "complete" if verdict.complete else "capped"
+            )
+            outcome = GoalOutcome(status=status, iterations=round_no, verdict=verdict)
+            return GoalDone(outcome=outcome)
+
+        # Not done and rounds remain: tell the agent what the judge found lacking.
+        gaps = verdict.missing or "Some requirements are not yet verified."
+        return GoalContinue(
+            followup=FOLLOWUP_PROMPT.format(iteration=round_no, missing=gaps)
+        )
diff --git a/openhands-sdk/openhands/sdk/conversation/goal/judge.py b/openhands-sdk/openhands/sdk/conversation/goal/judge.py
@@ -0,0 +1,121 @@
+"""LLM judge that decides whether a ``/goal`` objective is complete.
+
+This is the reusable kernel of the goal feature: a pure
+``objective + transcript -> verdict`` evaluator with no dependency on the
+critic machinery. The ``/goal`` runner uses it to drive continuation, but it
+can equally back a status command, a stop hook, or a server endpoint.
+"""
+
+import contextlib
+import json
+import re
+from collections.abc import Sequence
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from openhands.sdk.conversation.goal.prompts import JUDGE_PROMPT
+from openhands.sdk.event import Event, LLMConvertibleEvent
+from openhands.sdk.llm import LLM, Message, TextContent, content_to_str
+from openhands.sdk.logger import get_logger
+
+
+logger = get_logger(__name__)
+
+
+class GoalVerdict(BaseModel):
+    """The judge's verdict on whether the objective is complete."""
+
+    score: float = Field(  # confidence only; GoalController stops on ``complete``
+        ge=0.0,
+        le=1.0,
+        description="Probability (0-1) that the full objective is provably done.",
+    )
+    complete: bool = Field(
+        description="Whether the judge considers the objective complete."
+    )
+    missing: str = Field(
+        default="",
+        description="Concise description of what remains, or empty if complete.",
+    )
+
+
+def judge_goal(judge_llm: LLM, objective: str, events: Sequence[Event]) -> GoalVerdict:
+    """Ask the judge LLM whether ``objective`` is complete, given the transcript.
+
+    Args:
+        judge_llm: The second LLM that grades completion.
+        objective: The goal to audit against.
+        events: Conversation events; only LLM-convertible ones are rendered.
+
+    Returns:
+        The parsed GoalVerdict. An unparseable judge reply yields a
+        conservative zero-score, incomplete verdict, so the caller keeps
+        working instead of finishing on a false positive.
+    """
+    llm_events = [ev for ev in events if isinstance(ev, LLMConvertibleEvent)]
+    prompt = JUDGE_PROMPT.format(
+        objective=objective, transcript=_render_transcript(llm_events)
+    )
+
+    # completion() requires an on_token callback when stream=True, and only
+    # the final verdict text matters here, so audit with a non-streaming copy
+    # whenever the supplied LLM (e.g. a reused agent LLM) has streaming on.
+    auditor = judge_llm
+    if auditor.stream:
+        auditor = auditor.model_copy(update={"stream": False})
+    request = Message(role="user", content=[TextContent(text=prompt)])
+    verdict = _parse_verdict(auditor.completion(messages=[request]).message)
+    logger.debug("judge_goal verdict: %s", verdict)
+    return verdict
+
+
+def _render_transcript(events: Sequence[LLMConvertibleEvent]) -> str:
+    """Flatten events into ``role: text`` paragraphs for the judge prompt.
+
+    ``system`` messages are skipped -- the agent's system prompt is large and
+    has no goal-specific evidence -- as are messages with no text.
+    """
+    paragraphs: list[str] = []
+    for msg in LLMConvertibleEvent.events_to_messages(list(events)):
+        if msg.role == "system":
+            continue
+        text = "\n".join(content_to_str(msg.content)).strip()
+        if text:
+            paragraphs.append(f"{msg.role}: {text}")
+    return "\n\n".join(paragraphs)
+
+
+def _parse_verdict(message: Message) -> GoalVerdict:
+    """Normalize the judge response into a GoalVerdict, conservatively.
+
+    Unparseable replies score 0.0 and incomplete; a NaN or unconvertible score
+    becomes 0.0; a string ``complete`` counts only when it spells "true".
+    """
+    raw = "\n".join(content_to_str(message.content)).strip()
+    candidates = [raw]  # whole reply first, then the outermost {...} span
+    if block := re.search(r"\{.*\}", raw, re.DOTALL):
+        candidates.append(block.group(0))
+    data: dict[str, Any] | None = None
+    for candidate in candidates:
+        with contextlib.suppress(json.JSONDecodeError):
+            parsed = json.loads(candidate)
+            if isinstance(parsed, dict):
+                data = parsed
+                break
+    if data is None:
+        logger.warning("judge_goal: could not parse verdict: %r", raw)
+        return GoalVerdict(
+            score=0.0, complete=False, missing="Judge verdict could not be parsed."
+        )
+    try:
+        score = float(data.get("score", 0.0))
+    except (TypeError, ValueError, OverflowError):
+        score = 0.0
+    # NaN != NaN: without this guard min()/max() would clamp NaN up to 1.0.
+    score = max(0.0, min(1.0, score)) if score == score else 0.0
+    flag = data.get("complete", score >= 1.0)
+    if isinstance(flag, str):  # bool("false") is True, so compare the text
+        flag = flag.strip().lower() == "true"
+    missing = str(data.get("missing") or "")
+    return GoalVerdict(score=score, complete=bool(flag), missing=missing)