Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions examples/01_standalone_sdk/54_goal_completion_loop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""The /goal command: pursue an objective until a judge LLM confirms it is done.

A plain ``conversation.run()`` stops as soon as the agent *thinks* it is
finished. The ``/goal`` loop is stricter: after each run it asks a second
"judge" LLM to audit the transcript for authoritative evidence -- file
contents, command output, test results -- that the objective is *provably*
complete. If something is still missing, it re-prompts the agent with the
judge's feedback and runs again, until the goal is genuinely done or a hard
iteration cap is reached.

That makes it a good fit for verifiable objectives like "make the tests pass":
the agent cannot finish just by claiming success; the judge has to see green
output first.

Key concepts demonstrated:
1. ``run_goal(conversation, objective, judge_llm, max_iterations=...)`` drives
the conversation from the outside, re-prompting until the judge is satisfied.
2. A second, independent "judge" LLM grades completion -- separate from the
agent that does the work.
3. The returned ``GoalOutcome`` reports whether the loop ended ``"complete"`` or
   ``"capped"``, how many audit rounds it took, and the judge's final verdict.

Because ``run_goal`` drives the conversation you pass in (it does not fork or
spin up a sidecar), every turn -- objective, agent work, judge-driven followups
-- lands in the same ``conversation.state.events`` history. It therefore
composes with whatever agent, tools, or critic you already have.
"""

import os
import tempfile

from openhands.sdk import LLM, Agent, Conversation, Tool
from openhands.sdk.conversation.goal import run_goal
from openhands.tools.file_editor import FileEditorTool
from openhands.tools.terminal import TerminalTool


# Two LLM instances are built from the same environment settings: one performs
# the work, the other audits it. Distinct ``usage_id`` values keep their costs
# separately accounted.
model = os.getenv("LLM_MODEL", "gpt-5.5")
api_key = os.getenv("LLM_API_KEY")
base_url = os.getenv("LLM_BASE_URL")
agent_llm = LLM(
    usage_id="agent",
    model=model,
    api_key=api_key,
    base_url=base_url,
)
judge_llm = LLM(
    usage_id="goal-judge",
    model=model,
    api_key=api_key,
    base_url=base_url,
)

# The working agent gets a terminal and a file editor.
agent = Agent(
    llm=agent_llm,
    tools=[Tool(name=tool.name) for tool in (TerminalTool, FileEditorTool)],
)

# Run inside a fresh temporary directory; its path is printed at the end.
workspace = tempfile.mkdtemp(prefix="goal_demo_")
conversation = Conversation(agent=agent, workspace=workspace)

# The objective is deliberately verifiable: the judge should only accept it
# after seeing pytest pass, not after the agent merely says it passed.
objective = (
    "Create mathx.py with an add(a, b) function and test_mathx.py with a pytest "
    "test for it. The goal is complete only when `python -m pytest -q` passes."
)

# Run the agent, audit, and re-prompt -- at most three audit rounds.
outcome = run_goal(conversation, objective, judge_llm, max_iterations=3)

banner = "=" * 70
print("\n" + banner)
print(f"Goal {outcome.status} after {outcome.iterations} audit round(s).")
print(f"Judge score: {outcome.verdict.score:.2f}")
if outcome.verdict.missing:
    print(f"Still missing: {outcome.verdict.missing}")
print(f"Workspace: {workspace}")
print(banner)

# Total spend covers both the agent's work and the judge's audits.
cost = sum(llm.metrics.accumulated_cost for llm in (agent_llm, judge_llm))
print(f"EXAMPLE_COST: {cost}")
42 changes: 42 additions & 0 deletions openhands-sdk/openhands/sdk/conversation/goal/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""The ``/goal`` command: judge-driven, self-continuing goal completion.

A conversation-level command (not a critic) that drives the agent toward an
objective: it sends the objective, runs the agent, judges completion with a
second LLM, and re-prompts until the goal is done or a cap is reached.

The decision logic lives in :class:`GoalController` (transport-agnostic, no
I/O); :func:`run_goal` is a thin synchronous driver over it. An async
agent-server task can reuse the same controller with its own I/O loop.

Usage::

from openhands.sdk.conversation.goal import run_goal

outcome = run_goal(conversation, "make pytest pass for mathx.py", judge_llm)
"""

from openhands.sdk.conversation.goal.controller import (
GoalContinue,
GoalController,
GoalDone,
GoalOutcome,
GoalStatus,
GoalStatusName,
GoalStep,
)
from openhands.sdk.conversation.goal.judge import GoalVerdict, judge_goal
from openhands.sdk.conversation.goal.runner import run_goal


# Public API of the ``goal`` package: every name here is re-exported from the
# ``controller``, ``judge`` or ``runner`` submodule imported above.
__all__ = [
    "GoalContinue",
    "GoalController",
    "GoalDone",
    "GoalOutcome",
    "GoalStatus",
    "GoalStatusName",
    "GoalStep",
    "GoalVerdict",
    "judge_goal",
    "run_goal",
]
130 changes: 130 additions & 0 deletions openhands-sdk/openhands/sdk/conversation/goal/controller.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
"""The transport-agnostic brain of the ``/goal`` loop.

``GoalController`` decides -- after each agent run finishes -- whether to
continue (with a followup message) or stop (with a ``GoalOutcome``). It performs
NO I/O: a *driver* (the sync ``run_goal``, or an async agent-server task) owns
sending messages and running the agent; the controller only judges and decides.
That split lets the sync and async drivers share identical decision logic.
"""

from collections.abc import Sequence
from typing import Literal

from pydantic import BaseModel, Field

from openhands.sdk.conversation.goal.judge import GoalVerdict, judge_goal
from openhands.sdk.conversation.goal.prompts import FOLLOWUP_PROMPT
from openhands.sdk.event import Event
from openhands.sdk.llm import LLM
from openhands.sdk.logger import get_logger


logger = get_logger(__name__)


class GoalOutcome(BaseModel):
    """Final result handed back when a ``/goal`` loop stops.

    ``status`` records *why* the loop stopped -- ``"complete"`` when the judge
    accepted the objective, ``"capped"`` when the iteration limit was reached
    first -- so a driver never has to infer success from a silent finish.
    ``verdict`` is the judge's verdict from the last audit round.
    """

    status: Literal["complete", "capped"]
    iterations: int = Field(
        description="Number of audit rounds performed.",
        ge=1,
    )
    verdict: GoalVerdict


# Closed set of state names accepted by ``GoalStatus.status``.
GoalStatusName = Literal["running", "complete", "capped", "interrupted"]
"""Lifecycle state of a ``/goal`` loop."""


class GoalStatus(BaseModel):
    """Snapshot of a ``/goal`` loop's progress, suitable for a UI progress chip.

    The agent server publishes this as the ``value`` of a
    ``ConversationStateUpdateEvent`` with ``key="goal"`` at each lifecycle point
    (start, each round, and the terminal/interrupted state).
    """

    active: bool = Field(
        description="Whether the goal loop is still running.",
    )
    status: GoalStatusName
    iteration: int = Field(
        description="Audit rounds completed so far.",
        ge=0,
    )
    max_iterations: int = Field(ge=1)
    objective: str
    verdict: GoalVerdict | None = Field(
        description="Last judge verdict; set once the loop ends.",
        default=None,
    )


class GoalContinue(BaseModel):
    """Decision to keep going: send ``followup`` before the next run."""

    # Message text for the next run; ``GoalController.on_run_finished`` builds
    # it from ``FOLLOWUP_PROMPT`` and the judge's ``missing`` notes.
    followup: str


class GoalDone(BaseModel):
    """Decision to stop: the loop finished with ``outcome``."""

    # Terminal result: stop reason, audit-round count and the last verdict.
    outcome: GoalOutcome


# Either "keep going" (``GoalContinue``) or "stop" (``GoalDone``).
GoalStep = GoalContinue | GoalDone
"""One decision returned by :meth:`GoalController.on_run_finished`."""


class GoalController:
    """Decision-maker for the ``/goal`` loop; it judges but never sends or runs.

    Protocol for a driver: call :meth:`start` once to obtain the opening
    message, then call :meth:`on_run_finished` after each agent run and act on
    the returned step. The iteration counter and the ``max_iterations`` cap are
    tracked here so that drivers contain no decision logic of their own.
    """

    def __init__(
        self, objective: str, judge_llm: LLM, *, max_iterations: int = 10
    ) -> None:
        """Validate the inputs and reset the audit counter to zero.

        Raises:
            ValueError: If ``objective`` is empty or whitespace-only, or if
                ``max_iterations`` is below 1.
        """
        if not objective.strip():
            raise ValueError("Goal objective must not be empty.")
        if max_iterations < 1:
            raise ValueError("max_iterations must be >= 1.")
        self.objective = objective
        self.judge_llm = judge_llm
        self.max_iterations = max_iterations
        self.iteration = 0

    def start(self) -> str:
        """Return the opening message for the driver: the objective itself."""
        return self.objective

    def on_run_finished(self, events: Sequence[Event]) -> GoalStep:
        """Audit a finished run and choose between continuing and stopping.

        Each call counts as one audit round: the counter is bumped, the judge
        LLM grades ``events`` against the objective, and the result is logged.

        Returns:
            A :class:`GoalDone` when the judge reports completion or the cap
            has been reached (completion is checked first), otherwise a
            :class:`GoalContinue` carrying the followup message.
        """
        self.iteration += 1
        verdict = judge_goal(self.judge_llm, self.objective, events)
        logger.info(
            "Goal audit %d/%d: score=%.2f complete=%s",
            self.iteration,
            self.max_iterations,
            verdict.score,
            verdict.complete,
        )

        if verdict.complete:
            return self._finish("complete", verdict)
        if self.iteration >= self.max_iterations:
            return self._finish("capped", verdict)

        # Fall back to a generic note when the judge gave no specifics.
        if verdict.missing:
            still_missing = verdict.missing
        else:
            still_missing = "Some requirements are not yet verified."
        return GoalContinue(
            followup=FOLLOWUP_PROMPT.format(
                iteration=self.iteration, missing=still_missing
            )
        )

    def _finish(
        self, status: Literal["complete", "capped"], verdict: GoalVerdict
    ) -> GoalDone:
        """Wrap the current round count and ``verdict`` in a terminal step."""
        result = GoalOutcome(status=status, iterations=self.iteration, verdict=verdict)
        return GoalDone(outcome=result)
121 changes: 121 additions & 0 deletions openhands-sdk/openhands/sdk/conversation/goal/judge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""LLM judge that decides whether a ``/goal`` objective is complete.

This is the reusable kernel of the goal feature: a pure
``objective + transcript -> verdict`` evaluator with no dependency on the
critic machinery. The ``/goal`` runner uses it to drive continuation, but it
can equally back a status command, a stop hook, or a server endpoint.
"""

import contextlib
import json
import math
import re
from collections.abc import Sequence
from typing import Any

from pydantic import BaseModel, Field

from openhands.sdk.conversation.goal.prompts import JUDGE_PROMPT
from openhands.sdk.event import Event, LLMConvertibleEvent
from openhands.sdk.llm import LLM, Message, TextContent, content_to_str
from openhands.sdk.logger import get_logger


logger = get_logger(__name__)


class GoalVerdict(BaseModel):
    """Outcome of one judge audit: a score, a done flag, and what is left."""

    score: float = Field(
        description="Probability (0-1) that the full objective is provably done.",
        ge=0.0,
        le=1.0,
    )
    complete: bool = Field(
        description="Whether the judge considers the objective complete.",
    )
    missing: str = Field(
        description="Concise description of what remains, or empty if complete.",
        default="",
    )


def judge_goal(judge_llm: LLM, objective: str, events: Sequence[Event]) -> GoalVerdict:
    """Ask the judge LLM whether the transcript proves ``objective`` is done.

    Args:
        judge_llm: The second LLM that grades completion.
        objective: The goal to audit against.
        events: Conversation events; only ``LLMConvertibleEvent`` instances
            are rendered into the transcript, the rest are dropped.

    Returns:
        The parsed GoalVerdict. If the judge's reply cannot be parsed, the
        verdict is a conservative "not complete" with a low score, so the
        caller keeps working instead of finishing on a bad reply.
    """
    llm_events = [event for event in events if isinstance(event, LLMConvertibleEvent)]
    audit_prompt = JUDGE_PROMPT.format(
        objective=objective,
        transcript=_render_transcript(llm_events),
    )

    # Only the final verdict text matters, so streaming is switched off on a
    # copy. This lets a streaming agent LLM double as the judge without hitting
    # completion()'s requirement of an on_token callback when stream=True.
    if judge_llm.stream:
        judge_llm = judge_llm.model_copy(update={"stream": False})

    audit_request = Message(role="user", content=[TextContent(text=audit_prompt)])
    reply = judge_llm.completion(messages=[audit_request])

    verdict = _parse_verdict(reply.message)
    logger.debug("judge_goal verdict: %s", verdict)
    return verdict


def _render_transcript(events: Sequence[LLMConvertibleEvent]) -> str:
    """Flatten events into ``role: text`` paragraphs separated by blank lines.

    Messages with the ``system`` role are left out: the agent's system prompt
    is large (~thousands of tokens) and holds no goal-specific evidence, so
    including it would only raise the judge's token cost on every audit.
    Messages whose text is empty after stripping are left out as well.
    """
    paragraphs: list[str] = []
    for msg in LLMConvertibleEvent.events_to_messages(list(events)):
        if msg.role == "system":
            continue
        role = msg.role
        text = "\n".join(content_to_str(msg.content)).strip()
        if not text:
            continue
        paragraphs.append(f"{role}: {text}")
    return "\n\n".join(paragraphs)


def _coerce_score(value: Any) -> float:
    """Turn the judge's ``score`` field into a float in ``[0.0, 1.0]``.

    Anything that is not a usable number -- wrong type, unparseable string,
    an integer too large for a float, NaN or +/-Infinity -- yields ``0.0``.
    Finite values outside the range are clamped.
    """
    try:
        score = float(value)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    # ``json.loads`` accepts the NaN/Infinity literals. ``min(1.0, nan)``
    # returns 1.0, so a plain clamp would turn a garbage score into a perfect
    # one; reject non-finite values before clamping.
    if not math.isfinite(score):
        return 0.0
    return max(0.0, min(1.0, score))


def _coerce_complete(value: Any) -> bool:
    """Interpret the judge's ``complete`` field without over-trusting it.

    Only a real boolean, the number 1, or an affirmative string (``"true"``,
    ``"yes"``, ``"1"``; case-insensitive) counts as complete. Plain ``bool()``
    would treat any non-empty string -- including ``"false"`` -- as True.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        return value.strip().lower() in ("true", "yes", "1")
    if isinstance(value, (int, float)):
        return value == 1
    return False


def _parse_verdict(message: Message) -> GoalVerdict:
    """Normalize the judge response into a GoalVerdict, conservatively.

    The reply text is tried as JSON as-is, then as the span from its first
    ``{`` to its last ``}`` (to tolerate surrounding prose or code fences).

    Args:
        message: The judge LLM's reply.

    Returns:
        A GoalVerdict built from the first candidate that decodes to a JSON
        object. If none does, a "not complete" verdict with score 0.0. A
        missing ``complete`` key falls back to ``score >= 1.0``; malformed
        ``score`` or ``complete`` values are read as "not done" rather than
        trusted.
    """
    raw = "\n".join(content_to_str(message.content)).strip()

    data: dict[str, Any] | None = None
    candidates = [raw]
    block = re.search(r"\{.*\}", raw, re.DOTALL)
    if block:
        candidates.append(block.group(0))
    for candidate in candidates:
        with contextlib.suppress(json.JSONDecodeError):
            parsed = json.loads(candidate)
            if isinstance(parsed, dict):
                data = parsed
                break

    if data is None:
        logger.warning("judge_goal: could not parse verdict: %r", raw)
        return GoalVerdict(
            score=0.0, complete=False, missing="Judge verdict could not be parsed."
        )

    score = _coerce_score(data.get("score", 0.0))
    if "complete" in data:
        complete = _coerce_complete(data["complete"])
    else:
        # No explicit flag: only a full score implies completion.
        complete = score >= 1.0

    return GoalVerdict(
        score=score,
        complete=complete,
        missing=str(data.get("missing") or ""),
    )
Loading
Loading