diff --git a/examples/01_standalone_sdk/54_goal_completion_loop.py b/examples/01_standalone_sdk/54_goal_completion_loop.py
new file mode 100644
index 0000000000..a50b60720f
--- /dev/null
+++ b/examples/01_standalone_sdk/54_goal_completion_loop.py
@@ -0,0 +1,74 @@
+"""The /goal command: pursue an objective until a judge LLM confirms it is done.
+
+A plain ``conversation.run()`` stops as soon as the agent *thinks* it is
+finished. The ``/goal`` loop is stricter: after each run it asks a second
+"judge" LLM to audit the transcript for authoritative evidence -- file
+contents, command output, test results -- that the objective is *provably*
+complete. If something is still missing, it re-prompts the agent with the
+judge's feedback and runs again, until the goal is genuinely done or a hard
+iteration cap is reached.
+
+That makes it a good fit for verifiable objectives like "make the tests pass":
+the agent cannot finish just by claiming success; the judge has to see green
+output first.
+
+Key concepts demonstrated:
+1. ``run_goal(conversation, objective, judge_llm, max_iterations=...)`` drives
+   the conversation from the outside, re-prompting until the judge is satisfied.
+2. A second, independent "judge" LLM grades completion -- separate from the
+   agent that does the work.
+3. The returned ``GoalOutcome`` reports whether the loop ended ``"complete"`` or
+   ``"capped"``, how many audit rounds it took, and the judge's final verdict.
+
+Because ``run_goal`` drives the conversation you pass in (it does not fork or
+spin up a sidecar), every turn -- objective, agent work, judge-driven followups
+-- lands in the same ``conversation.state.events`` history. It therefore
+composes with whatever agent, tools, or critic you already have.
+"""
+
+import os
+import tempfile
+
+from openhands.sdk import LLM, Agent, Conversation, Tool
+from openhands.sdk.conversation.goal import run_goal
+from openhands.tools.file_editor import FileEditorTool
+from openhands.tools.terminal import TerminalTool
+
+
+# Worker and judge use the same model and endpoint but are separate LLM objects:
+# distinct ``usage_id`` values keep the agent's and the judge's costs apart.
+model = os.getenv("LLM_MODEL", "gpt-5.5")
+api_key = os.getenv("LLM_API_KEY")
+base_url = os.getenv("LLM_BASE_URL")
+agent_llm = LLM(usage_id="agent", model=model, api_key=api_key, base_url=base_url)
+judge_llm = LLM(usage_id="goal-judge", model=model, api_key=api_key, base_url=base_url)
+
+tools = [Tool(name=TerminalTool.name), Tool(name=FileEditorTool.name)]
+agent = Agent(llm=agent_llm, tools=tools)
+
+# Work in a fresh temp directory; its path is printed at the end for inspection.
+workspace = tempfile.mkdtemp(prefix="goal_demo_")
+conversation = Conversation(agent=agent, workspace=workspace)
+
+# A verifiable objective: the judge may only accept it after seeing pytest
+# actually pass in the transcript -- the agent's say-so is not evidence.
+objective = (
+    "Create mathx.py with an add(a, b) function and test_mathx.py with a pytest "
+    "test for it. The goal is complete only when `python -m pytest -q` passes."
+)
+
+# Alternate agent runs and judge audits until success or three audit rounds.
+outcome = run_goal(conversation, objective, judge_llm, max_iterations=3)
+
+banner = "=" * 70
+print(f"\n{banner}")
+print(f"Goal {outcome.status} after {outcome.iterations} audit round(s).")
+print(f"Judge score: {outcome.verdict.score:.2f}")
+if outcome.verdict.missing:
+    print(f"Still missing: {outcome.verdict.missing}")
+print(f"Workspace: {workspace}")
+print(banner)
+
+# Total spend for the example: agent work plus judge audits.
+cost = agent_llm.metrics.accumulated_cost + judge_llm.metrics.accumulated_cost
+print(f"EXAMPLE_COST: {cost}")
diff --git a/openhands-sdk/openhands/sdk/conversation/goal/__init__.py b/openhands-sdk/openhands/sdk/conversation/goal/__init__.py
new file mode 100644
index 0000000000..766318e385
--- /dev/null
+++ b/openhands-sdk/openhands/sdk/conversation/goal/__init__.py
@@ -0,0 +1,42 @@
+"""The ``/goal`` command: judge-driven, self-continuing goal completion.
+
+A conversation-level command (not a critic) that drives the agent toward an
+objective: it sends the objective, runs the agent, judges completion with a
+second LLM, and re-prompts until the goal is done or a cap is reached.
+
+The decision logic lives in :class:`GoalController` (transport-agnostic, no
+I/O); :func:`run_goal` is a thin synchronous driver over it. An async
+agent-server task can reuse the same controller with its own I/O loop.
+
+Usage::
+
+    from openhands.sdk.conversation.goal import run_goal
+
+    outcome = run_goal(conversation, "make pytest pass for mathx.py", judge_llm)
+"""
+
+from openhands.sdk.conversation.goal.controller import (
+    GoalContinue,
+    GoalController,
+    GoalDone,
+    GoalOutcome,
+    GoalStatus,
+    GoalStatusName,
+    GoalStep,
+)
+from openhands.sdk.conversation.goal.judge import GoalVerdict, judge_goal
+from openhands.sdk.conversation.goal.runner import run_goal
+
+
+__all__ = [
+    "GoalContinue",
+    "GoalController",
+    "GoalDone",
+    "GoalOutcome",
+    "GoalStatus",
+    "GoalStatusName",
+    "GoalStep",
+    "GoalVerdict",
+    "judge_goal",
+    "run_goal",
+]
diff --git a/openhands-sdk/openhands/sdk/conversation/goal/controller.py b/openhands-sdk/openhands/sdk/conversation/goal/controller.py
new file mode 100644
index 0000000000..3ea92524bf
--- /dev/null
+++ b/openhands-sdk/openhands/sdk/conversation/goal/controller.py
@@ -0,0 +1,130 @@
+"""The transport-agnostic brain of the ``/goal`` loop.
+
+``GoalController`` decides -- after each agent run finishes -- whether to
+continue (with a followup message) or stop (with a ``GoalOutcome``). It performs
+no conversation I/O: a *driver* (the sync ``run_goal``, or an async agent-server
+task) sends messages and runs the agent; the controller only calls the judge LLM
+and decides. That split lets sync and async drivers share the decision logic.
+"""
+
+from collections.abc import Sequence
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+from openhands.sdk.conversation.goal.judge import GoalVerdict, judge_goal
+from openhands.sdk.conversation.goal.prompts import FOLLOWUP_PROMPT
+from openhands.sdk.event import Event
+from openhands.sdk.llm import LLM
+from openhands.sdk.logger import get_logger
+
+
+logger = get_logger(__name__)
+
+
+class GoalOutcome(BaseModel):
+    """Result of a ``/goal`` loop.
+
+    ``status`` distinguishes genuine completion from hitting the iteration cap,
+    so a driver never has to guess whether a silent finish meant success.
+    """
+
+    status: Literal["complete", "capped"]
+    iterations: int = Field(ge=1, description="Number of audit rounds performed.")
+    verdict: GoalVerdict
+
+
+GoalStatusName = Literal["running", "complete", "capped", "interrupted"]
+"""Lifecycle state of a ``/goal`` loop."""
+
+
+class GoalStatus(BaseModel):
+    """Live status of a ``/goal`` loop, for a UI progress chip.
+
+    The agent server publishes this as the ``value`` of a
+    ``ConversationStateUpdateEvent`` with ``key="goal"`` at each lifecycle point
+    (start, each round, and the terminal/interrupted state).
+    """
+
+    active: bool = Field(description="Whether the goal loop is still running.")
+    status: GoalStatusName
+    iteration: int = Field(ge=0, description="Audit rounds completed so far.")
+    max_iterations: int = Field(ge=1)
+    objective: str
+    verdict: GoalVerdict | None = Field(
+        default=None, description="Last judge verdict; set once the loop ends."
+    )
+
+
+class GoalContinue(BaseModel):
+    """Decision to keep going: send ``followup`` before the next run."""
+
+    followup: str
+
+
+class GoalDone(BaseModel):
+    """Decision to stop: the loop finished with ``outcome``."""
+
+    outcome: GoalOutcome
+
+
+GoalStep = GoalContinue | GoalDone
+"""One decision returned by :meth:`GoalController.on_run_finished`."""
+
+
+class GoalController:
+    """Decide, after each agent run, whether a ``/goal`` loop continues or stops.
+
+    A driver sends the message from :meth:`start`, then calls
+    :meth:`on_run_finished` after every agent run and acts on the returned step.
+    The controller never sends messages or runs the agent; it counts rounds,
+    enforces ``max_iterations`` and delegates each audit to ``judge_goal``.
+    Construction raises ``ValueError`` for a blank objective or a cap below 1.
+    """
+
+    def __init__(
+        self, objective: str, judge_llm: LLM, *, max_iterations: int = 10
+    ) -> None:
+        if not objective.strip():
+            raise ValueError("Goal objective must not be empty.")
+        if max_iterations < 1:
+            raise ValueError("max_iterations must be >= 1.")
+        self.objective = objective
+        self.judge_llm = judge_llm
+        self.max_iterations = max_iterations
+        self.iteration = 0  # audit rounds completed so far
+
+    def start(self) -> str:
+        """Return the opening message for the driver to send: the objective."""
+        return self.objective
+
+    def on_run_finished(self, events: Sequence[Event]) -> GoalStep:
+        """Audit ``events`` with the judge and return the next decision.
+
+        Counts the round, then yields :class:`GoalDone` with status
+        ``"complete"`` (judge satisfied) or ``"capped"`` (round limit reached),
+        otherwise :class:`GoalContinue` carrying a followup prompt.
+        """
+        self.iteration += 1
+        rounds = self.iteration
+        verdict = judge_goal(self.judge_llm, self.objective, events)
+        logger.info(
+            "Goal audit %d/%d: score=%.2f complete=%s",
+            rounds,
+            self.max_iterations,
+            verdict.score,
+            verdict.complete,
+        )
+        # Completion wins over the cap: success on the final round is "complete".
+        status: Literal["complete", "capped"] | None = None
+        if verdict.complete:
+            status = "complete"
+        elif rounds >= self.max_iterations:
+            status = "capped"
+        if status is not None:
+            outcome = GoalOutcome(status=status, iterations=rounds, verdict=verdict)
+            return GoalDone(outcome=outcome)
+        outstanding = verdict.missing or "Some requirements are not yet verified."
+        return GoalContinue(
+            followup=FOLLOWUP_PROMPT.format(iteration=rounds, missing=outstanding)
+        )
diff --git a/openhands-sdk/openhands/sdk/conversation/goal/judge.py b/openhands-sdk/openhands/sdk/conversation/goal/judge.py
new file mode 100644
index 0000000000..9bdc73a009
--- /dev/null
+++ b/openhands-sdk/openhands/sdk/conversation/goal/judge.py
@@ -0,0 +1,121 @@
+"""LLM judge that decides whether a ``/goal`` objective is complete.
+
+This is the reusable kernel of the goal feature: a pure
+``objective + transcript -> verdict`` evaluator with no dependency on the
+critic machinery. The ``/goal`` runner uses it to drive continuation, but it
+can equally back a status command, a stop hook, or a server endpoint.
+"""
+
+import contextlib
+import json
+import re
+from collections.abc import Sequence
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from openhands.sdk.conversation.goal.prompts import JUDGE_PROMPT
+from openhands.sdk.event import Event, LLMConvertibleEvent
+from openhands.sdk.llm import LLM, Message, TextContent, content_to_str
+from openhands.sdk.logger import get_logger
+
+
+logger = get_logger(__name__)
+
+
+class GoalVerdict(BaseModel):
+    """The judge's verdict on whether the objective is complete."""
+
+    score: float = Field(
+        ge=0.0,
+        le=1.0,
+        description="Probability (0-1) that the full objective is provably done.",
+    )
+    complete: bool = Field(
+        description="Whether the judge considers the objective complete."
+    )
+    missing: str = Field(
+        default="",
+        description="Concise description of what remains, or empty if complete.",
+    )
+
+
+def judge_goal(judge_llm: LLM, objective: str, events: Sequence[Event]) -> GoalVerdict:
+    """Ask ``judge_llm`` whether the transcript proves ``objective`` is complete.
+
+    Args:
+        judge_llm: LLM used for grading; a streaming LLM is graded through a
+            ``stream=False`` copy rather than being used directly.
+        objective: The goal to audit against.
+        events: Conversation events; only ``LLMConvertibleEvent``s are rendered.
+
+    Returns:
+        The parsed GoalVerdict. An unparseable judge reply yields a score of
+        0.0 with ``complete=False`` so the caller keeps working.
+    """
+    llm_events = [e for e in events if isinstance(e, LLMConvertibleEvent)]
+    prompt = JUDGE_PROMPT.format(
+        objective=objective, transcript=_render_transcript(llm_events)
+    )
+
+    # completion() requires an on_token callback when stream=True; the judge only
+    # needs the final text, so a streaming LLM is swapped for a non-streaming copy.
+    grader = judge_llm
+    if grader.stream:
+        grader = grader.model_copy(update={"stream": False})
+
+    request = [Message(role="user", content=[TextContent(text=prompt)])]
+    verdict = _parse_verdict(grader.completion(messages=request).message)
+    logger.debug("judge_goal verdict: %s", verdict)
+    return verdict
+
+
+def _render_transcript(events: Sequence[LLMConvertibleEvent]) -> str:
+    """Flatten events into ``role: text`` paragraphs separated by blank lines.
+
+    System messages are dropped (the agent's system prompt is large and holds
+    no goal-specific evidence), as are messages whose text is empty.
+    """
+    paragraphs: list[str] = []
+    for msg in LLMConvertibleEvent.events_to_messages(list(events)):
+        if msg.role == "system":
+            continue
+        body = "\n".join(content_to_str(msg.content)).strip()
+        if body:
+            paragraphs.append(f"{msg.role}: {body}")
+    return "\n\n".join(paragraphs)
+
+
+def _parse_verdict(message: Message) -> GoalVerdict:
+    """Normalize the judge response into a GoalVerdict, conservatively.
+
+    Parses the reply (or the outermost ``{...}`` span in it) as a JSON object.
+    ``complete`` counts only if it is ``true`` or the text "true"; a missing key
+    falls back to ``score >= 1``. NaN, non-numeric or unparseable input scores 0.
+    """
+    raw = "\n".join(content_to_str(message.content)).strip()
+    block = re.search(r"\{.*\}", raw, re.DOTALL)
+    candidates = [raw, block.group(0)] if block else [raw]
+    data: dict[str, Any] | None = None
+    for candidate in candidates:
+        with contextlib.suppress(json.JSONDecodeError):
+            parsed = json.loads(candidate)
+            if isinstance(parsed, dict):
+                data = parsed
+                break
+    if data is None:
+        logger.warning("judge_goal: could not parse verdict: %r", raw)
+        return GoalVerdict(
+            score=0.0, complete=False, missing="Judge verdict could not be parsed."
+        )
+    try:
+        score = float(data.get("score", 0.0))
+    except (TypeError, ValueError, OverflowError):
+        score = 0.0
+    # NaN fails every comparison, so clamping alone would turn it into 1.0.
+    score = 0.0 if score != score else max(0.0, min(1.0, score))
+    flag = data.get("complete", score >= 1.0)
+    if isinstance(flag, str):  # bool("false") is True, so parse text explicitly
+        flag = flag.strip().lower() == "true"
+    missing = str(data.get("missing") or "")
+    return GoalVerdict(score=score, complete=flag is True, missing=missing)
diff --git a/openhands-sdk/openhands/sdk/conversation/goal/prompts.py b/openhands-sdk/openhands/sdk/conversation/goal/prompts.py
new file mode 100644
index 0000000000..c24bbd2214
--- /dev/null
+++ b/openhands-sdk/openhands/sdk/conversation/goal/prompts.py
@@ -0,0 +1,41 @@
+"""Prompt text for the ``/goal`` command's judge and continuation messages."""
+
+from typing import Final
+
+
+JUDGE_PROMPT: Final[str] = """You are auditing whether a long-running GOAL has \
+been COMPLETED by an AI software agent.
+
+<objective>
+{objective}
+</objective>
+
+Derive the concrete requirements implied by the objective. For EACH requirement,
+look for authoritative evidence in the transcript below: file contents, command
+output, or test results produced by the agent. Treat missing, uncertain, or
+merely-claimed-but-unverified evidence as NOT satisfied.
+
+<transcript>
+{transcript}
+</transcript>
+
+Respond with STRICT JSON and nothing else, in exactly this shape:
+{{"score": <float 0.0-1.0, probability the FULL objective is provably done>, \
+"complete": <true|false>, "missing": "<concise description of what remains, or \
+an empty string if complete>"}}"""
+
+
+FOLLOWUP_PROMPT: Final[str] = """The goal is NOT yet complete (audit iteration \
+{iteration}).
+Outstanding: {missing}
+
+Inspect the real current state of the workspace (do not rely on memory). For \
+each remaining requirement, make concrete progress and gather authoritative \
+evidence by running the relevant tests/commands. Keep the full objective intact \
+and finish only once every requirement is provably satisfied."""
+
+
+RESUME_PROMPT: Final[str] = """Resuming a goal that was paused or interrupted. \
+Re-check the real current state of the workspace (do not rely on memory) and \
+continue making concrete, verified progress toward the original objective. \
+Finish only once every requirement is provably satisfied."""
diff --git a/openhands-sdk/openhands/sdk/conversation/goal/runner.py b/openhands-sdk/openhands/sdk/conversation/goal/runner.py
new file mode 100644
index 0000000000..253966a47c
--- /dev/null
+++ b/openhands-sdk/openhands/sdk/conversation/goal/runner.py
@@ -0,0 +1,57 @@
+"""The synchronous ``/goal`` driver.
+
+``run_goal`` is a thin synchronous driver over :class:`GoalController`: it owns
+the I/O (sending messages, running the agent) while the controller owns the
+judging and continue-vs-stop decision. An async agent-server task can reuse the
+same controller with its own I/O loop.
+
+Unlike a critic (which the run loop consults *inside* one ``run()``), this drives
+the conversation from the outside, so it composes with whatever critic the agent
+already has -- that critic governs each inner ``run()``; this loop governs the
+overall objective.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from openhands.sdk.conversation.goal.controller import (
+    GoalController,
+    GoalDone,
+    GoalOutcome,
+)
+
+
+if TYPE_CHECKING:
+    from openhands.sdk.conversation.base import BaseConversation
+    from openhands.sdk.llm import LLM
+
+
+def run_goal(
+    conversation: BaseConversation,
+    objective: str,
+    judge_llm: LLM,
+    *,
+    max_iterations: int = 10,
+) -> GoalOutcome:
+    """Drive ``conversation`` toward ``objective`` until judged done or capped.
+
+    Each round sends a message (the objective, then judge-driven followups),
+    runs the agent to a finish, and audits the events. Returns the terminal
+    :class:`GoalOutcome`, whose ``status`` is ``"complete"`` or ``"capped"``.
+
+    Args:
+        conversation: The conversation to drive (any agent/critic config).
+        objective: The goal to pursue and audit against.
+        judge_llm: The second LLM that grades completion.
+        max_iterations: Hard cap on audit rounds before giving up.
+    """
+    controller = GoalController(objective, judge_llm, max_iterations=max_iterations)
+    next_message = controller.start()
+    while True:
+        conversation.send_message(next_message)
+        conversation.run()
+        decision = controller.on_run_finished(conversation.state.events)
+        if isinstance(decision, GoalDone):
+            return decision.outcome
+        next_message = decision.followup
diff --git a/tests/sdk/conversation/goal/__init__.py b/tests/sdk/conversation/goal/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/sdk/conversation/goal/fixtures/events.jsonl b/tests/sdk/conversation/goal/fixtures/events.jsonl
new file mode 100644
index 0000000000..b1e966f65f
--- /dev/null
+++ b/tests/sdk/conversation/goal/fixtures/events.jsonl
@@ -0,0 +1,10 @@
+{"id":"03d5b5c0-6890-4b43-831a-5b079d0e889e","timestamp":"2026-06-16T05:39:18.809928","source":"environment","key":"execution_status","value":"running","kind":"ConversationStateUpdateEvent"}
+{"id":"bb62c862-0a3c-42c6-b96a-7942974f63a0","timestamp":"2026-06-16T05:39:40.828199","source":"environment","key":"stats","value":{"usage_to_metrics":{"default":{"model_name":"litellm_proxy/minimax-m2.7","accumulated_cost":0.0,"max_budget_per_task":null,"accumulated_token_usage":{"model":"litellm_proxy/minimax-m2.7","prompt_tokens":596,"completion_tokens":775,"cache_read_tokens":0,"cache_write_tokens":0,"reasoning_tokens":0,"context_window":0,"per_turn_token":1371,"response_id":""}},"condenser":{"model_name":"litellm_proxy/minimax-m2.7","accumulated_cost":0.0,"max_budget_per_task":null,"accumulated_token_usage":{"model":"litellm_proxy/minimax-m2.7","prompt_tokens":0,"completion_tokens":0,"cache_read_tokens":0,"cache_write_tokens":0,"reasoning_tokens":0,"context_window":0,"per_turn_token":0,"response_id":""}}}},"kind":"ConversationStateUpdateEvent"}
+{"id":"732ad0ee-1cfd-4c74-8934-ea310c76e335","timestamp":"2026-06-16T05:39:48.700734","source":"environment","key":"stats","value":{"usage_to_metrics":{"default":{"model_name":"litellm_proxy/minimax-m2.7","accumulated_cost":0.0,"max_budget_per_task":null,"accumulated_token_usage":{"model":"litellm_proxy/minimax-m2.7","prompt_tokens":35895,"completion_tokens":960,"cache_read_tokens":1115,"cache_write_tokens":0,"reasoning_tokens":0,"context_window":0,"per_turn_token":35484,"response_id":""}},"condenser":{"model_name":"litellm_proxy/minimax-m2.7","accumulated_cost":0.0,"max_budget_per_task":null,"accumulated_token_usage":{"model":"litellm_proxy/minimax-m2.7","prompt_tokens":0,"completion_tokens":0,"cache_read_tokens":0,"cache_write_tokens":0,"reasoning_tokens":0,"context_window":0,"per_turn_token":0,"response_id":""}}}},"kind":"ConversationStateUpdateEvent"}
+{"id":"e54cbba2-3be2-4494-aa13-83af6044c5a7","timestamp":"2026-06-16T05:39:48.701247","source":"agent","thought":[{"cache_prompt":false,"type":"text","text":"<think>Let me analyze the task. I need to:\n\n1. Build the session URL\n2. Extract PR info from the event payload\n3. Post a progress comment and save the comment ID\n4. Review the PR using the code-review and github-pr-review skills\n5. Update the progress comment when done\n\nLet me start by executing Steps 1 and 2.\n</think>\n\n\nI'll help you review PR #3745 from the OpenHands/software-agent-sdk repository. Let me start by following the steps in order.\n"}],"reasoning_content":"Let me analyze the task. I need to:\n\n1. Build the session URL\n2. Extract PR info from the event payload\n3. Post a progress comment and save the comment ID\n4. Review the PR using the code-review and github-pr-review skills\n5. Update the progress comment when done\n\nLet me start by executing Steps 1 and 2.\n","thinking_blocks":[],"responses_reasoning_item":null,"action":{"command":"SESSION_URL=\"${AUTOMATION_SESSION_URL:-${AUTOMATION_API_URL:-https://app.all-hands.dev}}\"\necho \"SESSION_URL=${SESSION_URL}\"","is_input":false,"timeout":null,"reset":false,"kind":"TerminalAction"},"tool_name":"terminal","tool_call_id":"chatcmpl-tool-9d0341c7a033ae28","tool_call":{"id":"chatcmpl-tool-9d0341c7a033ae28","responses_item_id":null,"name":"terminal","arguments":"{\"security_risk\": \"LOW\", \"summary\": \"Build session URL from environment\", \"command\": \"SESSION_URL=\\\"${AUTOMATION_SESSION_URL:-${AUTOMATION_API_URL:-https://app.all-hands.dev}}\\\"\\necho \\\"SESSION_URL=${SESSION_URL}\\\"\"}","origin":"completion"},"llm_response_id":"0680149d625114e6a97f54b158714212","security_risk":"UNKNOWN","critic_result":null,"summary":"Build session URL from environment","kind":"ActionEvent"}
+{"id":"d4f6dce6-2c6f-4559-8481-8b07ce1e775e","timestamp":"2026-06-16T05:39:49.105230","source":"environment","tool_name":"terminal","tool_call_id":"chatcmpl-tool-9d0341c7a033ae28","observation":{"content":[{"cache_prompt":false,"type":"text","text":"Cannot execute multiple commands at once.\nPlease run each command separately OR chain them into a single command via && or ;\nProvided commands:\n(1) SESSION_URL=\"${AUTOMATION_SESSION_URL:-${AUTOMATION_API_URL:-https://app.all-hands.dev}}\"\n(2) echo \"SESSION_URL=${SESSION_URL}\""}],"is_error":true,"command":"SESSION_URL=\"${AUTOMATION_SESSION_URL:-${AUTOMATION_API_URL:-https://app.all-hands.dev}}\"\necho \"SESSION_URL=${SESSION_URL}\"","exit_code":null,"timeout":false,"metadata":{"exit_code":-1,"pid":-1,"username":null,"hostname":null,"working_dir":null,"py_interpreter_path":null,"prefix":"","suffix":""},"full_output_save_dir":"/workspace/conversations/e18b827d3440425a9ff7353c4a6d6a3a/observations","kind":"TerminalObservation"},"action_id":"e54cbba2-3be2-4494-aa13-83af6044c5a7","kind":"ObservationEvent"}
+{"id":"50f603a8-d05a-4cc7-b47f-7295091be6aa","timestamp":"2026-06-16T05:39:55.324065","source":"environment","key":"stats","value":{"usage_to_metrics":{"default":{"model_name":"litellm_proxy/minimax-m2.7","accumulated_cost":0.0,"max_budget_per_task":null,"accumulated_token_usage":{"model":"litellm_proxy/minimax-m2.7","prompt_tokens":71474,"completion_tokens":1155,"cache_read_tokens":34536,"cache_write_tokens":0,"reasoning_tokens":0,"context_window":0,"per_turn_token":35774,"response_id":""}},"condenser":{"model_name":"litellm_proxy/minimax-m2.7","accumulated_cost":0.0,"max_budget_per_task":null,"accumulated_token_usage":{"model":"litellm_proxy/minimax-m2.7","prompt_tokens":0,"completion_tokens":0,"cache_read_tokens":0,"cache_write_tokens":0,"reasoning_tokens":0,"context_window":0,"per_turn_token":0,"response_id":""}}}},"kind":"ConversationStateUpdateEvent"}
+{"id":"8df85183-1da3-4677-ad2e-eb87d6f33809","timestamp":"2026-06-16T05:39:55.479027","source":"environment","tool_name":"terminal","tool_call_id":"chatcmpl-tool-9051d3d10f914188","observation":{"content":[{"cache_prompt":false,"type":"text","text":"Cannot execute multiple commands at once.\nPlease run each command separately OR chain them into a single command via && or ;\nProvided commands:\n(1) SESSION_URL=\"${AUTOMATION_SESSION_URL:-${AUTOMATION_API_URL:-https://app.all-hands.dev}}\"\n(2) echo \"SESSION_URL=${SESSION_URL}\"\n\n# Extract PR info from the event payload\n(3) PR_NUMBER=$(echo \"$AUTOMATION_EVENT_PAYLOAD\" | python3 -c \"import sys,json; p=json.load(sys.stdin); print(p['pull_request']['number'])\")\n(4) REPO=$(echo \"$AUTOMATION_EVENT_PAYLOAD\" | python3 -c \"import sys,json; p=json.load(sys.stdin); print(p['repository']['full_name'])\")\n(5) echo \"PR_NUMBER=${PR_NUMBER}\"\n(6) echo \"REPO=${REPO}\""}],"is_error":true,"command":"SESSION_URL=\"${AUTOMATION_SESSION_URL:-${AUTOMATION_API_URL:-https://app.all-hands.dev}}\"\necho \"SESSION_URL=${SESSION_URL}\"\n\n# Extract PR info from the event payload\nPR_NUMBER=$(echo \"$AUTOMATION_EVENT_PAYLOAD\" | python3 -c \"import sys,json; p=json.load(sys.stdin); print(p['pull_request']['number'])\")\nREPO=$(echo \"$AUTOMATION_EVENT_PAYLOAD\" | python3 -c \"import sys,json; p=json.load(sys.stdin); print(p['repository']['full_name'])\")\necho \"PR_NUMBER=${PR_NUMBER}\"\necho \"REPO=${REPO}\"","exit_code":null,"timeout":false,"metadata":{"exit_code":-1,"pid":-1,"username":null,"hostname":null,"working_dir":null,"py_interpreter_path":null,"prefix":"","suffix":""},"full_output_save_dir":"/workspace/conversations/e18b827d3440425a9ff7353c4a6d6a3a/observations","kind":"TerminalObservation"},"action_id":"70274abb-c9e1-4914-b778-29464d7bf293","kind":"ObservationEvent"}
+{"id":"bc1cee85-3816-4b67-80a9-37c002d8b1e8","timestamp":"2026-06-16T05:40:00.385458","source":"environment","key":"stats","value":{"usage_to_metrics":{"default":{"model_name":"litellm_proxy/minimax-m2.7","accumulated_cost":0.0,"max_budget_per_task":null,"accumulated_token_usage":{"model":"litellm_proxy/minimax-m2.7","prompt_tokens":107458,"completion_tokens":1347,"cache_read_tokens":68193,"cache_write_tokens":0,"reasoning_tokens":0,"context_window":0,"per_turn_token":36176,"response_id":""}},"condenser":{"model_name":"litellm_proxy/minimax-m2.7","accumulated_cost":0.0,"max_budget_per_task":null,"accumulated_token_usage":{"model":"litellm_proxy/minimax-m2.7","prompt_tokens":0,"completion_tokens":0,"cache_read_tokens":0,"cache_write_tokens":0,"reasoning_tokens":0,"context_window":0,"per_turn_token":0,"response_id":""}}}},"kind":"ConversationStateUpdateEvent"}
+{"id":"1eb65bb3-22e7-473f-855c-2dcf26ae8e08","timestamp":"2026-06-16T05:40:00.385957","source":"agent","thought":[{"cache_prompt":false,"type":"text","text":"<think>Let me chain these properly with && and ;\n</think>\n\n"}],"reasoning_content":"Let me chain these properly with && and ;\n","thinking_blocks":[],"responses_reasoning_item":null,"action":{"command":"SESSION_URL=\"${AUTOMATION_SESSION_URL:-${AUTOMATION_API_URL:-https://app.all-hands.dev}}\" && echo \"SESSION_URL=${SESSION_URL}\" && PR_NUMBER=$(echo \"$AUTOMATION_EVENT_PAYLOAD\" | python3 -c \"import sys,json; p=json.load(sys.stdin); print(p['pull_request']['number'])\") && REPO=$(echo \"$AUTOMATION_EVENT_PAYLOAD\" | python3 -c \"import sys,json; p=json.load(sys.stdin); print(p['repository']['full_name'])\") && echo \"PR_NUMBER=${PR_NUMBER}\" && echo \"REPO=${REPO}\"","is_input":false,"timeout":null,"reset":false,"kind":"TerminalAction"},"tool_name":"terminal","tool_call_id":"chatcmpl-tool-9fa678118a57807d","tool_call":{"id":"chatcmpl-tool-9fa678118a57807d","responses_item_id":null,"name":"terminal","arguments":"{\"security_risk\": \"LOW\", \"summary\": \"Build session URL and extract PR info\", \"command\": \"SESSION_URL=\\\"${AUTOMATION_SESSION_URL:-${AUTOMATION_API_URL:-https://app.all-hands.dev}}\\\" && echo \\\"SESSION_URL=${SESSION_URL}\\\" && PR_NUMBER=$(echo \\\"$AUTOMATION_EVENT_PAYLOAD\\\" | python3 -c \\\"import sys,json; p=json.load(sys.stdin); print(p['pull_request']['number'])\\\") && REPO=$(echo \\\"$AUTOMATION_EVENT_PAYLOAD\\\" | python3 -c \\\"import sys,json; p=json.load(sys.stdin); print(p['repository']['full_name'])\\\") && echo \\\"PR_NUMBER=${PR_NUMBER}\\\" && echo \\\"REPO=${REPO}\\\"\"}","origin":"completion"},"llm_response_id":"068014ab9c72170b40bca7fa74f68a89","security_risk":"UNKNOWN","critic_result":null,"summary":"Build session URL and extract PR info","kind":"ActionEvent"}
+{"id":"88ece7fc-2f7b-4d51-b006-5570b76f4e19","timestamp":"2026-06-16T05:40:01.175476","source":"environment","tool_name":"terminal","tool_call_id":"chatcmpl-tool-9fa678118a57807d","observation":{"content":[{"cache_prompt":false,"type":"text","text":"SESSION_URL=<secret-hidden>\nTraceback (most recent call last):\n  File \"<string>\", line 1, in <module>\n    import sys,json; p=json.load(sys.stdin); print(p['pull_request']['number'])\n                       ~~~~~~~~~^^^^^^^^^^^\n  File \"/usr/local/lib/python3.13/json/__init__.py\", line 298, in load\n    return loads(fp.read(),\n        cls=cls, object_hook=object_hook,\n        parse_float=parse_float, parse_int=parse_int,\n        parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)\n  File \"/usr/local/lib/python3.13/json/__init__.py\", line 352, in loads\n    return _default_decoder.decode(s)\n           ~~~~~~~~~~~~~~~~~~~~~~~^^^\n  File \"/usr/local/lib/python3.13/json/decoder.py\", line 345, in decode\n    obj, end = self.raw_decode(s, idx=_w(s, 0).end())\n               ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/usr/local/lib/python3.13/json/decoder.py\", line 363, in raw_decode\n    raise JSONDecodeError(\"Expecting value\", s, err.value) from None\njson.decoder.JSONDecodeError: Expecting value: line 2 column 1 (char 1)"}],"is_error":false,"command":"SESSION_URL=\"${AUTOMATION_SESSION_URL:-${AUTOMATION_API_URL:-https://app.all-hands.dev}}\" && echo \"SESSION_URL=${SESSION_URL}\" && PR_NUMBER=$(echo \"$AUTOMATION_EVENT_PAYLOAD\" | python3 -c \"import sys,json; p=json.load(sys.stdin); print(p['pull_request']['number'])\") && REPO=$(echo \"$AUTOMATION_EVENT_PAYLOAD\" | python3 -c \"import sys,json; p=json.load(sys.stdin); print(p['repository']['full_name'])\") && echo \"PR_NUMBER=${PR_NUMBER}\" && echo 
\"REPO=${REPO}\"","exit_code":1,"timeout":false,"metadata":{"exit_code":1,"pid":-1,"username":"openhands","hostname":"runtime-sfzcsbejkqbokdub-7c4d4748c7-pb2st","working_dir":"/workspace/project","py_interpreter_path":"/usr/local/bin/python","prefix":"","suffix":"\n[The command completed with exit code 1.]"},"full_output_save_dir":"/workspace/conversations/e18b827d3440425a9ff7353c4a6d6a3a/observations","kind":"TerminalObservation"},"action_id":"1eb65bb3-22e7-473f-855c-2dcf26ae8e08","kind":"ObservationEvent"}
diff --git a/tests/sdk/conversation/goal/fixtures/expected_transcript.txt b/tests/sdk/conversation/goal/fixtures/expected_transcript.txt
new file mode 100644
index 0000000000..a8249ce828
--- /dev/null
+++ b/tests/sdk/conversation/goal/fixtures/expected_transcript.txt
@@ -0,0 +1,62 @@
+assistant: <think>Let me analyze the task. I need to:
+
+1. Build the session URL
+2. Extract PR info from the event payload
+3. Post a progress comment and save the comment ID
+4. Review the PR using the code-review and github-pr-review skills
+5. Update the progress comment when done
+
+Let me start by executing Steps 1 and 2.
+</think>
+
+
+I'll help you review PR #3745 from the OpenHands/software-agent-sdk repository. Let me start by following the steps in order.
+
+tool: [An error occurred during execution.]
+
+Cannot execute multiple commands at once.
+Please run each command separately OR chain them into a single command via && or ;
+Provided commands:
+(1) SESSION_URL="${AUTOMATION_SESSION_URL:-${AUTOMATION_API_URL:-https://app.all-hands.dev}}"
+(2) echo "SESSION_URL=${SESSION_URL}"
+
+tool: [An error occurred during execution.]
+
+Cannot execute multiple commands at once.
+Please run each command separately OR chain them into a single command via && or ;
+Provided commands:
+(1) SESSION_URL="${AUTOMATION_SESSION_URL:-${AUTOMATION_API_URL:-https://app.all-hands.dev}}"
+(2) echo "SESSION_URL=${SESSION_URL}"
+
+# Extract PR info from the event payload
+(3) PR_NUMBER=$(echo "$AUTOMATION_EVENT_PAYLOAD" | python3 -c "import sys,json; p=json.load(sys.stdin); print(p['pull_request']['number'])")
+(4) REPO=$(echo "$AUTOMATION_EVENT_PAYLOAD" | python3 -c "import sys,json; p=json.load(sys.stdin); print(p['repository']['full_name'])")
+(5) echo "PR_NUMBER=${PR_NUMBER}"
+(6) echo "REPO=${REPO}"
+
+assistant: <think>Let me chain these properly with && and ;
+</think>
+
+tool: SESSION_URL=<secret-hidden>
+Traceback (most recent call last):
+  File "<string>", line 1, in <module>
+    import sys,json; p=json.load(sys.stdin); print(p['pull_request']['number'])
+                       ~~~~~~~~~^^^^^^^^^^^
+  File "/usr/local/lib/python3.13/json/__init__.py", line 298, in load
+    return loads(fp.read(),
+        cls=cls, object_hook=object_hook,
+        parse_float=parse_float, parse_int=parse_int,
+        parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
+  File "/usr/local/lib/python3.13/json/__init__.py", line 352, in loads
+    return _default_decoder.decode(s)
+           ~~~~~~~~~~~~~~~~~~~~~~~^^^
+  File "/usr/local/lib/python3.13/json/decoder.py", line 345, in decode
+    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
+               ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.13/json/decoder.py", line 363, in raw_decode
+    raise JSONDecodeError("Expecting value", s, err.value) from None
+json.decoder.JSONDecodeError: Expecting value: line 2 column 1 (char 1)
+[The command completed with exit code 1.]
+[Current working directory: /workspace/project]
+[Python interpreter: /usr/local/bin/python]
+[Command finished with exit code 1]
\ No newline at end of file
diff --git a/tests/sdk/conversation/goal/test_controller.py b/tests/sdk/conversation/goal/test_controller.py
new file mode 100644
index 0000000000..49b60cd977
--- /dev/null
+++ b/tests/sdk/conversation/goal/test_controller.py
@@ -0,0 +1,71 @@
+"""Tests for GoalController (the transport-agnostic /goal decision logic)."""
+
+import pytest
+
+from openhands.sdk.conversation.goal import (
+    GoalContinue,
+    GoalController,
+    GoalDone,
+)
+from openhands.sdk.llm import Message, TextContent
+from openhands.sdk.testing import TestLLM
+
+
+def _judge(*texts: str) -> TestLLM:
+    """Script a judge TestLLM with one assistant reply per entry in ``texts``."""
+    bodies = ([TextContent(text=verdict)] for verdict in texts)
+    replies = [Message(role="assistant", content=body) for body in bodies]
+    return TestLLM.from_messages(replies)
+
+
+def test_start_returns_objective():
+    """start() echoes the objective; the iteration counter is still 0 afterwards."""
+    goal = GoalController("build x", _judge("{}"))
+    assert (goal.start(), goal.iteration) == ("build x", 0)
+
+
+def test_continue_when_incomplete():
+    """An incomplete verdict below the cap yields a follow-up naming the gap."""
+    verdict_json = '{"score": 0.2, "complete": false, "missing": "tests"}'
+    goal = GoalController("build x", _judge(verdict_json), max_iterations=3)
+
+    outcome = goal.on_run_finished([])
+
+    assert isinstance(outcome, GoalContinue)
+    assert "tests" in outcome.followup
+    assert goal.iteration == 1
+
+
+def test_done_when_complete():
+    """A verdict with ``complete: true`` ends the loop on the first audit."""
+    verdict_json = '{"score": 1.0, "complete": true, "missing": ""}'
+    goal = GoalController("build x", _judge(verdict_json))
+    finished = goal.on_run_finished([])
+    assert isinstance(finished, GoalDone)
+    result = finished.outcome
+    assert (result.status, result.iterations) == ("complete", 1)
+    assert result.verdict.complete
+
+
+def test_caps_at_max_iterations():
+    """With max_iterations=2, a second incomplete verdict stops the loop as capped."""
+    stuck = '{"score": 0.1, "complete": false, "missing": "still broken"}'
+    goal = GoalController("build x", _judge(stuck, stuck), max_iterations=2)
+
+    first = goal.on_run_finished([])  # audit 1: budget left, keep going
+    assert isinstance(first, GoalContinue)
+    second = goal.on_run_finished([])  # audit 2: budget exhausted
+    assert isinstance(second, GoalDone)
+    capped = second.outcome
+    assert (capped.status, capped.iterations) == ("capped", 2)
+    assert not capped.verdict.complete
+
+
+@pytest.mark.parametrize(
+    ("objective", "max_iterations"),
+    [("   ", 10), ("build x", 0)],  # whitespace-only objective; zero iteration cap
+    ids=["empty-objective", "bad-max-iterations"],
+)
+def test_validates_inputs(objective, max_iterations):
+    with pytest.raises(ValueError):  # construction must raise for either case
+        GoalController(objective, _judge("{}"), max_iterations=max_iterations)
diff --git a/tests/sdk/conversation/goal/test_judge.py b/tests/sdk/conversation/goal/test_judge.py
new file mode 100644
index 0000000000..0be4c2702f
--- /dev/null
+++ b/tests/sdk/conversation/goal/test_judge.py
@@ -0,0 +1,95 @@
+"""Tests for judge_goal / GoalVerdict (the goal-completion judge kernel)."""
+
+import pytest
+
+from openhands.sdk.conversation.goal import GoalVerdict, judge_goal
+from openhands.sdk.llm import Message, TextContent
+from openhands.sdk.testing import TestLLM
+
+
+def _judge(text: str) -> TestLLM:
+    """Script a judge TestLLM whose only canned reply is ``text``."""
+    body = [TextContent(text=text)]
+    reply = Message(role="assistant", content=body)
+    return TestLLM.from_messages([reply])
+
+
+class _StreamGuardLLM(TestLLM):
+    """TestLLM variant that applies the real LLM's streaming guard.
+
+    Plain ``TestLLM`` ignores ``stream`` entirely, so this subclass is what lets
+    a test prove that ``judge_goal`` turns streaming off before ``completion()``.
+    """
+
+    def completion(
+        self,
+        messages,
+        tools=None,
+        _return_metrics=False,
+        add_security_risk_prediction=False,
+        on_token=None,
+        **kwargs,
+    ):
+        # Streaming without a token callback is the misuse being guarded against.
+        missing_sink = on_token is None
+        if self.stream and missing_sink:
+            raise ValueError("Streaming requires an on_token callback")
+        forwarded = (messages, tools, _return_metrics, add_security_risk_prediction)
+        return super().completion(
+            *forwarded,
+            on_token,
+            **kwargs,
+        )
+
+
+@pytest.mark.parametrize(
+    ("response", "score", "complete", "missing"),
+    [
+        # bare JSON object, goal met
+        ('{"score": 0.9, "complete": true, "missing": ""}', 0.9, True, ""),
+        # bare JSON object, goal not met: ``missing`` is passed through
+        (
+            '{"score": 0.1, "complete": false, "missing": "tests not run"}',
+            0.1,
+            False,
+            "tests not run",
+        ),
+        # JSON wrapped in prose and a fenced code block
+        (
+            "Here is my verdict:\n```json\n"
+            '{"score": 0.4, "complete": false, "missing": "lint"}\n```',
+            0.4,
+            False,
+            "lint",
+        ),
+        # no JSON at all -> conservative "not complete" fallback verdict
+        ("I cannot decide.", 0.0, False, "Judge verdict could not be parsed."),
+        # score above 1 comes back as 1.0
+        ('{"score": 1.5, "complete": true, "missing": ""}', 1.0, True, ""),
+    ],
+    ids=["complete", "incomplete", "json-in-fence", "unparseable", "clamped-score"],
+)
+def test_judge_goal_parses_verdict(response, score, complete, missing):
+    """Each scripted judge reply must parse to the expected verdict fields."""
+    parsed = judge_goal(_judge(response), "build it", [])
+    assert isinstance(parsed, GoalVerdict)
+    assert parsed.complete is complete
+    assert (parsed.score, parsed.missing) == (score, missing)
+
+
+def test_judge_goal_disables_streaming_on_judge_llm():
+    """judge_goal must succeed even when the judge LLM was built with stream=True.
+
+    The guard LLM raises if completion() is reached while still streaming with no
+    on_token callback, so a returned verdict shows the guard was not tripped.
+    """
+    done_json = '{"score": 1.0, "complete": true, "missing": ""}'
+    reply = Message(role="assistant", content=[TextContent(text=done_json)])
+    streaming_judge = _StreamGuardLLM.from_messages(
+        [reply],
+        usage_id="judge",
+        stream=True,
+    )
+
+    verdict = judge_goal(streaming_judge, "build x", [])
+    assert verdict.complete
diff --git a/tests/sdk/conversation/goal/test_render_transcript.py b/tests/sdk/conversation/goal/test_render_transcript.py
new file mode 100644
index 0000000000..784a583d23
--- /dev/null
+++ b/tests/sdk/conversation/goal/test_render_transcript.py
@@ -0,0 +1,77 @@
+"""One complete test of the exact transcript the judge sees, over a REAL trace.
+
+``fixtures/events.jsonl`` is a trimmed slice of the persisted events of an actual
+conversation (a PR-review automation run: ``terminal`` tool calls + observations
++ state updates), loaded with the SDK's real deserializer
+(``Event.model_validate_json``). ``fixtures/expected_transcript.txt`` is the
+byte-for-byte golden of exactly what ``judge_goal`` feeds the judge -- so this
+also guards the rendering and the persisted event format.
+"""
+
+from collections import Counter
+from pathlib import Path
+
+import pytest
+
+from openhands.sdk.conversation.goal import judge_goal
+from openhands.sdk.conversation.goal.judge import _render_transcript
+from openhands.sdk.event import Event, LLMConvertibleEvent, SystemPromptEvent
+from openhands.sdk.llm import Message, TextContent
+from openhands.sdk.testing import TestLLM
+
+
+# The trace's `terminal` tool actions need openhands-tools to deserialize their
+# kinds; skip in isolated openhands-sdk runs where it is not installed.
+pytest.importorskip("openhands.tools.terminal")
+
+
+_FIXTURES = Path(__file__).parent / "fixtures"  # directory next to this test file
+
+
+def _load_trace() -> list[Event]:  # one deserialized Event per non-blank JSONL line
+    lines = (_FIXTURES / "events.jsonl").read_text(encoding="utf-8").splitlines()
+    return [Event.model_validate_json(line) for line in lines if line.strip()]
+
+
+def test_render_transcript_for_judge():
+    """Render the recorded trace and check it against the golden transcript."""
+    events = _load_trace()
+
+    # A real trace: agent tool calls + observations + (non-rendered) state updates.
+    kinds = Counter(type(e).__name__ for e in events)
+    assert kinds["ActionEvent"] and kinds["ObservationEvent"]
+    assert kinds["ConversationStateUpdateEvent"]  # present, but NOT LLM-convertible
+
+    convertible = [e for e in events if isinstance(e, LLMConvertibleEvent)]
+    transcript = _render_transcript(convertible)
+
+    # Tool calls render as: ActionEvent -> assistant reasoning turn,
+    # ObservationEvent -> `tool:` turn.
+    assert "assistant: <think>" in transcript
+    assert "\n\ntool: " in transcript
+    # Secrets are redacted in what the judge sees.
+    assert "<secret-hidden>" in transcript
+
+    # The (large) system prompt is excluded by design: prepending a
+    # SystemPromptEvent does not change the rendered transcript.
+    system_event = SystemPromptEvent(
+        system_prompt=TextContent(text="<SOUL>...big system prompt...</SOUL>"),
+        tools=[],
+    )
+    assert _render_transcript([system_event, *convertible]) == transcript
+
+    # The full kernel turns this transcript into a verdict (the judge's feedback).
+    reply = Message(
+        role="assistant",
+        content=[TextContent(text='{"score": 0.8, "complete": true}')],
+    )
+    judge = TestLLM.from_messages([reply])
+    verdict = judge_goal(judge, "Review PR #3745", events)
+    assert verdict.complete is True
+    assert verdict.score == 0.8
+
+    # Golden check: EXACTLY what judge_goal feeds the judge over the real trace.
+    # Decode explicitly as UTF-8 so the comparison does not depend on the
+    # platform's locale encoding.
+    expected = (_FIXTURES / "expected_transcript.txt").read_text(encoding="utf-8")
+    assert transcript == expected
diff --git a/tests/sdk/conversation/goal/test_runner.py b/tests/sdk/conversation/goal/test_runner.py
new file mode 100644
index 0000000000..f33eccbf60
--- /dev/null
+++ b/tests/sdk/conversation/goal/test_runner.py
@@ -0,0 +1,65 @@
+"""Tests for the /goal driver loop (``run_goal``) and its input validation."""
+
+import pytest
+
+from openhands.sdk.agent import Agent
+from openhands.sdk.conversation import Conversation
+from openhands.sdk.conversation.base import BaseConversation
+from openhands.sdk.conversation.goal import GoalOutcome, run_goal
+from openhands.sdk.llm import Message, TextContent
+from openhands.sdk.testing import TestLLM
+
+
+def _text_llm(*texts: str) -> TestLLM:
+    """Script a TestLLM with one text-only assistant reply per entry in ``texts``."""
+    bodies = ([TextContent(text=reply)] for reply in texts)
+    scripted = [Message(role="assistant", content=body) for body in bodies]
+    return TestLLM.from_messages(scripted)
+
+
+def _conversation(*agent_turns: str) -> BaseConversation:
+    """Wrap a tool-less Agent, scripted with ``agent_turns``, in a Conversation."""
+    scripted_llm = _text_llm(*agent_turns)
+    return Conversation(agent=Agent(llm=scripted_llm, tools=[]))
+
+
+_DONE = '{"score": 1.0, "complete": true, "missing": ""}'  # judge reply: goal met
+_NOT_DONE = '{"score": 0.2, "complete": false, "missing": "tests"}'  # goal not met
+
+
+@pytest.mark.parametrize(
+    ("agent_turns", "verdicts", "max_iterations", "status", "iterations"),
+    [
+        # first audit already passes
+        (("done",), (_DONE,), 5, "complete", 1),
+        # first audit fails, second passes -> two iterations
+        (("turn 1", "turn 2"), (_NOT_DONE, _DONE), 5, "complete", 2),
+        # every audit fails -> stops at the cap of 2
+        (("turn 1", "turn 2"), (_NOT_DONE, _NOT_DONE), 2, "capped", 2),
+    ],
+    ids=["complete-first-audit", "loops-until-complete", "caps-at-max"],
+)
+def test_run_goal_outcomes(agent_turns, verdicts, max_iterations, status, iterations):
+    """Drive run_goal with scripted agent/judge replies and check the outcome."""
+    convo = _conversation(*agent_turns)
+    judge = _text_llm(*verdicts)
+    result = run_goal(convo, "build x", judge, max_iterations=max_iterations)
+    assert isinstance(result, GoalOutcome)
+    assert (result.status, result.iterations) == (status, iterations)
+    expect_complete = status == "complete"
+    assert result.verdict.complete is expect_complete
+
+
+@pytest.mark.parametrize(
+    ("objective", "max_iterations"),
+    [("   ", 5), ("build x", 0)],
+    ids=["empty-objective", "bad-max-iterations"],
+)
+def test_run_goal_rejects_invalid_input(objective, max_iterations):
+    """Invalid arguments make run_goal raise ValueError.
+
+    Covers a whitespace-only objective and ``max_iterations=0``.
+    """
+    with pytest.raises(ValueError):
+        convo = _conversation("noop")
+        run_goal(convo, objective, _text_llm("{}"), max_iterations=max_iterations)