From 79938d3bd12708640e0af6ad4b4ed098b42136d8 Mon Sep 17 00:00:00 2001
From: Wild Wind <wwind123@gmail.com>
Date: Sat, 2 May 2026 19:08:59 -0700
Subject: [PATCH 1/2] Use response files for public agent output

---
 docs/local_agent_loop.md                      |  2 +-
 src/coding_review_agent_loop/agents/base.py   | 44 +++++++++++++++++++
 src/coding_review_agent_loop/agents/claude.py | 15 +++++--
 src/coding_review_agent_loop/agents/gemini.py | 17 +++++--
 tests/test_agent_loop.py                      | 39 ++++++++++++++++
 5 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/docs/local_agent_loop.md b/docs/local_agent_loop.md
index 8b8f7f3..a8c7e5a 100644
--- a/docs/local_agent_loop.md
+++ b/docs/local_agent_loop.md
@@ -339,7 +339,7 @@ agent-loop issue 56 \
   --gemini-arg=--approval-mode --gemini-arg=auto_edit
 ```
 
-Providing any `--claude-arg`, `--codex-arg`, or `--gemini-arg` replaces that agent's default entirely. Gemini's text output is used directly. If you pass `--gemini-arg=--output-format --gemini-arg=json`, the loop extracts the JSON `response` field before parsing markers.
+Providing any `--claude-arg`, `--codex-arg`, or `--gemini-arg` replaces that agent's default entirely. Claude and Gemini prompts include a tool-owned response-file path under `coding-review-agent-loop/responses/` in the system temporary directory (for example `/tmp/coding-review-agent-loop/responses/` on Linux); when the file exists and is non-empty after stripping whitespace, the loop posts that file's contents instead of stdout so CLI diagnostics and tool narration do not leak into GitHub comments. When no response file was written, both agents fall back to their parsed stdout; Gemini still supports stdout marker filtering in that fallback. If you pass `--gemini-arg=--output-format --gemini-arg=json`, the loop extracts the JSON `response` field before parsing markers when no response file was written.
 
 ## Protocol
 
diff --git a/src/coding_review_agent_loop/agents/base.py b/src/coding_review_agent_loop/agents/base.py
index 0292e4a..f711afd 100644
--- a/src/coding_review_agent_loop/agents/base.py
+++ b/src/coding_review_agent_loop/agents/base.py
@@ -4,6 +4,8 @@
 
 from dataclasses import dataclass
 from pathlib import Path
+import tempfile
+import uuid
 from typing import TYPE_CHECKING, Literal, Protocol
 
 from ..runner import Runner
@@ -36,3 +38,45 @@ def run(
         prompt: str,
         session_id: str | None = None,
     ) -> AgentResult: ...
+
+
+def _safe_repo_slug(repo: str) -> str:
+    return repo.replace("/", "-").replace(":", "-")
+
+
+def public_response_path(config: AgentLoopConfig, agent: AgentName) -> Path:
+    path = (
+        Path(tempfile.gettempdir())
+        / "coding-review-agent-loop"
+        / "responses"
+        / _safe_repo_slug(config.repo)
+        / agent
+        / f"{uuid.uuid4().hex}.md"
+    )
+    path.parent.mkdir(parents=True, exist_ok=True)
+    return path
+
+
+def with_public_response_file_instruction(prompt: str, response_path: Path) -> str:
+    return f"""{prompt}
+
+PUBLIC RESPONSE FILE:
+
+Write the final public response that should be posted to GitHub to this file:
+
+{response_path}
+
+The orchestrator will post only that file's contents when it exists and is
+non-empty. Keep internal tool narration, planning notes, diagnostics, and
+scratch output out of that file. Include the required AGENT_STATE / AGENT_PR /
+AGENT_CLARIFY markers in the file, as requested above.
+"""
+
+
+def read_public_response_file(response_path: Path) -> str | None:
+    try:
+        text = response_path.read_text(encoding="utf-8")
+    except FileNotFoundError:
+        return None
+    text = text.strip()
+    return text or None
diff --git a/src/coding_review_agent_loop/agents/claude.py b/src/coding_review_agent_loop/agents/claude.py
index dad2318..888c39d 100644
--- a/src/coding_review_agent_loop/agents/claude.py
+++ b/src/coding_review_agent_loop/agents/claude.py
@@ -6,7 +6,13 @@
 from pathlib import Path
 from typing import TYPE_CHECKING
 
-from .base import AgentName, AgentResult
+from .base import (
+    AgentName,
+    AgentResult,
+    public_response_path,
+    read_public_response_file,
+    with_public_response_file_instruction,
+)
 from ..logging import agent_log_path, log
 from ..runner import Runner
 
@@ -46,12 +52,13 @@ def run(
         prompt: str,
         session_id: str | None = None,
     ) -> AgentResult:
+        response_path = public_response_path(config, "claude")
         args = [config.claude_cmd, "--print", "--output-format", "json", *config.claude_args]
         if session_id:
             args += ["--resume", session_id]
-        args.append(prompt)
+        args.append(with_public_response_file_instruction(prompt, response_path))
         log_path = agent_log_path(config, "claude")
-        log(config, f"Starting Claude in {config.claude_dir}; log: {log_path}")
+        log(config, f"Starting Claude in {config.claude_dir}; log: {log_path}; response: {response_path}")
         result = runner.run_with_log(
             args,
             cwd=config.claude_dir,
@@ -61,7 +68,7 @@ def run(
         )
         log(config, f"Claude finished; log: {log_path}")
         text, new_session_id = _parse_claude_output(result.stdout)
-        return AgentResult(text=text, session_id=new_session_id)
+        return AgentResult(text=read_public_response_file(response_path) or text, session_id=new_session_id)
 
 
 BACKEND = ClaudeBackend()
diff --git a/src/coding_review_agent_loop/agents/gemini.py b/src/coding_review_agent_loop/agents/gemini.py
index 9d9627f..9174e21 100644
--- a/src/coding_review_agent_loop/agents/gemini.py
+++ b/src/coding_review_agent_loop/agents/gemini.py
@@ -6,7 +6,13 @@
 from pathlib import Path
 from typing import TYPE_CHECKING
 
-from .base import AgentName, AgentResult
+from .base import (
+    AgentName,
+    AgentResult,
+    public_response_path,
+    read_public_response_file,
+    with_public_response_file_instruction,
+)
 from ..logging import agent_log_path, log
 from ..protocol import CLARIFY_RE, STATE_RE
 from ..runner import Runner
@@ -94,12 +100,15 @@ def run(
         prompt: str,
         session_id: str | None = None,
     ) -> AgentResult:
+        response_path = public_response_path(config, "gemini")
         log_path = agent_log_path(config, "gemini")
-        log(config, f"Starting Gemini in {config.gemini_dir}; log: {log_path}")
+        log(config, f"Starting Gemini in {config.gemini_dir}; log: {log_path}; response: {response_path}")
         args = [
             config.gemini_cmd,
             "--prompt",
-            _with_public_response_marker_instruction(prompt),
+            _with_public_response_marker_instruction(
+                with_public_response_file_instruction(prompt, response_path)
+            ),
             *config.gemini_args,
         ]
         if session_id:
@@ -113,7 +122,7 @@ def run(
         )
         log(config, f"Gemini finished; log: {log_path}")
         text, new_session_id = _parse_gemini_output(result.stdout)
-        return AgentResult(text=text, session_id=new_session_id)
+        return AgentResult(text=read_public_response_file(response_path) or text, session_id=new_session_id)
 
 
 BACKEND = GeminiBackend()
diff --git a/tests/test_agent_loop.py b/tests/test_agent_loop.py
index 89567f1..8019311 100644
--- a/tests/test_agent_loop.py
+++ b/tests/test_agent_loop.py
@@ -1,4 +1,5 @@
 import json
+import re
 from pathlib import Path
 
 import pytest
@@ -46,6 +47,7 @@ def __init__(
         diff_returncode=0,
         diff_stderr="",
         issue_urls=None,
+        public_response_outputs=None,
     ):
         super().__init__(dry_run=False)
         self.claude_outputs = list(claude_outputs or [])
@@ -83,6 +85,7 @@ def __init__(
         self.diff_returncode = diff_returncode
         self.diff_stderr = diff_stderr
         self.issue_urls = list(issue_urls) if issue_urls is not None else None
+        self.public_response_outputs = list(public_response_outputs or [])
 
     def _record_command(self, args, cwd):
         cmd = [str(arg) for arg in args]
@@ -92,6 +95,17 @@ def _record_command(self, args, cwd):
         self.commands.append((cmd, cwd_path))
         return cmd, cwd_path
 
+    def _maybe_write_public_response_file(self, cmd):
+        if not self.public_response_outputs:
+            return
+        prompt = "\n".join(cmd)
+        match = re.search(r"Write the final public response.*?\n\n([^\n]+/responses/[^\n]+\.md)", prompt, re.S)
+        if not match:
+            return
+        response_path = Path(match.group(1))
+        response_path.parent.mkdir(parents=True, exist_ok=True)
+        response_path.write_text(self.public_response_outputs.pop(0), encoding="utf-8")
+
     def run_with_log(
         self,
         args,
@@ -108,6 +122,7 @@ def run_with_log(
 
         if cmd[:1] == ["claude"]:
             output = self.claude_outputs.pop(0)
+            self._maybe_write_public_response_file(cmd)
             log_path.write_text(f"$ {' '.join(cmd)}\n\n{output}", encoding="utf-8")
             return CommandResult(cmd, cwd_path, output, "", 0)
 
@@ -121,6 +136,7 @@ def run_with_log(
 
         if cmd[:1] == ["gemini"]:
             output = self.gemini_outputs.pop(0)
+            self._maybe_write_public_response_file(cmd)
             log_path.write_text(f"$ {' '.join(cmd)}\n\n{output}", encoding="utf-8")
             return CommandResult(cmd, cwd_path, output, "", 0)
 
@@ -2230,6 +2246,29 @@ def test_gemini_review_loop_uses_prompt_and_extra_args(tmp_path):
     assert runner.comments == ["LGTM.\n<!-- AGENT_STATE: approved -->\n-- Google Gemini"]
 
 
+def test_gemini_review_loop_prefers_public_response_file_over_stdout(tmp_path):
+    runner = FakeRunner(
+        gemini_outputs=[
+            "Warning: True color (24-bit) support not detected.\n"
+            "YOLO mode is enabled. All tool calls will be automatically approved.\n"
+            "I will fetch the PR and inspect the diff.\n"
+            "Error executing tool run_shell_command: confirmation required.\n"
+            "This stdout chatter should not be posted.\n",
+        ],
+        public_response_outputs=[
+            "LGTM from response file.\n<!-- AGENT_STATE: approved -->\n-- Google Gemini",
+        ],
+    )
+    config = make_config(tmp_path, reviewer="gemini")
+
+    assert run_pr_loop(runner, pr_number=77, config=config) == 0
+
+    gemini_call = next(cmd for cmd, _cwd in runner.commands if cmd[:1] == ["gemini"])
+    assert "PUBLIC RESPONSE FILE:" in gemini_call[2]
+    assert "/coding-review-agent-loop/responses/OWNER-REPO/gemini/" in gemini_call[2]
+    assert runner.comments == ["LGTM from response file.\n<!-- AGENT_STATE: approved -->\n-- Google Gemini"]
+
+
 def test_codex_task_loop_rejects_empty_task_text(tmp_path):
     runner = FakeRunner()
     config = make_config(tmp_path, coder="codex", reviewer="claude")

From 6f5c67abd018b5c80080f3df595315fe2ce5627f Mon Sep 17 00:00:00 2001
From: Wild Wind <wwind123@gmail.com>
Date: Sat, 2 May 2026 19:18:36 -0700
Subject: [PATCH 2/2] Add Claude public response regression test

---
 tests/test_agent_loop.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/tests/test_agent_loop.py b/tests/test_agent_loop.py
index 8019311..1c92b3c 100644
--- a/tests/test_agent_loop.py
+++ b/tests/test_agent_loop.py
@@ -2269,6 +2269,33 @@ def test_gemini_review_loop_prefers_public_response_file_over_stdout(tmp_path):
     assert runner.comments == ["LGTM from response file.\n<!-- AGENT_STATE: approved -->\n-- Google Gemini"]
 
 
+def test_claude_review_loop_prefers_public_response_file_over_stdout(tmp_path):
+    runner = FakeRunner(
+        claude_outputs=[
+            json.dumps(
+                {
+                    "result": (
+                        "I will inspect the PR diff.\n"
+                        "Tool output chatter should not be posted.\n"
+                    ),
+                    "session_id": "claude-session-1",
+                }
+            ),
+        ],
+        public_response_outputs=[
+            "LGTM from response file.\n<!-- AGENT_STATE: approved -->\n-- Anthropic Claude",
+        ],
+    )
+    config = make_config(tmp_path, reviewer="claude")
+
+    assert run_pr_loop(runner, pr_number=77, config=config) == 0
+
+    claude_call = next(cmd for cmd, _cwd in runner.commands if cmd[:1] == ["claude"])
+    assert "PUBLIC RESPONSE FILE:" in claude_call[-1]
+    assert "/coding-review-agent-loop/responses/OWNER-REPO/claude/" in claude_call[-1]
+    assert runner.comments == ["LGTM from response file.\n<!-- AGENT_STATE: approved -->\n-- Anthropic Claude"]
+
+
 def test_codex_task_loop_rejects_empty_task_text(tmp_path):
     runner = FakeRunner()
     config = make_config(tmp_path, coder="codex", reviewer="claude")