From e8743a36a3a2b386535ea3577a2ba37fcbeb87e6 Mon Sep 17 00:00:00 2001
From: wittekin <6079900+wittekin@users.noreply.github.com>
Date: Thu, 2 Apr 2026 12:04:52 -0700
Subject: [PATCH 1/2] Release v0.6.3

Fix tasklist lifecycle handling around compaction, remote completion, and inner-loop resume.
---
 CHANGELOG.md                             |   8 +
 pyproject.toml                           |   2 +-
 src/millstone/artifact_providers/file.py |   8 +-
 src/millstone/artifacts/tasklist.py      |  26 ++-
 src/millstone/runtime/orchestrator.py    | 268 +++++++++++++++++++++--
 tests/test_orchestrator.py               | 265 ++++++++++++++++++++++
 tests/test_tasklist.py                   |  58 +++++
 7 files changed, 604 insertions(+), 31 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a2f37bf..14617ea 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,14 @@ The format is based on Keep a Changelog and this project follows Semantic Versio
 
 ## [Unreleased]
 
+## [0.6.3] - 2026-04-02
+
+### Fixed
+- Single-task scope validation now accepts compacted tasklists that move the selected completed item into the `Completed` summary section, avoiding false scope violations after compaction.
+- Inner-loop `--continue` now preserves selected-task identity so resumed builder/reviewer work cannot drift to a different task after a mid-task halt.
+- Successful MCP-backed task runs now explicitly mark the selected remote task done after commit, instead of relying only on the builder prompt to do it.
+- Automatic compaction on tracked tasklists now finalizes its own tasklist rewrite immediately, preventing compaction-only changes from leaking into the next task's diff, review, or commit.
+
 ## [0.6.2] - 2026-03-31
 
 ### Fixed
diff --git a/pyproject.toml b/pyproject.toml
index 91696d4..ffed1dd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "millstone"
-version = "0.6.2"
+version = "0.6.3"
 description = "Orchestrator for agentic coding tools"
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/src/millstone/artifact_providers/file.py b/src/millstone/artifact_providers/file.py
index f8969ac..c263b02 100644
--- a/src/millstone/artifact_providers/file.py
+++ b/src/millstone/artifact_providers/file.py
@@ -676,8 +676,12 @@ def get_prompt_placeholders(self) -> dict[str, str]:
                 "`- [ ]` pending, `- [x]` complete. Select the FIRST unchecked task."
             ),
             "TASKLIST_COMPLETE_INSTRUCTIONS": (
-                f"Mark exactly this one task complete by changing its `- [ ]` to "
-                f"`- [x]` in `{self.path}` and stop. Do not modify any other tasks."
+                f"Mark exactly this one task complete in `{self.path}` and stop. "
+                "Normally, change its `- [ ]` to `- [x]`. "
+                "If the file has a `Completed` summary section instead of listing older "
+                "finished tasks individually, you may instead remove the selected task from "
+                "the pending list and update only that `Completed` summary. Do not modify "
+                "any other pending tasks."
             ),
             "TASKLIST_REWRITE_INSTRUCTIONS": (
                 f"Write the entire compacted content back to `{self.path}`, "
diff --git a/src/millstone/artifacts/tasklist.py b/src/millstone/artifacts/tasklist.py
index 2ac050c..145f8af 100644
--- a/src/millstone/artifacts/tasklist.py
+++ b/src/millstone/artifacts/tasklist.py
@@ -414,6 +414,10 @@ def _extract_tasks(self, content: str) -> list[tuple[bool, str]]:
             tasks.append((checked.lower() == "x", task_text.strip()))
         return tasks
 
+    def _has_completed_summary_section(self, content: str) -> bool:
+        """Return True when the tasklist already uses a compacted completed summary."""
+        return bool(re.search(r"^#{2,6}\s+Completed\b", content, re.MULTILINE | re.IGNORECASE))
+
     def validate_single_task_completion(
         self,
         original_content: str,
@@ -430,7 +434,25 @@ def validate_single_task_completion(
         if not original_tasks or not new_tasks:
             return True, ""
 
+        first_unchecked = next(
+            (i for i, (checked, _) in enumerate(original_tasks) if not checked),
+            None,
+        )
+
         if len(new_tasks) < len(original_tasks):
+            if (
+                first_unchecked is not None
+                and len(new_tasks) == len(original_tasks) - 1
+                and (
+                    self._has_completed_summary_section(original_content)
+                    or self._has_completed_summary_section(new_content)
+                )
+            ):
+                expected_tasks = (
+                    original_tasks[:first_unchecked] + original_tasks[first_unchecked + 1 :]
+                )
+                if new_tasks == expected_tasks:
+                    return True, ""
             return (
                 False,
                 f"Task count decreased: {len(original_tasks)} -> {len(new_tasks)}. Deleting tasks is not allowed.",
@@ -457,10 +479,6 @@ def validate_single_task_completion(
                 f"Multiple tasks were marked complete ({len(checkoffs)}).",
             )
 
-        first_unchecked = next(
-            (i for i, (checked, _) in enumerate(original_tasks) if not checked),
-            None,
-        )
         if first_unchecked is None:
             return False, "No unchecked tasks remained, but a task was marked complete."
 
diff --git a/src/millstone/runtime/orchestrator.py b/src/millstone/runtime/orchestrator.py
index d50eea4..e59ce97 100755
--- a/src/millstone/runtime/orchestrator.py
+++ b/src/millstone/runtime/orchestrator.py
@@ -528,6 +528,7 @@ def __init__(
         }  # Aggregated severity counts
         self._current_task_text: str = ""  # Current task text for review metrics
         self._selected_task_item: TasklistItem | None = None
+        self._resumed_task_state: dict[str, Any] | None = None
         self._tasklist_fix_scope_active: bool = False
         self._task_impossible_decision: ReviewDecision | None = None
         self._task_previous_diff: str | None = (
@@ -998,7 +999,7 @@ def _get_state_file_path(self) -> Path:
 
     def _build_state_snapshot(self, halt_reason: str = "") -> dict[str, Any]:
         """Build the standard state payload for resumable halts."""
-        return {
+        state = {
             "current_task_num": self.current_task_num,
             "builder_session_id": self.builder_session_id,
             "reviewer_session_id": self.reviewer_session_id,
@@ -1009,6 +1010,75 @@ def _build_state_snapshot(self, halt_reason: str = "") -> dict[str, Any]:
             "halt_reason": halt_reason,
             "timestamp": datetime.now().isoformat(),
         }
+        state.update(self._selected_task_state_snapshot())
+        return state
+
+    def _serialize_task_item(self, task: TasklistItem | None) -> dict[str, Any] | None:
+        if task is None:
+            return None
+        return {
+            "task_id": task.task_id,
+            "title": task.title,
+            "status": task.status.value,
+            "design_ref": task.design_ref,
+            "opportunity_ref": task.opportunity_ref,
+            "risk": task.risk,
+            "tests": task.tests,
+            "criteria": task.criteria,
+            "acceptance_criteria": list(task.acceptance_criteria or []),
+            "context": task.context,
+            "raw": task.raw,
+        }
+
+    def _deserialize_task_item(self, payload: dict[str, Any] | None) -> TasklistItem | None:
+        if not payload:
+            return None
+        status_value = payload.get("status", TaskStatus.todo.value)
+        with contextlib.suppress(ValueError):
+            status = TaskStatus(status_value)
+            return TasklistItem(
+                task_id=payload.get("task_id", ""),
+                title=payload.get("title", "") or "task",
+                status=status,
+                design_ref=payload.get("design_ref"),
+                opportunity_ref=payload.get("opportunity_ref"),
+                risk=payload.get("risk"),
+                tests=payload.get("tests"),
+                criteria=payload.get("criteria"),
+                acceptance_criteria=list(payload.get("acceptance_criteria") or []),
+                context=payload.get("context"),
+                raw=payload.get("raw"),
+            )
+        return None
+
+    def _selected_task_state_snapshot(self) -> dict[str, Any]:
+        """Capture selected-task identity so inner-loop resumes keep the same scope."""
+        snapshot = {
+            "selected_task_line": self._selected_task_line,
+            "selected_task_title": self._selected_task_title or self.current_task_title or None,
+            "selected_task_context_file": self._selected_task_context_file,
+            "current_task_id": self._current_task_id,
+            "current_task_text": self._current_task_text,
+            "current_task_group": self.current_task_group,
+            "current_task_risk": self.current_task_risk,
+            "selected_task_item": self._serialize_task_item(self._get_selected_task_item()),
+        }
+        return {k: v for k, v in snapshot.items() if v is not None}
+
+    def _restore_selected_task_state(self, state: dict[str, Any]) -> None:
+        """Restore saved selected-task identity for a resumed inner-loop run."""
+        keys = {
+            "selected_task_line",
+            "selected_task_title",
+            "selected_task_context_file",
+            "current_task_id",
+            "current_task_text",
+            "current_task_group",
+            "current_task_risk",
+            "selected_task_item",
+        }
+        snapshot = {k: state.get(k) for k in keys if state.get(k) is not None}
+        self._resumed_task_state = snapshot or None
 
     def save_state(self, halt_reason: str = "", **extra: object) -> None:
         """Save current orchestration state to state.json.
@@ -2237,16 +2307,12 @@ def _selected_task_scope_block(self) -> str:
         )
 
     def _current_task_acceptance_criteria(self) -> list[str]:
-        from millstone.artifact_providers.file import FileTasklistProvider
-
-        provider = self._outer_loop_manager.tasklist_provider
-        if not isinstance(provider, FileTasklistProvider):
-            task = self._get_selected_task_item()
-            if task is not None:
-                if task.acceptance_criteria:
-                    return task.acceptance_criteria
-                if task.criteria:
-                    return [task.criteria]
+        task = self._get_selected_task_item()
+        if task is not None:
+            if task.acceptance_criteria:
+                return task.acceptance_criteria
+            if task.criteria:
+                return [task.criteria]
         return self.extract_current_task_acceptance_criteria()
 
     def _tasklist_fix_targeted_update_instruction(self) -> str:
@@ -2357,6 +2423,31 @@ def _ensure_tasklist_provider_author_callback(self) -> None:
         ):
             provider.set_agent_callback(lambda p, **k: self.run_agent(p, role="author", **k))
 
+    def _finalize_remote_task_completion(self) -> bool:
+        """Ensure MCP-backed tasks are marked done after a successful task run."""
+        from millstone.artifact_providers.mcp import MCPTasklistProvider
+
+        if self.task or not self._current_task_id:
+            return True
+
+        provider = self._outer_loop_manager.tasklist_provider
+        if not isinstance(provider, MCPTasklistProvider):
+            return True
+
+        try:
+            self._ensure_tasklist_provider_author_callback()
+            provider.update_task_status(self._current_task_id, TaskStatus.done)
+            self.log("remote_task_marked_complete", task_id=self._current_task_id)
+            return True
+        except Exception as exc:
+            self.log(
+                "remote_task_completion_failed",
+                task_id=self._current_task_id,
+                error=str(exc),
+            )
+            progress(f"{self._task_prefix()} ERROR: failed to mark remote task complete: {exc}")
+            return False
+
     def extract_current_task_line(self) -> str:
         if self._selected_task_line:
             return self._selected_task_line
@@ -2500,6 +2591,81 @@ def run_compaction(self) -> bool:
         self.completed_task_count = self._tasklist_manager.completed_task_count
         return result
 
+    def _finalize_automatic_compaction(self) -> bool:
+        """Commit tracked tasklist compaction so it cannot spill into the next task."""
+        from millstone.artifact_providers.mcp import MCPTasklistProvider
+
+        provider = self._outer_loop_manager.tasklist_provider
+        if isinstance(provider, MCPTasklistProvider):
+            return True
+
+        status_lines = self._get_working_tree_status_lines()
+        if status_lines is None:
+            self.log("automatic_compaction_status_failed")
+            progress("Automatic compaction completed, but git status failed.")
+            return False
+        if not status_lines:
+            return True
+
+        tasklist_path = str(self.tasklist)
+        remaining_files = [line.split()[-1] for line in status_lines if line.strip()]
+        unexpected_files = [path for path in remaining_files if path != tasklist_path]
+        if unexpected_files:
+            self.log(
+                "automatic_compaction_unexpected_changes",
+                tasklist=tasklist_path,
+                files=",".join(unexpected_files),
+            )
+            progress(
+                "Automatic compaction left unexpected working-tree changes; halting before the next task."
+            )
+            return False
+
+        add_result = subprocess.run(
+            ["git", "add", "--", tasklist_path],
+            cwd=self.repo_dir,
+            capture_output=True,
+            text=True,
+        )
+        if add_result.returncode != 0:
+            self.log(
+                "automatic_compaction_git_add_failed",
+                tasklist=tasklist_path,
+                error=add_result.stderr.strip(),
+            )
+            progress("Automatic compaction could not stage the tasklist change.")
+            return False
+
+        commit_result = subprocess.run(
+            [
+                "git",
+                "commit",
+                "-m",
+                "Compact completed tasks in tasklist\n\nGenerated with millstone orchestrator",
+            ],
+            cwd=self.repo_dir,
+            capture_output=True,
+            text=True,
+        )
+        if commit_result.returncode != 0:
+            self.log(
+                "automatic_compaction_commit_failed",
+                tasklist=tasklist_path,
+                error=commit_result.stderr.strip(),
+            )
+            progress("Automatic compaction could not commit the tasklist change.")
+            return False
+
+        self._update_loc_baseline()
+        progress("Auto-committed tasklist compaction.")
+        return True
+
+    def _run_automatic_compaction(self) -> bool:
+        """Run compaction and finalize any tracked tasklist changes immediately."""
+        if not self.run_compaction():
+            return False
+        return self._finalize_automatic_compaction()
+
     def run_eval(self, coverage: bool = False, mode: str | None = None) -> dict:
         def _emit_eval_evidence(eval_result: dict) -> None:
             record = make_eval_evidence(
@@ -3695,6 +3861,9 @@ def run_single_task(self) -> bool:
 
         # Determine task text and metadata
         _mcp_task_item: Any = None  # Set when MCP provider supplies the current task
+        restored_task_state = self._resumed_task_state if not self.task else None
+        self._resumed_task_state = None
+
         if self.task:
             task_display = self.task[:50] + "..." if len(self.task) > 50 else self.task
             self.current_task_title = task_display
@@ -3706,6 +3875,17 @@ def run_single_task(self) -> bool:
             self._selected_task_title = self.current_task_title
             self._selected_task_line = task_text
             self._selected_task_context_file = task_metadata.get("context")
+        elif restored_task_state:
+            self.current_task_title = restored_task_state.get("selected_task_title") or "task"
+            task_text = restored_task_state.get("current_task_text") or self.current_task_title
+            self.apply_risk_settings(restored_task_state.get("current_task_risk"))
+            self.current_task_group = restored_task_state.get("current_task_group")
+            self._selected_task_title = self.current_task_title
+            self._selected_task_line = restored_task_state.get("selected_task_line") or task_text
+            self._selected_task_context_file = restored_task_state.get("selected_task_context_file")
+            self._selected_task_item = self._deserialize_task_item(
+                restored_task_state.get("selected_task_item")
+            )
         else:
             # When the tasklist is MCP-backed, derive task title and ID from the
             # remote provider's cached task list instead of reading a local file
@@ -3749,7 +3929,9 @@ def run_single_task(self) -> bool:
         self._current_task_text = task_text
         # Prefer canonical task_id from metadata (stable, ontology-compliant identity).
         # When an MCP provider supplied the task, use its remote ID directly.
-        if _mcp_task_item is not None:
+        if restored_task_state and restored_task_state.get("current_task_id"):
+            self._current_task_id = restored_task_state["current_task_id"]
+        elif _mcp_task_item is not None:
             self._current_task_id = _mcp_task_item.task_id
         elif self.task:
             _task_meta = self._tasklist_manager._parse_task_metadata(task_text)
@@ -3764,7 +3946,8 @@ def run_single_task(self) -> bool:
                 re.sub(r"[^a-z0-9]+", "-", task_text.lower()).strip("-")[:30] or None
             )
 
-        self._selected_task_item = _mcp_task_item
+        if _mcp_task_item is not None:
+            self._selected_task_item = _mcp_task_item
         if not self.task and self._selected_task_item is None:
             with contextlib.suppress(Exception):
                 pending_provider_tasks = [
@@ -3900,14 +4083,13 @@ def _worker_finish(
             self.log("research_completed", task=task_text[:200], output_file=str(output_file))
             if not self.task:
                 self.mark_task_complete()
-                # Close remote task for MCP providers — mark_task_complete()
-                # only updates the local tasklist file, which is a no-op when
-                # the task lives on a remote backend (GitHub Issues, Linear, …).
-                from millstone.artifact_providers.mcp import MCPTasklistProvider
-
-                provider = self._outer_loop_manager.tasklist_provider
-                if isinstance(provider, MCPTasklistProvider) and self._current_task_id:
-                    provider.update_task_status(self._current_task_id, TaskStatus.done)
+                if not self._finalize_remote_task_completion():
+                    _worker_finish(
+                        "failed",
+                        review_summary=builder_output[:4000],
+                        error="remote_task_completion",
+                    )
+                    return False
             progress(f"{self._task_prefix()} Research completed -> {output_file}")
             _worker_finish("success", review_summary=builder_output[:4000])
             return True
@@ -4165,6 +4347,10 @@ def builder_on_success(artifact: BuilderArtifact, verdict: BuilderVerdict) -> bo
                 # Update baseline so next task measures LoC from this commit
                 self._update_loc_baseline()
 
+                if not self._finalize_remote_task_completion():
+                    loop_state["failure_reason"] = "remote_task_completion"
+                    return False
+
                 if self.eval_on_commit and not self._run_eval_on_commit(task_text=task_text):
                     loop_state["failure_reason"] = "eval_regression"
                     return False
@@ -4173,6 +4359,9 @@ def builder_on_success(artifact: BuilderArtifact, verdict: BuilderVerdict) -> bo
                 return True
 
             if self.delegate_commit():
+                if not self._finalize_remote_task_completion():
+                    loop_state["failure_reason"] = "remote_task_completion"
+                    return False
                 if self.eval_on_commit and not self._run_eval_on_commit(task_text=task_text):
                     loop_state["failure_reason"] = "eval_regression"
                     return False
@@ -4560,6 +4749,7 @@ def run(self) -> int:
                     # Inner-loop resume (LoC/sensitive-file halt): user has
                     # already reviewed the diff, so skip mechanical checks.
                     self._skip_mechanical_checks = True
+                    self._restore_selected_task_state(state)
             else:
                 print("Warning: --continue specified but no saved state found.")
                 print("Running normally...")
@@ -4620,7 +4810,21 @@ def run(self) -> int:
                         "Skipping startup tasklist compaction because the working tree has uncommitted changes."
                     )
                 else:
-                    self.run_compaction()
+                    if not self._run_automatic_compaction():
+                        self.log(
+                            "run_completed",
+                            result="HALTED",
+                            reason="startup_compaction_failed",
+                            tasks_completed="0",
+                        )
+                        self._print_run_summary(
+                            0,
+                            1,
+                            "startup_compaction_failed",
+                            run_start_time,
+                            remaining="unknown",
+                        )
+                        return 1
 
         # Capture baseline eval if eval_on_commit or eval_on_task is enabled
         # Note: skip_eval only affects eval_on_task gate, not eval_on_commit
@@ -4717,8 +4921,24 @@ def run(self) -> int:
                     # motivation to reorganize and drop unchecked tasks.
                     if not self.task:
                         self.completed_task_count = self.count_completed_tasks()
-                        if self.should_compact():
-                            self.run_compaction()
+                        if self.should_compact() and not self._run_automatic_compaction():
+                            progress(
+                                f"=== HALTED after {tasks_completed} task(s) === automatic compaction failed"
+                            )
+                            self.log(
+                                "run_completed",
+                                result="HALTED",
+                                reason="inter_task_compaction_failed",
+                                tasks_completed=str(tasks_completed),
+                            )
+                            self._print_run_summary(
+                                tasks_completed,
+                                tasks_failed + 1,
+                                "inter_task_compaction_failed",
+                                run_start_time,
+                                remaining=self._remaining_count(task_num),
+                            )
+                            return 1
                 else:
                     tasks_failed += 1
                     # Stop on first failure
diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py
index 7c979d4..0c3baf5 100644
--- a/tests/test_orchestrator.py
+++ b/tests/test_orchestrator.py
@@ -1989,6 +1989,85 @@ def test_run_single_task_research_mode_direct_task_skips_mcp_close(self, temp_re
             finally:
                 orch.cleanup()
 
+    def test_run_single_task_closes_mcp_task_on_success(self, temp_repo):
+        """Approved MCP-backed tasks are explicitly marked done after commit."""
+        from millstone.artifact_providers.mcp import MCPTasklistProvider
+        from millstone.artifacts.models import TasklistItem, TaskStatus
+
+        orch = Orchestrator(tasklist="docs/tasklist.md", quiet=True)
+        try:
+            mock_mcp = MagicMock(spec=MCPTasklistProvider)
+            mock_mcp._agent_callback = None
+            task = TasklistItem(task_id="GH-42", title="Remote task", status=TaskStatus.todo)
+            mock_mcp.get_prompt_placeholders.return_value = {}
+            mock_mcp.list_tasks.return_value = [task]
+            mock_mcp.get_task.return_value = task
+            orch._outer_loop_manager.tasklist_provider = mock_mcp
+
+            def fake_run_agent(prompt, role="default", **kwargs):
+                if role == "author":
+                    (temp_repo / "impl.py").write_text("value = 1\n")
+                    return "Implemented remote task."
+                if role == "reviewer":
+                    return (
+                        '{"status":"APPROVED","review":"ok","summary":"ok",'
+                        '"findings":[],"findings_by_severity":{"critical":[],"high":[],'
+                        '"medium":[],"low":[],"nit":[]},"impossible_condition":null,'
+                        '"tasklist_fix_recommendation":null}'
+                    )
+                raise AssertionError(f"Unexpected role: {role}")
+
+            with (
+                patch.object(orch, "run_agent", side_effect=fake_run_agent),
+                patch.object(orch, "sanity_check_impl", return_value=True),
+                patch.object(orch, "delegate_commit", return_value=True),
+            ):
+                assert orch.run_single_task() is True
+
+            mock_mcp.update_task_status.assert_called_once_with("GH-42", TaskStatus.done)
+        finally:
+            orch.cleanup()
+
+    def test_run_single_task_fails_when_mcp_completion_update_fails(self, temp_repo):
+        """Remote completion failures are surfaced instead of silently ignored."""
+        from millstone.artifact_providers.mcp import MCPTasklistProvider
+        from millstone.artifacts.models import TasklistItem, TaskStatus
+
+        orch = Orchestrator(tasklist="docs/tasklist.md", quiet=True)
+        try:
+            mock_mcp = MagicMock(spec=MCPTasklistProvider)
+            mock_mcp._agent_callback = None
+            task = TasklistItem(task_id="GH-42", title="Remote task", status=TaskStatus.todo)
+            mock_mcp.get_prompt_placeholders.return_value = {}
+            mock_mcp.list_tasks.return_value = [task]
+            mock_mcp.get_task.return_value = task
+            mock_mcp.update_task_status.side_effect = RuntimeError("remote outage")
+            orch._outer_loop_manager.tasklist_provider = mock_mcp
+
+            def fake_run_agent(prompt, role="default", **kwargs):
+                if role == "author":
+                    (temp_repo / "impl.py").write_text("value = 1\n")
+                    return "Implemented remote task."
+                if role == "reviewer":
+                    return (
+                        '{"status":"APPROVED","review":"ok","summary":"ok",'
+                        '"findings":[],"findings_by_severity":{"critical":[],"high":[],'
+                        '"medium":[],"low":[],"nit":[]},"impossible_condition":null,'
+                        '"tasklist_fix_recommendation":null}'
+                    )
+                raise AssertionError(f"Unexpected role: {role}")
+
+            with (
+                patch.object(orch, "run_agent", side_effect=fake_run_agent),
+                patch.object(orch, "sanity_check_impl", return_value=True),
+                patch.object(orch, "delegate_commit", return_value=True),
+            ):
+                assert orch.run_single_task() is False
+
+            mock_mcp.update_task_status.assert_called_once_with("GH-42", TaskStatus.done)
+        finally:
+            orch.cleanup()
+
 
 class TestIsApproved:
     """Tests for approval detection."""
@@ -3657,6 +3736,57 @@ def mock_compaction(prompt):
         finally:
             orch.cleanup()
 
+    def test_finalize_automatic_compaction_commits_tracked_tasklist(self, temp_repo):
+        """Tracked tasklist compaction is committed immediately and updates baseline."""
+        tasklist_path = temp_repo / "docs" / "tasklist.md"
+        tasklist_path.parent.mkdir(parents=True, exist_ok=True)
+        tasklist_path.write_text("# Tasklist\n\n- [x] Done 1\n- [ ] Pending\n")
+        subprocess.run(
+            ["git", "add", "docs/tasklist.md"], cwd=temp_repo, check=True, capture_output=True
+        )
+        subprocess.run(
+            ["git", "commit", "-m", "Add tracked tasklist"],
+            cwd=temp_repo,
+            check=True,
+            capture_output=True,
+        )
+
+        orch = Orchestrator(tasklist="docs/tasklist.md", quiet=True)
+        try:
+            orch._init_loc_baseline()
+            tasklist_path.write_text("# Tasklist\n\n## Completed\n\nDone 1.\n\n- [ ] Pending\n")
+
+            assert orch._finalize_automatic_compaction() is True
+
+            status = subprocess.run(
+                ["git", "status", "--porcelain"],
+                cwd=temp_repo,
+                check=True,
+                capture_output=True,
+                text=True,
+            ).stdout.strip()
+            assert status == ""
+
+            head = subprocess.run(
+                ["git", "rev-parse", "HEAD"],
+                cwd=temp_repo,
+                check=True,
+                capture_output=True,
+                text=True,
+            ).stdout.strip()
+            assert orch.loc_baseline_ref == head
+
+            message = subprocess.run(
+                ["git", "log", "-1", "--pretty=%B"],
+                cwd=temp_repo,
+                check=True,
+                capture_output=True,
+                text=True,
+            ).stdout
+            assert "Compact completed tasks in tasklist" in message
+        finally:
+            orch.cleanup()
+
 
 class TestCompactionSanityCheck:
     """Tests for compaction sanity check functionality."""
@@ -4767,6 +4897,99 @@ def test_save_state_content(self, temp_repo):
         finally:
             orch.cleanup()
 
+    def test_save_state_persists_selected_task_identity(self, temp_repo):
+        """save_state captures selected-task identity for inner-loop resume."""
+        from millstone.artifacts.models import TasklistItem, TaskStatus
+
+        tasklist = temp_repo / "docs" / "tasklist.md"
+        tasklist.parent.mkdir(parents=True, exist_ok=True)
+        tasklist.write_text("# Tasklist\n\n- [ ] Task 1\n- [ ] Task 2\n")
+
+        orch = Orchestrator(tasklist="docs/tasklist.md")
+        try:
+            orch._selected_task_line = "- [ ] Task 1"
+            orch._selected_task_title = "Task 1"
+            orch._selected_task_context_file = ".millstone/context/task-1.md"
+            orch._current_task_id = "task-1"
+            orch._current_task_text = "Task 1\n  - Risk: low"
+            orch.current_task_group = "Core"
+            orch.current_task_risk = "low"
+            orch._selected_task_item = TasklistItem(
+                task_id="task-1",
+                title="Task 1",
+                status=TaskStatus.todo,
+                acceptance_criteria=["Criterion A"],
+                context=".millstone/context/task-1.md",
+            )
+
+            orch.save_state(halt_reason="policy:loc_threshold")
+
+            state = json.loads(orch._get_state_file_path().read_text())
+            assert state["selected_task_line"] == "- [ ] Task 1"
+            assert state["selected_task_title"] == "Task 1"
+            assert state["current_task_id"] == "task-1"
+            assert state["current_task_group"] == "Core"
+            assert state["current_task_risk"] == "low"
+            assert state["selected_task_item"]["acceptance_criteria"] == ["Criterion A"]
+        finally:
+            orch.cleanup()
+
+    def test_run_single_task_uses_resumed_selected_task_scope(self, temp_repo):
+        """run_single_task preserves the saved selected task on inner-loop resume."""
+        from millstone.artifacts.models import TaskStatus
+
+        tasklist = temp_repo / "docs" / "tasklist.md"
+        tasklist.parent.mkdir(parents=True, exist_ok=True)
+        tasklist.write_text("# Tasklist\n\n- [x] Task 1\n- [ ] Task 2\n")
+
+        orch = Orchestrator(tasklist="docs/tasklist.md", continue_run=True, quiet=True)
+        try:
+            orch._resumed_task_state = {
+                "selected_task_line": "- [ ] Task 1",
+                "selected_task_title": "Task 1",
+                "current_task_id": "task-1",
+                "current_task_text": "Task 1",
+                "selected_task_item": {
+                    "task_id": "task-1",
+                    "title": "Task 1",
+                    "status": TaskStatus.todo.value,
+                    "acceptance_criteria": ["Keep using Task 1"],
+                    "criteria": None,
+                    "design_ref": None,
+                    "opportunity_ref": None,
+                    "risk": None,
+                    "tests": None,
+                    "context": None,
+                    "raw": None,
+                },
+            }
+
+            def fake_run_agent(prompt, role="default", **kwargs):
+                if role == "author":
+                    assert "Task 1" in prompt
+                    assert "Task 2" not in prompt.split("## Selected Task")[-1]
+                    (temp_repo / "impl.py").write_text("value = 1\n")
+                    return "Implemented Task 1."
+                if role == "reviewer":
+                    return (
+                        '{"status":"APPROVED","review":"ok","summary":"ok",'
+                        '"findings":[],"findings_by_severity":{"critical":[],"high":[],'
+                        '"medium":[],"low":[],"nit":[]},"impossible_condition":null,'
+                        '"tasklist_fix_recommendation":null}'
+                    )
+                raise AssertionError(f"Unexpected role: {role}")
+
+            with (
+                patch.object(orch, "run_agent", side_effect=fake_run_agent),
+                patch.object(orch, "sanity_check_impl", return_value=True),
+                patch.object(orch, "delegate_commit", return_value=True),
+            ):
+                assert orch.run_single_task() is True
+                assert orch._current_task_id == "task-1"
+                assert orch.current_task_title == "Task 1"
+        finally:
+            orch.cleanup()
+
     def test_load_state_returns_none_when_no_file(self, temp_repo):
         """load_state returns None when no state file exists."""
         orch = Orchestrator()
@@ -15101,6 +15324,48 @@ def test_mechanical_checks_logs_tasklist_scope_violation(self, temp_repo):
         finally:
             orch.cleanup()
 
+    def test_mechanical_checks_allows_compacted_completed_summary_update(self, temp_repo):
+        """mechanical_checks accepts compacted tasklists that consume only the selected task."""
+        from millstone.runtime.orchestrator import POLICY_FILE_NAME, WORK_DIR_NAME
+
+        config_dir = temp_repo / WORK_DIR_NAME
+        config_dir.mkdir(exist_ok=True)
+        policy_file = config_dir / POLICY_FILE_NAME
+        policy_file.write_text("""
+[limits]
+max_loc_per_task = 10000
+
+[tasklist]
+enforce_single_task = true
+""")
+
+        orch = Orchestrator()
+        try:
+            tasklist_path = temp_repo / ".millstone" / "tasklist.md"
+            baseline = (
+                "# Tasklist\n\n"
+                "## Completed\n\n"
+                "Earlier work is summarized here.\n\n"
+                "## Remaining\n\n"
+                "- [ ] Task 1: Do something\n"
+                "- [ ] Task 2: Do another thing\n"
+            )
+            tasklist_path.write_text(baseline)
+            orch._tasklist_baseline = baseline
+
+            tasklist_path.write_text(
+                "# Tasklist\n\n"
+                "## Completed\n\n"
+                "Earlier work is summarized here. Task 1 is now complete.\n\n"
+                "## Remaining\n\n"
+                "- [ ] Task 2: Do another thing\n"
+            )
+
+            result = orch.mechanical_checks()
+            assert result is True
+        finally:
+            orch.cleanup()
+
     def test_policy_violation_logged(self, temp_repo):
         """Policy violations are logged with specific rule info."""
         from millstone.runtime.orchestrator import POLICY_FILE_NAME, WORK_DIR_NAME
diff --git a/tests/test_tasklist.py b/tests/test_tasklist.py
index 88f07ba..7953369 100644
--- a/tests/test_tasklist.py
+++ b/tests/test_tasklist.py
@@ -184,3 +184,61 @@ def test_extract_current_task_no_file(self, temp_repo):
         mgr = TasklistManager(repo_dir=temp_repo)
         result = mgr.extract_current_task_acceptance_criteria()
         assert result == []
+
+
+class TestSingleTaskCompletionValidation:
+    def test_allows_first_unchecked_to_move_into_completed_summary(self, temp_repo):
+        mgr = TasklistManager(repo_dir=temp_repo)
+        original = (
+            "# Tasklist\n\n"
+            "## Completed\n\n"
+            "Earlier work is summarized here.\n\n"
+            "## Remaining\n\n"
+            "- [ ] Task 1\n"
+            "- [ ] Task 2\n"
+        )
+        new = (
+            "# Tasklist\n\n"
+            "## Completed\n\n"
+            "Earlier work is summarized here. Task 1 is now complete.\n\n"
+            "## Remaining\n\n"
+            "- [ ] Task 2\n"
+        )
+
+        valid, reason = mgr.validate_single_task_completion(original, new)
+
+        assert valid is True
+        assert reason == ""
+
+    def test_rejects_task_removal_without_completed_summary(self, temp_repo):
+        mgr = TasklistManager(repo_dir=temp_repo)
+        original = "# Tasklist\n\n- [ ] Task 1\n- [ ] Task 2\n"
+        new = "# Tasklist\n\n- [ ] Task 2\n"
+
+        valid, reason = mgr.validate_single_task_completion(original, new)
+
+        assert valid is False
+        assert "Task count decreased" in reason
+
+    def test_rejects_removing_later_task_from_completed_summary_mode(self, temp_repo):
+        mgr = TasklistManager(repo_dir=temp_repo)
+        original = (
+            "# Tasklist\n\n"
+            "## Completed\n\n"
+            "Earlier work is summarized here.\n\n"
+            "## Remaining\n\n"
+            "- [ ] Task 1\n"
+            "- [ ] Task 2\n"
+        )
+        new = (
+            "# Tasklist\n\n"
+            "## Completed\n\n"
+            "Earlier work is summarized here. Task 2 is now complete.\n\n"
+            "## Remaining\n\n"
+            "- [ ] Task 1\n"
+        )
+
+        valid, reason = mgr.validate_single_task_completion(original, new)
+
+        assert valid is False
+        assert "Task count decreased" in reason

From 182f6ca721112e70ed507f56b51df55386e08899 Mon Sep 17 00:00:00 2001
From: wittekin <6079900+wittekin@users.noreply.github.com>
Date: Thu, 2 Apr 2026 12:14:52 -0700
Subject: [PATCH 2/2] Release v0.6.4

Gate direct MCP task completion on C2 transactional capability and fix MCP release docs.
---
 CHANGELOG.md                          |  6 ++
 docs/providers/mcp.md                 |  4 +-
 pyproject.toml                        |  2 +-
 src/millstone/runtime/orchestrator.py | 21 ++++++-
 tests/test_orchestrator.py            | 85 ++++++++++++++++++++++++++-
 5 files changed, 112 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 14617ea..ca71092 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,12 @@ The format is based on Keep a Changelog and this project follows Semantic Versio
 
 ## [Unreleased]
 
+## [0.6.4] - 2026-04-02
+
+### Fixed
+- MCP-backed task completion finalization now only performs a direct provider-side status update when the active profile permits `C2_remote_bounded` transactional effects, avoiding false task failures under the default `C1_local_write` profile while preserving explicit remote close-out for C2-capable runs.
+- Corrected MCP provider documentation to reflect the actual default profile behavior for direct provider-side remote writes.
+
 ## [0.6.3] - 2026-04-02
 
 ### Fixed
diff --git a/docs/providers/mcp.md b/docs/providers/mcp.md
index 97ec478..7608ba6 100644
--- a/docs/providers/mcp.md
+++ b/docs/providers/mcp.md
@@ -144,7 +144,9 @@ Any MCP server your agent supports works — the `mcp_server` value is passed th
 
 ## Effect policy
 
-All MCP operations are classified as `EffectClass.transactional` (C2). Under the default `DEV_IMPLEMENTATION` profile these are permitted. If you have a stricter policy configured (e.g. `C1_LOCAL_WRITE` only), MCP operations will raise `CapabilityViolation` — update `.millstone/policy.toml` to allow `C2_REMOTE_BOUNDED` effects.
+Millstone-classified MCP write operations are `EffectClass.transactional` (`C2_remote_bounded`). The default `dev_implementation` profile is `C1_local_write`, so direct provider writes that millstone issues itself are not permitted unless you configure a `C2_remote_bounded` profile with `transactional` effects allowlisted.
+
+In the default profile, prompt-driven MCP actions performed by the coding agent can still run because the agent is executing those tool calls from the rendered prompt, not through millstone's direct provider-effect path. Orchestrator-driven follow-up writes, such as explicit post-commit remote task completion, are skipped unless the active profile allows those transactional effects.
 
 ---
 
diff --git a/pyproject.toml b/pyproject.toml
index ffed1dd..886a231 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "millstone"
-version = "0.6.3"
+version = "0.6.4"
 description = "Orchestrator for agentic coding tools"
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/src/millstone/runtime/orchestrator.py b/src/millstone/runtime/orchestrator.py
index e59ce97..3ae3cc6 100755
--- a/src/millstone/runtime/orchestrator.py
+++ b/src/millstone/runtime/orchestrator.py
@@ -55,7 +55,7 @@
 from millstone.loops.outer import OuterLoopManager
 from millstone.loops.registry_adapter import LoopRegistryAdapter
 from millstone.policy.capability import CapabilityPolicyGate, CapabilityTier
-from millstone.policy.effects import EffectIntent, EffectPolicyGate, NoOpEffectProvider
+from millstone.policy.effects import EffectClass, EffectIntent, EffectPolicyGate, NoOpEffectProvider
 from millstone.policy.schemas import (
     ReviewDecision,
     ReviewStatus,
@@ -2423,6 +2423,16 @@ def _ensure_tasklist_provider_author_callback(self) -> None:
         ):
             provider.set_agent_callback(lambda p, **k: self.run_agent(p, role="author", **k))
 
+    def _can_finalize_remote_task_completion(self) -> tuple[bool, str | None]:
+        """Check whether the active profile may perform direct remote task completion."""
+        try:
+            self._capability_gate.assert_permitted(CapabilityTier.C2_REMOTE_BOUNDED)
+        except Exception as exc:
+            return False, str(exc)
+        if EffectClass.transactional not in self.profile.permitted_effect_classes:
+            return False, "Effect class transactional is not in permitted_effect_classes"
+        return True, None
+
     def _finalize_remote_task_completion(self) -> bool:
         """Ensure MCP-backed tasks are marked done after a successful task run."""
         from millstone.artifact_providers.mcp import MCPTasklistProvider
@@ -2434,6 +2444,15 @@ def _finalize_remote_task_completion(self) -> bool:
         if not isinstance(provider, MCPTasklistProvider):
             return True
 
+        allowed, reason = self._can_finalize_remote_task_completion()
+        if not allowed:
+            self.log(
+                "remote_task_completion_skipped",
+                task_id=self._current_task_id,
+                reason=reason,
+            )
+            return True
+
         try:
             self._ensure_tasklist_provider_author_callback()
             provider.update_task_status(self._current_task_id, TaskStatus.done)
diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py
index 0c3baf5..8939b6e 100644
--- a/tests/test_orchestrator.py
+++ b/tests/test_orchestrator.py
@@ -1944,7 +1944,23 @@ def test_run_single_task_research_mode_closes_mcp_task(self, temp_repo):
                 stderr="",
             )
 
-            orch = Orchestrator(tasklist="docs/tasklist.md", research=True, cli="claude")
+            profile = Profile(
+                id="test_c2_transactional_research",
+                name="C2 Transactional Research Profile",
+                role_aliases={"builder": "author"},
+                capability_tier=CapabilityTier.C2_REMOTE_BOUNDED,
+                permitted_effect_classes=frozenset({EffectClass.transactional}),
+            )
+            registry = ProfileRegistry()
+            registry.register(profile)
+
+            with patch("millstone.runtime.orchestrator.ProfileRegistry", return_value=registry):
+                orch = Orchestrator(
+                    tasklist="docs/tasklist.md",
+                    research=True,
+                    cli="claude",
+                    profile=profile.id,
+                )
             try:
                 # Inject a mock MCP tasklist provider with a known remote task ID
                 mock_mcp = MagicMock(spec=MCPTasklistProvider)
@@ -1994,7 +2010,18 @@ def test_run_single_task_closes_mcp_task_on_success(self, temp_repo):
         from millstone.artifact_providers.mcp import MCPTasklistProvider
         from millstone.artifacts.models import TasklistItem, TaskStatus
 
-        orch = Orchestrator(tasklist="docs/tasklist.md", quiet=True)
+        profile = Profile(
+            id="test_c2_transactional",
+            name="C2 Transactional Profile",
+            role_aliases={"builder": "author"},
+            capability_tier=CapabilityTier.C2_REMOTE_BOUNDED,
+            permitted_effect_classes=frozenset({EffectClass.transactional}),
+        )
+        registry = ProfileRegistry()
+        registry.register(profile)
+
+        with patch("millstone.runtime.orchestrator.ProfileRegistry", return_value=registry):
+            orch = Orchestrator(tasklist="docs/tasklist.md", profile=profile.id, quiet=True)
         try:
             mock_mcp = MagicMock(spec=MCPTasklistProvider)
             mock_mcp._agent_callback = None
@@ -2033,7 +2060,18 @@ def test_run_single_task_fails_when_mcp_completion_update_fails(self, temp_repo)
         from millstone.artifact_providers.mcp import MCPTasklistProvider
         from millstone.artifacts.models import TasklistItem, TaskStatus
 
-        orch = Orchestrator(tasklist="docs/tasklist.md", quiet=True)
+        profile = Profile(
+            id="test_c2_transactional_failure",
+            name="C2 Transactional Failure Profile",
+            role_aliases={"builder": "author"},
+            capability_tier=CapabilityTier.C2_REMOTE_BOUNDED,
+            permitted_effect_classes=frozenset({EffectClass.transactional}),
+        )
+        registry = ProfileRegistry()
+        registry.register(profile)
+
+        with patch("millstone.runtime.orchestrator.ProfileRegistry", return_value=registry):
+            orch = Orchestrator(tasklist="docs/tasklist.md", profile=profile.id, quiet=True)
         try:
             mock_mcp = MagicMock(spec=MCPTasklistProvider)
             mock_mcp._agent_callback = None
@@ -2068,6 +2106,47 @@ def fake_run_agent(prompt, role="default", **kwargs):
         finally:
             orch.cleanup()
 
+    def test_run_single_task_skips_explicit_mcp_close_when_profile_disallows_remote_effects(
+        self, temp_repo
+    ):
+        """Default C1 profiles leave MCP completion to the builder prompt path."""
+        from millstone.artifact_providers.mcp import MCPTasklistProvider
+        from millstone.artifacts.models import TasklistItem, TaskStatus
+
+        orch = Orchestrator(tasklist="docs/tasklist.md", quiet=True)
+        try:
+            mock_mcp = MagicMock(spec=MCPTasklistProvider)
+            mock_mcp._agent_callback = None
+            task = TasklistItem(task_id="GH-42", title="Remote task", status=TaskStatus.todo)
+            mock_mcp.get_prompt_placeholders.return_value = {}
+            mock_mcp.list_tasks.return_value = [task]
+            mock_mcp.get_task.return_value = task
+            orch._outer_loop_manager.tasklist_provider = mock_mcp
+
+            def fake_run_agent(prompt, role="default", **kwargs):
+                if role == "author":
+                    (temp_repo / "impl.py").write_text("value = 1\n")
+                    return "Implemented remote task."
+                if role == "reviewer":
+                    return (
+                        '{"status":"APPROVED","review":"ok","summary":"ok",'
+                        '"findings":[],"findings_by_severity":{"critical":[],"high":[],'
+                        '"medium":[],"low":[],"nit":[]},"impossible_condition":null,'
+                        '"tasklist_fix_recommendation":null}'
+                    )
+                raise AssertionError(f"Unexpected role: {role}")
+
+            with (
+                patch.object(orch, "run_agent", side_effect=fake_run_agent),
+                patch.object(orch, "sanity_check_impl", return_value=True),
+                patch.object(orch, "delegate_commit", return_value=True),
+            ):
+                assert orch.run_single_task() is True
+
+            mock_mcp.update_task_status.assert_not_called()
+        finally:
+            orch.cleanup()
+
 
 class TestIsApproved:
     """Tests for approval detection."""