From e8743a36a3a2b386535ea3577a2ba37fcbeb87e6 Mon Sep 17 00:00:00 2001 From: wittekin <6079900+wittekin@users.noreply.github.com> Date: Thu, 2 Apr 2026 12:04:52 -0700 Subject: [PATCH 1/2] Release v0.6.3 Fix tasklist lifecycle handling around compaction, remote completion, and inner-loop resume. --- CHANGELOG.md | 8 + pyproject.toml | 2 +- src/millstone/artifact_providers/file.py | 8 +- src/millstone/artifacts/tasklist.py | 26 ++- src/millstone/runtime/orchestrator.py | 268 +++++++++++++++++++++-- tests/test_orchestrator.py | 265 ++++++++++++++++++++++ tests/test_tasklist.py | 58 +++++ 7 files changed, 604 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a2f37bf..14617ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,14 @@ The format is based on Keep a Changelog and this project follows Semantic Versio ## [Unreleased] +## [0.6.3] - 2026-04-02 + +### Fixed +- Single-task scope validation now accepts compacted tasklists that move the selected completed item into the `Completed` summary section, avoiding false scope violations after compaction. +- Inner-loop `--continue` now preserves selected-task identity so resumed builder/reviewer work cannot drift to a different task after a mid-task halt. +- Successful MCP-backed task runs now explicitly mark the selected remote task done after commit, instead of relying only on the builder prompt to do it. +- Automatic compaction on tracked tasklists now finalizes its own tasklist rewrite immediately, preventing compaction-only changes from leaking into the next task's diff, review, or commit. + ## [0.6.2] - 2026-03-31 ### Fixed diff --git a/pyproject.toml b/pyproject.toml index 91696d4..ffed1dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "millstone" -version = "0.6.2" +version = "0.6.3" description = "Orchestrator for agentic coding tools" readme = "README.md" requires-python = ">=3.10" diff --git a/src/millstone/artifact_providers/file.py b/src/millstone/artifact_providers/file.py index f8969ac..c263b02 100644 --- a/src/millstone/artifact_providers/file.py +++ b/src/millstone/artifact_providers/file.py @@ -676,8 +676,12 @@ def get_prompt_placeholders(self) -> dict[str, str]: "`- [ ]` pending, `- [x]` complete. Select the FIRST unchecked task." ), "TASKLIST_COMPLETE_INSTRUCTIONS": ( - f"Mark exactly this one task complete by changing its `- [ ]` to " - f"`- [x]` in `{self.path}` and stop. Do not modify any other tasks." + f"Mark exactly this one task complete in `{self.path}` and stop. " + "Normally, change its `- [ ]` to `- [x]`. " + "If the file has a `Completed` summary section instead of listing older " + "finished tasks individually, you may instead remove the selected task from " + "the pending list and update only that `Completed` summary. Do not modify " + "any other pending tasks." ), "TASKLIST_REWRITE_INSTRUCTIONS": ( f"Write the entire compacted content back to `{self.path}`, " diff --git a/src/millstone/artifacts/tasklist.py b/src/millstone/artifacts/tasklist.py index 2ac050c..145f8af 100644 --- a/src/millstone/artifacts/tasklist.py +++ b/src/millstone/artifacts/tasklist.py @@ -414,6 +414,10 @@ def _extract_tasks(self, content: str) -> list[tuple[bool, str]]: tasks.append((checked.lower() == "x", task_text.strip())) return tasks + def _has_completed_summary_section(self, content: str) -> bool: + """Return True when the tasklist already uses a compacted completed summary.""" + return bool(re.search(r"^#{2,6}\s+Completed\b", content, re.MULTILINE | re.IGNORECASE)) + def validate_single_task_completion( self, original_content: str, @@ -430,7 +434,25 @@ def validate_single_task_completion( if not original_tasks or not new_tasks: return True, "" + first_unchecked = next( + (i for i, (checked, _) in enumerate(original_tasks) if not checked), + None, + ) + if len(new_tasks) < len(original_tasks): + if ( + first_unchecked is not None + and len(new_tasks) == len(original_tasks) - 1 + and ( + self._has_completed_summary_section(original_content) + or self._has_completed_summary_section(new_content) + ) + ): + expected_tasks = ( + original_tasks[:first_unchecked] + original_tasks[first_unchecked + 1 :] + ) + if new_tasks == expected_tasks: + return True, "" return ( False, f"Task count decreased: {len(original_tasks)} -> {len(new_tasks)}. Deleting tasks is not allowed.", @@ -457,10 +479,6 @@ def validate_single_task_completion( f"Multiple tasks were marked complete ({len(checkoffs)}).", ) - first_unchecked = next( - (i for i, (checked, _) in enumerate(original_tasks) if not checked), - None, - ) if first_unchecked is None: return False, "No unchecked tasks remained, but a task was marked complete." diff --git a/src/millstone/runtime/orchestrator.py b/src/millstone/runtime/orchestrator.py index d50eea4..e59ce97 100755 --- a/src/millstone/runtime/orchestrator.py +++ b/src/millstone/runtime/orchestrator.py @@ -528,6 +528,7 @@ def __init__( } # Aggregated severity counts self._current_task_text: str = "" # Current task text for review metrics self._selected_task_item: TasklistItem | None = None + self._resumed_task_state: dict[str, Any] | None = None self._tasklist_fix_scope_active: bool = False self._task_impossible_decision: ReviewDecision | None = None self._task_previous_diff: str | None = ( @@ -998,7 +999,7 @@ def _get_state_file_path(self) -> Path: def _build_state_snapshot(self, halt_reason: str = "") -> dict[str, Any]: """Build the standard state payload for resumable halts.""" - return { + state = { "current_task_num": self.current_task_num, "builder_session_id": self.builder_session_id, "reviewer_session_id": self.reviewer_session_id, @@ -1009,6 +1010,75 @@ def _build_state_snapshot(self, halt_reason: str = "") -> dict[str, Any]: "halt_reason": halt_reason, "timestamp": datetime.now().isoformat(), } + state.update(self._selected_task_state_snapshot()) + return state + + def _serialize_task_item(self, task: TasklistItem | None) -> dict[str, Any] | None: + if task is None: + return None + return { + "task_id": task.task_id, + "title": task.title, + "status": task.status.value, + "design_ref": task.design_ref, + "opportunity_ref": task.opportunity_ref, + "risk": task.risk, + "tests": task.tests, + "criteria": task.criteria, + "acceptance_criteria": list(task.acceptance_criteria or []), + "context": task.context, + "raw": task.raw, + } + + def _deserialize_task_item(self, payload: dict[str, Any] | None) -> TasklistItem | None: + if not payload: + return None + status_value = payload.get("status", TaskStatus.todo.value) + with contextlib.suppress(ValueError): + status = TaskStatus(status_value) + return TasklistItem( + task_id=payload.get("task_id", ""), + title=payload.get("title", "") or "task", + status=status, + design_ref=payload.get("design_ref"), + opportunity_ref=payload.get("opportunity_ref"), + risk=payload.get("risk"), + tests=payload.get("tests"), + criteria=payload.get("criteria"), + acceptance_criteria=list(payload.get("acceptance_criteria") or []), + context=payload.get("context"), + raw=payload.get("raw"), + ) + return None + + def _selected_task_state_snapshot(self) -> dict[str, Any]: + """Capture selected-task identity so inner-loop resumes keep the same scope.""" + snapshot = { + "selected_task_line": self._selected_task_line, + "selected_task_title": self._selected_task_title or self.current_task_title or None, + "selected_task_context_file": self._selected_task_context_file, + "current_task_id": self._current_task_id, + "current_task_text": self._current_task_text, + "current_task_group": self.current_task_group, + "current_task_risk": self.current_task_risk, + "selected_task_item": self._serialize_task_item(self._get_selected_task_item()), + } + return {k: v for k, v in snapshot.items() if v is not None} + + def _restore_selected_task_state(self, state: dict[str, Any]) -> None: + """Restore saved selected-task identity for a resumed inner-loop run.""" + keys = { + "selected_task_line", + "selected_task_title", + "selected_task_context_file", + "current_task_id", + "current_task_text", + "current_task_group", + "current_task_risk", + "selected_task_item", + } + snapshot = {k: state.get(k) for k in keys if state.get(k) is not None} + self._resumed_task_state = snapshot or None def save_state(self, halt_reason: str = "", **extra: object) -> None: """Save current orchestration state to state.json. @@ -2237,16 +2307,12 @@ def _selected_task_scope_block(self) -> str: ) def _current_task_acceptance_criteria(self) -> list[str]: - from millstone.artifact_providers.file import FileTasklistProvider - - provider = self._outer_loop_manager.tasklist_provider - if not isinstance(provider, FileTasklistProvider): - task = self._get_selected_task_item() - if task is not None: - if task.acceptance_criteria: - return task.acceptance_criteria - if task.criteria: - return [task.criteria] + task = self._get_selected_task_item() + if task is not None: + if task.acceptance_criteria: + return task.acceptance_criteria + if task.criteria: + return [task.criteria] return self.extract_current_task_acceptance_criteria() def _tasklist_fix_targeted_update_instruction(self) -> str: @@ -2357,6 +2423,31 @@ def _ensure_tasklist_provider_author_callback(self) -> None: ): provider.set_agent_callback(lambda p, **k: self.run_agent(p, role="author", **k)) + def _finalize_remote_task_completion(self) -> bool: + """Ensure MCP-backed tasks are marked done after a successful task run.""" + from millstone.artifact_providers.mcp import MCPTasklistProvider + + if self.task or not self._current_task_id: + return True + + provider = self._outer_loop_manager.tasklist_provider + if not isinstance(provider, MCPTasklistProvider): + return True + + try: + self._ensure_tasklist_provider_author_callback() + provider.update_task_status(self._current_task_id, TaskStatus.done) + self.log("remote_task_marked_complete", task_id=self._current_task_id) + return True + except Exception as exc: + self.log( + "remote_task_completion_failed", + task_id=self._current_task_id, + error=str(exc), + ) + progress(f"{self._task_prefix()} ERROR: failed to mark remote task complete: {exc}") + return False + def extract_current_task_line(self) -> str: if self._selected_task_line: return self._selected_task_line @@ -2500,6 +2591,81 @@ def run_compaction(self) -> bool: self.completed_task_count = self._tasklist_manager.completed_task_count return result + def _finalize_automatic_compaction(self) -> bool: + """Commit tracked tasklist compaction so it cannot spill into the next task.""" + from millstone.artifact_providers.mcp import MCPTasklistProvider + + provider = self._outer_loop_manager.tasklist_provider + if isinstance(provider, MCPTasklistProvider): + return True + + status_lines = self._get_working_tree_status_lines() + if status_lines is None: + self.log("automatic_compaction_status_failed") + progress("Automatic compaction completed, but git status failed.") + return False + if not status_lines: + return True + + tasklist_path = str(self.tasklist) + remaining_files = [line.split()[-1] for line in status_lines if line.strip()] + unexpected_files = [path for path in remaining_files if path != tasklist_path] + if unexpected_files: + self.log( + "automatic_compaction_unexpected_changes", + tasklist=tasklist_path, + files=",".join(unexpected_files), + ) + progress( + "Automatic compaction left unexpected working-tree changes; halting before the next task." + ) + return False + + add_result = subprocess.run( + ["git", "add", "--", tasklist_path], + cwd=self.repo_dir, + capture_output=True, + text=True, + ) + if add_result.returncode != 0: + self.log( + "automatic_compaction_git_add_failed", + tasklist=tasklist_path, + error=add_result.stderr.strip(), + ) + progress("Automatic compaction could not stage the tasklist change.") + return False + + commit_result = subprocess.run( + [ + "git", + "commit", + "-m", + "Compact completed tasks in tasklist\n\nGenerated with millstone orchestrator", + ], + cwd=self.repo_dir, + capture_output=True, + text=True, + ) + if commit_result.returncode != 0: + self.log( + "automatic_compaction_commit_failed", + tasklist=tasklist_path, + error=commit_result.stderr.strip(), + ) + progress("Automatic compaction could not commit the tasklist change.") + return False + + self._update_loc_baseline() + progress("Auto-committed tasklist compaction.") + return True + + def _run_automatic_compaction(self) -> bool: + """Run compaction and finalize any tracked tasklist changes immediately.""" + if not self.run_compaction(): + return False + return self._finalize_automatic_compaction() + def run_eval(self, coverage: bool = False, mode: str | None = None) -> dict: def _emit_eval_evidence(eval_result: dict) -> None: record = make_eval_evidence( @@ -3695,6 +3861,9 @@ def run_single_task(self) -> bool: # Determine task text and metadata _mcp_task_item: Any = None # Set when MCP provider supplies the current task + restored_task_state = self._resumed_task_state if not self.task else None + self._resumed_task_state = None + if self.task: task_display = self.task[:50] + "..." if len(self.task) > 50 else self.task self.current_task_title = task_display @@ -3706,6 +3875,17 @@ def run_single_task(self) -> bool: self._selected_task_title = self.current_task_title self._selected_task_line = task_text self._selected_task_context_file = task_metadata.get("context") + elif restored_task_state: + self.current_task_title = restored_task_state.get("selected_task_title") or "task" + task_text = restored_task_state.get("current_task_text") or self.current_task_title + self.apply_risk_settings(restored_task_state.get("current_task_risk")) + self.current_task_group = restored_task_state.get("current_task_group") + self._selected_task_title = self.current_task_title + self._selected_task_line = restored_task_state.get("selected_task_line") or task_text + self._selected_task_context_file = restored_task_state.get("selected_task_context_file") + self._selected_task_item = self._deserialize_task_item( + restored_task_state.get("selected_task_item") + ) else: # When the tasklist is MCP-backed, derive task title and ID from the # remote provider's cached task list instead of reading a local file @@ -3749,7 +3929,9 @@ def run_single_task(self) -> bool: self._current_task_text = task_text # Prefer canonical task_id from metadata (stable, ontology-compliant identity). # When an MCP provider supplied the task, use its remote ID directly. - if _mcp_task_item is not None: + if restored_task_state and restored_task_state.get("current_task_id"): + self._current_task_id = restored_task_state["current_task_id"] + elif _mcp_task_item is not None: self._current_task_id = _mcp_task_item.task_id elif self.task: _task_meta = self._tasklist_manager._parse_task_metadata(task_text) @@ -3764,7 +3946,8 @@ def run_single_task(self) -> bool: re.sub(r"[^a-z0-9]+", "-", task_text.lower()).strip("-")[:30] or None ) - self._selected_task_item = _mcp_task_item + if _mcp_task_item is not None: + self._selected_task_item = _mcp_task_item if not self.task and self._selected_task_item is None: with contextlib.suppress(Exception): pending_provider_tasks = [ @@ -3900,14 +4083,13 @@ def _worker_finish( self.log("research_completed", task=task_text[:200], output_file=str(output_file)) if not self.task: self.mark_task_complete() - # Close remote task for MCP providers — mark_task_complete() - # only updates the local tasklist file, which is a no-op when - # the task lives on a remote backend (GitHub Issues, Linear, …). - from millstone.artifact_providers.mcp import MCPTasklistProvider - - provider = self._outer_loop_manager.tasklist_provider - if isinstance(provider, MCPTasklistProvider) and self._current_task_id: - provider.update_task_status(self._current_task_id, TaskStatus.done) + if not self._finalize_remote_task_completion(): + _worker_finish( + "failed", + review_summary=builder_output[:4000], + error="remote_task_completion", + ) + return False progress(f"{self._task_prefix()} Research completed -> {output_file}") _worker_finish("success", review_summary=builder_output[:4000]) return True @@ -4165,6 +4347,10 @@ def builder_on_success(artifact: BuilderArtifact, verdict: BuilderVerdict) -> bo # Update baseline so next task measures LoC from this commit self._update_loc_baseline() + if not self._finalize_remote_task_completion(): + loop_state["failure_reason"] = "remote_task_completion" + return False + if self.eval_on_commit and not self._run_eval_on_commit(task_text=task_text): loop_state["failure_reason"] = "eval_regression" return False @@ -4173,6 +4359,9 @@ def builder_on_success(artifact: BuilderArtifact, verdict: BuilderVerdict) -> bo return True if self.delegate_commit(): + if not self._finalize_remote_task_completion(): + loop_state["failure_reason"] = "remote_task_completion" + return False if self.eval_on_commit and not self._run_eval_on_commit(task_text=task_text): loop_state["failure_reason"] = "eval_regression" return False @@ -4560,6 +4749,7 @@ def run(self) -> int: # Inner-loop resume (LoC/sensitive-file halt): user has # already reviewed the diff, so skip mechanical checks. self._skip_mechanical_checks = True + self._restore_selected_task_state(state) else: print("Warning: --continue specified but no saved state found.") print("Running normally...") @@ -4620,7 +4810,21 @@ def run(self) -> int: "Skipping startup tasklist compaction because the working tree has uncommitted changes." ) else: - self.run_compaction() + if not self._run_automatic_compaction(): + self.log( + "run_completed", + result="HALTED", + reason="startup_compaction_failed", + tasks_completed="0", + ) + self._print_run_summary( + 0, + 1, + "startup_compaction_failed", + run_start_time, + remaining="unknown", + ) + return 1 # Capture baseline eval if eval_on_commit or eval_on_task is enabled # Note: skip_eval only affects eval_on_task gate, not eval_on_commit @@ -4717,8 +4921,24 @@ def run(self) -> int: # motivation to reorganize and drop unchecked tasks. if not self.task: self.completed_task_count = self.count_completed_tasks() - if self.should_compact(): - self.run_compaction() + if self.should_compact() and not self._run_automatic_compaction(): + progress( + f"=== HALTED after {tasks_completed} task(s) === automatic compaction failed" + ) + self.log( + "run_completed", + result="HALTED", + reason="inter_task_compaction_failed", + tasks_completed=str(tasks_completed), + ) + self._print_run_summary( + tasks_completed, + tasks_failed + 1, + "inter_task_compaction_failed", + run_start_time, + remaining=self._remaining_count(task_num), + ) + return 1 else: tasks_failed += 1 # Stop on first failure diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 7c979d4..0c3baf5 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -1989,6 +1989,85 @@ def test_run_single_task_research_mode_direct_task_skips_mcp_close(self, temp_re finally: orch.cleanup() + def test_run_single_task_closes_mcp_task_on_success(self, temp_repo): + """Approved MCP-backed tasks are explicitly marked done after commit.""" + from millstone.artifact_providers.mcp import MCPTasklistProvider + from millstone.artifacts.models import TasklistItem, TaskStatus + + orch = Orchestrator(tasklist="docs/tasklist.md", quiet=True) + try: + mock_mcp = MagicMock(spec=MCPTasklistProvider) + mock_mcp._agent_callback = None + task = TasklistItem(task_id="GH-42", title="Remote task", status=TaskStatus.todo) + mock_mcp.get_prompt_placeholders.return_value = {} + mock_mcp.list_tasks.return_value = [task] + mock_mcp.get_task.return_value = task + orch._outer_loop_manager.tasklist_provider = mock_mcp + + def fake_run_agent(prompt, role="default", **kwargs): + if role == "author": + (temp_repo / "impl.py").write_text("value = 1\n") + return "Implemented remote task." + if role == "reviewer": + return ( + '{"status":"APPROVED","review":"ok","summary":"ok",' + '"findings":[],"findings_by_severity":{"critical":[],"high":[],' + '"medium":[],"low":[],"nit":[]},"impossible_condition":null,' + '"tasklist_fix_recommendation":null}' + ) + raise AssertionError(f"Unexpected role: {role}") + + with ( + patch.object(orch, "run_agent", side_effect=fake_run_agent), + patch.object(orch, "sanity_check_impl", return_value=True), + patch.object(orch, "delegate_commit", return_value=True), + ): + assert orch.run_single_task() is True + + mock_mcp.update_task_status.assert_called_once_with("GH-42", TaskStatus.done) + finally: + orch.cleanup() + + def test_run_single_task_fails_when_mcp_completion_update_fails(self, temp_repo): + """Remote completion failures are surfaced instead of silently ignored.""" + from millstone.artifact_providers.mcp import MCPTasklistProvider + from millstone.artifacts.models import TasklistItem, TaskStatus + + orch = Orchestrator(tasklist="docs/tasklist.md", quiet=True) + try: + mock_mcp = MagicMock(spec=MCPTasklistProvider) + mock_mcp._agent_callback = None + task = TasklistItem(task_id="GH-42", title="Remote task", status=TaskStatus.todo) + mock_mcp.get_prompt_placeholders.return_value = {} + mock_mcp.list_tasks.return_value = [task] + mock_mcp.get_task.return_value = task + mock_mcp.update_task_status.side_effect = RuntimeError("remote outage") + orch._outer_loop_manager.tasklist_provider = mock_mcp + + def fake_run_agent(prompt, role="default", **kwargs): + if role == "author": + (temp_repo / "impl.py").write_text("value = 1\n") + return "Implemented remote task." + if role == "reviewer": + return ( + '{"status":"APPROVED","review":"ok","summary":"ok",' + '"findings":[],"findings_by_severity":{"critical":[],"high":[],' + '"medium":[],"low":[],"nit":[]},"impossible_condition":null,' + '"tasklist_fix_recommendation":null}' + ) + raise AssertionError(f"Unexpected role: {role}") + + with ( + patch.object(orch, "run_agent", side_effect=fake_run_agent), + patch.object(orch, "sanity_check_impl", return_value=True), + patch.object(orch, "delegate_commit", return_value=True), + ): + assert orch.run_single_task() is False + + mock_mcp.update_task_status.assert_called_once_with("GH-42", TaskStatus.done) + finally: + orch.cleanup() + class TestIsApproved: """Tests for approval detection.""" @@ -3657,6 +3736,57 @@ def mock_compaction(prompt): finally: orch.cleanup() + def test_finalize_automatic_compaction_commits_tracked_tasklist(self, temp_repo): + """Tracked tasklist compaction is committed immediately and updates baseline.""" + tasklist_path = temp_repo / "docs" / "tasklist.md" + tasklist_path.parent.mkdir(parents=True, exist_ok=True) + tasklist_path.write_text("# Tasklist\n\n- [x] Done 1\n- [ ] Pending\n") + subprocess.run( + ["git", "add", "docs/tasklist.md"], cwd=temp_repo, check=True, capture_output=True + ) + subprocess.run( + ["git", "commit", "-m", "Add tracked tasklist"], + cwd=temp_repo, + check=True, + capture_output=True, + ) + + orch = Orchestrator(tasklist="docs/tasklist.md", quiet=True) + try: + orch._init_loc_baseline() + tasklist_path.write_text("# Tasklist\n\n## Completed\n\nDone 1.\n\n- [ ] Pending\n") + + assert orch._finalize_automatic_compaction() is True + + status = subprocess.run( + ["git", "status", "--porcelain"], + cwd=temp_repo, + check=True, + capture_output=True, + text=True, + ).stdout.strip() + assert status == "" + + head = subprocess.run( + ["git", "rev-parse", "HEAD"], + cwd=temp_repo, + check=True, + capture_output=True, + text=True, + ).stdout.strip() + assert orch.loc_baseline_ref == head + + message = subprocess.run( + ["git", "log", "-1", "--pretty=%B"], + cwd=temp_repo, + check=True, + capture_output=True, + text=True, + ).stdout + assert "Compact completed tasks in tasklist" in message + finally: + orch.cleanup() + class TestCompactionSanityCheck: """Tests for compaction sanity check functionality.""" @@ -4767,6 +4897,99 @@ def test_save_state_content(self, temp_repo): finally: orch.cleanup() + def test_save_state_persists_selected_task_identity(self, temp_repo): + """save_state captures selected-task identity for inner-loop resume.""" + from millstone.artifacts.models import TasklistItem, TaskStatus + + tasklist = temp_repo / "docs" / "tasklist.md" + tasklist.parent.mkdir(parents=True, exist_ok=True) + tasklist.write_text("# Tasklist\n\n- [ ] Task 1\n- [ ] Task 2\n") + + orch = Orchestrator(tasklist="docs/tasklist.md") + try: + orch._selected_task_line = "- [ ] Task 1" + orch._selected_task_title = "Task 1" + orch._selected_task_context_file = ".millstone/context/task-1.md" + orch._current_task_id = "task-1" + orch._current_task_text = "Task 1\n - Risk: low" + orch.current_task_group = "Core" + orch.current_task_risk = "low" + orch._selected_task_item = TasklistItem( + task_id="task-1", + title="Task 1", + status=TaskStatus.todo, + acceptance_criteria=["Criterion A"], + context=".millstone/context/task-1.md", + ) + + orch.save_state(halt_reason="policy:loc_threshold") + + state = json.loads(orch._get_state_file_path().read_text()) + assert state["selected_task_line"] == "- [ ] Task 1" + assert state["selected_task_title"] == "Task 1" + assert state["current_task_id"] == "task-1" + assert state["current_task_group"] == "Core" + assert state["current_task_risk"] == "low" + assert state["selected_task_item"]["acceptance_criteria"] == ["Criterion A"] + finally: + orch.cleanup() + + def test_run_single_task_uses_resumed_selected_task_scope(self, temp_repo): + """run_single_task preserves the saved selected task on inner-loop resume.""" + from millstone.artifacts.models import TaskStatus + + tasklist = temp_repo / "docs" / "tasklist.md" + tasklist.parent.mkdir(parents=True, exist_ok=True) + tasklist.write_text("# Tasklist\n\n- [x] Task 1\n- [ ] Task 2\n") + + orch = Orchestrator(tasklist="docs/tasklist.md", continue_run=True, quiet=True) + try: + orch._resumed_task_state = { + "selected_task_line": "- [ ] Task 1", + "selected_task_title": "Task 1", + "current_task_id": "task-1", + "current_task_text": "Task 1", + "selected_task_item": { + "task_id": "task-1", + "title": "Task 1", + "status": TaskStatus.todo.value, + "acceptance_criteria": ["Keep using Task 1"], + "criteria": None, + "design_ref": None, + "opportunity_ref": None, + "risk": None, + "tests": None, + "context": None, + "raw": None, + }, + } + + def fake_run_agent(prompt, role="default", **kwargs): + if role == "author": + assert "Task 1" in prompt + assert "Task 2" not in prompt.split("## Selected Task")[-1] + (temp_repo / "impl.py").write_text("value = 1\n") + return "Implemented Task 1." + if role == "reviewer": + return ( + '{"status":"APPROVED","review":"ok","summary":"ok",' + '"findings":[],"findings_by_severity":{"critical":[],"high":[],' + '"medium":[],"low":[],"nit":[]},"impossible_condition":null,' + '"tasklist_fix_recommendation":null}' + ) + raise AssertionError(f"Unexpected role: {role}") + + with ( + patch.object(orch, "run_agent", side_effect=fake_run_agent), + patch.object(orch, "sanity_check_impl", return_value=True), + patch.object(orch, "delegate_commit", return_value=True), + ): + assert orch.run_single_task() is True + assert orch._current_task_id == "task-1" + assert orch.current_task_title == "Task 1" + finally: + orch.cleanup() + def test_load_state_returns_none_when_no_file(self, temp_repo): """load_state returns None when no state file exists.""" orch = Orchestrator() @@ -15101,6 +15324,48 @@ def test_mechanical_checks_logs_tasklist_scope_violation(self, temp_repo): finally: orch.cleanup() + def test_mechanical_checks_allows_compacted_completed_summary_update(self, temp_repo): + """mechanical_checks accepts compacted tasklists that consume only the selected task.""" + from millstone.runtime.orchestrator import POLICY_FILE_NAME, WORK_DIR_NAME + + config_dir = temp_repo / WORK_DIR_NAME + config_dir.mkdir(exist_ok=True) + policy_file = config_dir / POLICY_FILE_NAME + policy_file.write_text(""" +[limits] +max_loc_per_task = 10000 + +[tasklist] +enforce_single_task = true +""") + + orch = Orchestrator() + try: + tasklist_path = temp_repo / ".millstone" / "tasklist.md" + baseline = ( + "# Tasklist\n\n" + "## Completed\n\n" + "Earlier work is summarized here.\n\n" + "## Remaining\n\n" + "- [ ] Task 1: Do something\n" + "- [ ] Task 2: Do another thing\n" + ) + tasklist_path.write_text(baseline) + orch._tasklist_baseline = baseline + + tasklist_path.write_text( + "# Tasklist\n\n" + "## Completed\n\n" + "Earlier work is summarized here. Task 1 is now complete.\n\n" + "## Remaining\n\n" + "- [ ] Task 2: Do another thing\n" + ) + + result = orch.mechanical_checks() + assert result is True + finally: + orch.cleanup() + def test_policy_violation_logged(self, temp_repo): """Policy violations are logged with specific rule info.""" from millstone.runtime.orchestrator import POLICY_FILE_NAME, WORK_DIR_NAME diff --git a/tests/test_tasklist.py b/tests/test_tasklist.py index 88f07ba..7953369 100644 --- a/tests/test_tasklist.py +++ b/tests/test_tasklist.py @@ -184,3 +184,61 @@ def test_extract_current_task_no_file(self, temp_repo): mgr = TasklistManager(repo_dir=temp_repo) result = mgr.extract_current_task_acceptance_criteria() assert result == [] + + +class TestSingleTaskCompletionValidation: + def test_allows_first_unchecked_to_move_into_completed_summary(self, temp_repo): + mgr = TasklistManager(repo_dir=temp_repo) + original = ( + "# Tasklist\n\n" + "## Completed\n\n" + "Earlier work is summarized here.\n\n" + "## Remaining\n\n" + "- [ ] Task 1\n" + "- [ ] Task 2\n" + ) + new = ( + "# Tasklist\n\n" + "## Completed\n\n" + "Earlier work is summarized here. Task 1 is now complete.\n\n" + "## Remaining\n\n" + "- [ ] Task 2\n" + ) + + valid, reason = mgr.validate_single_task_completion(original, new) + + assert valid is True + assert reason == "" + + def test_rejects_task_removal_without_completed_summary(self, temp_repo): + mgr = TasklistManager(repo_dir=temp_repo) + original = "# Tasklist\n\n- [ ] Task 1\n- [ ] Task 2\n" + new = "# Tasklist\n\n- [ ] Task 2\n" + + valid, reason = mgr.validate_single_task_completion(original, new) + + assert valid is False + assert "Task count decreased" in reason + + def test_rejects_removing_later_task_from_completed_summary_mode(self, temp_repo): + mgr = TasklistManager(repo_dir=temp_repo) + original = ( + "# Tasklist\n\n" + "## Completed\n\n" + "Earlier work is summarized here.\n\n" + "## Remaining\n\n" + "- [ ] Task 1\n" + "- [ ] Task 2\n" + ) + new = ( + "# Tasklist\n\n" + "## Completed\n\n" + "Earlier work is summarized here. Task 2 is now complete.\n\n" + "## Remaining\n\n" + "- [ ] Task 1\n" + ) + + valid, reason = mgr.validate_single_task_completion(original, new) + + assert valid is False + assert "Task count decreased" in reason From 182f6ca721112e70ed507f56b51df55386e08899 Mon Sep 17 00:00:00 2001 From: wittekin <6079900+wittekin@users.noreply.github.com> Date: Thu, 2 Apr 2026 12:14:52 -0700 Subject: [PATCH 2/2] Release v0.6.4 Gate direct MCP task completion on C2 transactional capability and fix MCP release docs. --- CHANGELOG.md | 6 ++ docs/providers/mcp.md | 4 +- pyproject.toml | 2 +- src/millstone/runtime/orchestrator.py | 21 ++++++- tests/test_orchestrator.py | 85 ++++++++++++++++++++++++++- 5 files changed, 112 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14617ea..ca71092 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,12 @@ The format is based on Keep a Changelog and this project follows Semantic Versio ## [Unreleased] +## [0.6.4] - 2026-04-02 + +### Fixed +- MCP-backed task completion finalization now only performs a direct provider-side status update when the active profile permits `C2_remote_bounded` transactional effects, avoiding false task failures under the default `C1_local_write` profile while preserving explicit remote close-out for C2-capable runs. +- Corrected MCP provider documentation to reflect the actual default profile behavior for direct provider-side remote writes. + ## [0.6.3] - 2026-04-02 ### Fixed diff --git a/docs/providers/mcp.md b/docs/providers/mcp.md index 97ec478..7608ba6 100644 --- a/docs/providers/mcp.md +++ b/docs/providers/mcp.md @@ -144,7 +144,9 @@ Any MCP server your agent supports works — the `mcp_server` value is passed th ## Effect policy -All MCP operations are classified as `EffectClass.transactional` (C2). Under the default `DEV_IMPLEMENTATION` profile these are permitted. If you have a stricter policy configured (e.g. `C1_LOCAL_WRITE` only), MCP operations will raise `CapabilityViolation` — update `.millstone/policy.toml` to allow `C2_REMOTE_BOUNDED` effects. +Millstone-classified MCP write operations are `EffectClass.transactional` (`C2_remote_bounded`). The default `dev_implementation` profile is `C1_local_write`, so direct provider writes that millstone issues itself are not permitted unless you configure a `C2_remote_bounded` profile with `transactional` effects allowlisted. + +In the default profile, prompt-driven MCP actions performed by the coding agent can still run because the agent is executing those tool calls from the rendered prompt, not through millstone's direct provider-effect path. Orchestrator-driven follow-up writes, such as explicit post-commit remote task completion, are skipped unless the active profile allows those transactional effects. --- diff --git a/pyproject.toml b/pyproject.toml index ffed1dd..886a231 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "millstone" -version = "0.6.3" +version = "0.6.4" description = "Orchestrator for agentic coding tools" readme = "README.md" requires-python = ">=3.10" diff --git a/src/millstone/runtime/orchestrator.py b/src/millstone/runtime/orchestrator.py index e59ce97..3ae3cc6 100755 --- a/src/millstone/runtime/orchestrator.py +++ b/src/millstone/runtime/orchestrator.py @@ -55,7 +55,7 @@ from millstone.loops.outer import OuterLoopManager from millstone.loops.registry_adapter import LoopRegistryAdapter from millstone.policy.capability import CapabilityPolicyGate, CapabilityTier -from millstone.policy.effects import EffectIntent, EffectPolicyGate, NoOpEffectProvider +from millstone.policy.effects import EffectClass, EffectIntent, EffectPolicyGate, NoOpEffectProvider from millstone.policy.schemas import ( ReviewDecision, ReviewStatus, @@ -2423,6 +2423,16 @@ def _ensure_tasklist_provider_author_callback(self) -> None: ): provider.set_agent_callback(lambda p, **k: self.run_agent(p, role="author", **k)) + def _can_finalize_remote_task_completion(self) -> tuple[bool, str | None]: + """Check whether the active profile may perform direct remote task completion.""" + try: + self._capability_gate.assert_permitted(CapabilityTier.C2_REMOTE_BOUNDED) + except Exception as exc: + return False, str(exc) + if EffectClass.transactional not in self.profile.permitted_effect_classes: + return False, "Effect class transactional is not in permitted_effect_classes" + return True, None + def _finalize_remote_task_completion(self) -> bool: """Ensure MCP-backed tasks are marked done after a successful task run.""" from millstone.artifact_providers.mcp import MCPTasklistProvider @@ -2434,6 +2444,15 @@ def _finalize_remote_task_completion(self) -> bool: if not isinstance(provider, MCPTasklistProvider): return True + allowed, reason = self._can_finalize_remote_task_completion() + if not allowed: + self.log( + "remote_task_completion_skipped", + task_id=self._current_task_id, + reason=reason, + ) + return True + try: self._ensure_tasklist_provider_author_callback() provider.update_task_status(self._current_task_id, TaskStatus.done) diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 0c3baf5..8939b6e 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -1944,7 +1944,23 @@ def test_run_single_task_research_mode_closes_mcp_task(self, temp_repo): stderr="", ) - orch = Orchestrator(tasklist="docs/tasklist.md", research=True, cli="claude") + profile = Profile( + id="test_c2_transactional_research", + name="C2 Transactional Research Profile", + role_aliases={"builder": "author"}, + capability_tier=CapabilityTier.C2_REMOTE_BOUNDED, + permitted_effect_classes=frozenset({EffectClass.transactional}), + ) + registry = ProfileRegistry() + registry.register(profile) + + with patch("millstone.runtime.orchestrator.ProfileRegistry", return_value=registry): + orch = Orchestrator( + tasklist="docs/tasklist.md", + research=True, + cli="claude", + profile=profile.id, + ) try: # Inject a mock MCP tasklist provider with a known remote task ID mock_mcp = MagicMock(spec=MCPTasklistProvider) @@ -1994,7 +2010,18 @@ def test_run_single_task_closes_mcp_task_on_success(self, temp_repo): from millstone.artifact_providers.mcp import MCPTasklistProvider from millstone.artifacts.models import TasklistItem, TaskStatus - orch = Orchestrator(tasklist="docs/tasklist.md", quiet=True) + profile = Profile( + id="test_c2_transactional", + name="C2 Transactional Profile", + role_aliases={"builder": "author"}, + capability_tier=CapabilityTier.C2_REMOTE_BOUNDED, + permitted_effect_classes=frozenset({EffectClass.transactional}), + ) + registry = ProfileRegistry() + registry.register(profile) + + with patch("millstone.runtime.orchestrator.ProfileRegistry", return_value=registry): + orch = Orchestrator(tasklist="docs/tasklist.md", profile=profile.id, quiet=True) try: mock_mcp = MagicMock(spec=MCPTasklistProvider) mock_mcp._agent_callback = None @@ -2033,7 +2060,18 @@ def test_run_single_task_fails_when_mcp_completion_update_fails(self, temp_repo) from millstone.artifact_providers.mcp import MCPTasklistProvider from millstone.artifacts.models import TasklistItem, TaskStatus - orch = Orchestrator(tasklist="docs/tasklist.md", quiet=True) + profile = Profile( + id="test_c2_transactional_failure", + name="C2 Transactional Failure Profile", + role_aliases={"builder": "author"}, + capability_tier=CapabilityTier.C2_REMOTE_BOUNDED, + permitted_effect_classes=frozenset({EffectClass.transactional}), + ) + registry = ProfileRegistry() + registry.register(profile) + + with patch("millstone.runtime.orchestrator.ProfileRegistry", return_value=registry): + orch = Orchestrator(tasklist="docs/tasklist.md", profile=profile.id, quiet=True) try: mock_mcp = MagicMock(spec=MCPTasklistProvider) mock_mcp._agent_callback = None @@ -2068,6 +2106,47 @@ def fake_run_agent(prompt, role="default", **kwargs): finally: orch.cleanup() + def test_run_single_task_skips_explicit_mcp_close_when_profile_disallows_remote_effects( + self, temp_repo + ): + """Default C1 profiles leave MCP completion to the builder prompt path.""" + from millstone.artifact_providers.mcp import MCPTasklistProvider + from millstone.artifacts.models import TasklistItem, TaskStatus + + orch = Orchestrator(tasklist="docs/tasklist.md", quiet=True) + try: + mock_mcp = MagicMock(spec=MCPTasklistProvider) + mock_mcp._agent_callback = None + task = TasklistItem(task_id="GH-42", title="Remote task", status=TaskStatus.todo) + mock_mcp.get_prompt_placeholders.return_value = {} + mock_mcp.list_tasks.return_value = [task] + mock_mcp.get_task.return_value = task + orch._outer_loop_manager.tasklist_provider = mock_mcp + + def fake_run_agent(prompt, role="default", **kwargs): + if role == "author": + (temp_repo / "impl.py").write_text("value = 1\n") + return "Implemented remote task." + if role == "reviewer": + return ( + '{"status":"APPROVED","review":"ok","summary":"ok",' + '"findings":[],"findings_by_severity":{"critical":[],"high":[],' + '"medium":[],"low":[],"nit":[]},"impossible_condition":null,' + '"tasklist_fix_recommendation":null}' + ) + raise AssertionError(f"Unexpected role: {role}") + + with ( + patch.object(orch, "run_agent", side_effect=fake_run_agent), + patch.object(orch, "sanity_check_impl", return_value=True), + patch.object(orch, "delegate_commit", return_value=True), + ): + assert orch.run_single_task() is True + + mock_mcp.update_task_status.assert_not_called() + finally: + orch.cleanup() + class TestIsApproved: """Tests for approval detection."""