diff --git a/.gitignore b/.gitignore index 224a7ea..3b6f705 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,11 @@ MultiTool.yaml /worker/** wrangler.toml wrangler.json + +# Scratch prompt location. +/prompt.md + +# storybloq local session state (roadmap/tickets ARE tracked) +.story/snapshots/ +.story/sessions/ +.story/status.json diff --git a/.story/.gitignore b/.story/.gitignore new file mode 100644 index 0000000..15056bf --- /dev/null +++ b/.story/.gitignore @@ -0,0 +1,3 @@ +snapshots/ +status.json +sessions/ diff --git a/.story/config.json b/.story/config.json new file mode 100644 index 0000000..d307352 --- /dev/null +++ b/.story/config.json @@ -0,0 +1,26 @@ +{ + "features": { + "handovers": true, + "issues": true, + "reviews": true, + "roadmap": true, + "tickets": true + }, + "language": "rust", + "project": "multitool", + "schemaVersion": 2, + "type": "cargo", + "version": 2, + "recipeOverrides": { + "stages": { + "TEST": { + "enabled": true, + "command": "cargo nextest run --workspace" + }, + "BUILD": { + "enabled": true, + "command": "cargo build --workspace" + } + } + } +} diff --git a/.story/handovers/2026-06-23-01-setup-mirror-linear-multitool-checks.md b/.story/handovers/2026-06-23-01-setup-mirror-linear-multitool-checks.md new file mode 100644 index 0000000..a45d8e9 --- /dev/null +++ b/.story/handovers/2026-06-23-01-setup-mirror-linear-multitool-checks.md @@ -0,0 +1,68 @@ +# Session handover — storybloq setup for MultiTool Checks + +## What this project is + +`.story/` was initialized for the **multitool** repo (Rust / cargo CLI, the `multi` +binary; mature v0.4.0 canary-deployment tool). The roadmap tracks the **MultiTool +Checks** feature — a new `multi check` subcommand that validates declared +non-functional ("ility") requirements by running AI-agent checks inside copy-on-write +sandboxes and collecting verdicts through an in-process `rmcp` MCP server. Active +branch: `robbie/mt-check`. The PRD lives at `prompt.md` (gitignored scratch file). 
+ +## How this roadmap was created + +This storybloq roadmap is a **one-to-one mirror of the existing Linear project +"MultiTool Checks"** (team MULTI, project id 781c5b95-…). It was NOT independently +designed — Robbie had already authored the Linear project + tickets from `prompt.md` +and asked to mirror them. + +- **Linear milestones → storybloq phases** (9): M0 · Subcommand skeleton, M1 · + Discovery, M2 · Configuration & executor, M3 · Sandboxing, M4 · MCP result server, + M5 · Execution, M6 · Reporting & exit, Tests & docs, Future work (post-MVP). +- **Linear issues → storybloq tickets** (34, one-to-one): MULTI-1331..MULTI-1364. + Created in Linear-numeric order so T-001..T-034 line up with the issue order. +- **Each ticket's description carries its Linear ID, URL, and `gitBranchName`** + (`robbie/multi-XXXX`, verbatim from the Linear API). Pushing that branch to GitHub + auto-links the Linear issue. The branch is the linkage mechanism — storybloq has no + dedicated Linear field, so it lives in the description footer. +- **Epic preserved**: MULTI-1332 (the in-process MCP server) = **T-013**, a `feature`; + its 5 Linear sub-issues (MULTI-1344..1348) = T-014..T-018, set as storybloq + sub-tickets (`parentTicket: T-013`). + +## Ticket-number map (storybloq ↔ Linear) + +T-001→1331 · T-002→1333 · T-003→1334 · T-004→1335 · T-005→1336 · T-006→1337 · +T-007→1338 · T-008→1339 · T-009→1340 · T-010→1341 · T-011→1342 · T-012→1343 · +T-013→1332(epic) · T-014→1344 · T-015→1345 · T-016→1346 · T-017→1347 · T-018→1348 · +T-019→1349 · T-020→1350 · T-021→1351 · T-022→1352 · T-023→1353 · T-024→1354 · +T-025→1355 · T-026→1356 · T-027→1357 · T-028→1358 · T-029→1359 · T-030→1360 · +T-031→1361 · T-032→1362 · T-033→1363 · T-034→1364. +(Stable map is the Linear ID in each ticket's description, not the T-number.) 
+ +## Type mapping convention + +Linear labels don't map cleanly to storybloq's task/feature/chore, so: MVP +implementation issues → `task`; the MCP epic (T-013) and all 8 post-MVP "Future +work" items → `feature`; the two test issues + the docs issue → `chore`. + +## Decisions captured this session + +- **Markdown parser = `comrak`** (NOT pulldown-cmark), per Robbie. Baked into T-004's + description AND written back to Linear MULTI-1335 (its description previously left + comrak/pulldown-cmark as open "candidates"; now a firm Decision section). + +## Config + +- `recipeOverrides.stages`: TEST = `cargo nextest run --workspace`, + BUILD = `cargo build --workspace`. WRITE_TESTS/VERIFY left off (CLI, no dev server). +- `.gitignore`: added `.story/snapshots/`, `.story/sessions/`, `.story/status.json` + (roadmap + tickets are tracked). + +## State / next steps + +- All 34 tickets are `open`; nothing started. Implementation order follows the phases + (M0 → M6, then Tests & docs; Future work is post-MVP backlog). +- Natural first ticket: **T-001** (wire up `multi check` subcommand + phase skeleton), + which everything else builds on. +- If new Linear issues are added to the project later, mirror them the same way + (one ticket, Linear ID + branch in the description, correct phase). diff --git a/.story/roadmap.json b/.story/roadmap.json new file mode 100644 index 0000000..dce72f1 --- /dev/null +++ b/.story/roadmap.json @@ -0,0 +1,76 @@ +{ + "blockers": [], + "date": "2026-06-23", + "phases": [ + { + "description": "Wire `multi check` into the CLI and stand up the phase-orchestration skeleton (configuration → discovery → execution → reporting) with error/exit plumbing. Mirrors the existing `Run` command. 
Linear milestone: M0 · Subcommand skeleton.", + "id": "m0-subcommand-skeleton", + "label": "M0", + "name": "M0 · Subcommand skeleton", + "summary": "Wire `multi check` + phase-orchestration skeleton" + }, + { + "description": "Discover `CHECKS.md` files, parse them to a Markdown AST (comrak), and extract the requirement/check model (anonymous checks + discovery-time validation). Produces the `Vec<Requirement>` consumed by execution. Linear milestone: M1 · Discovery.", + "id": "m1-discovery", + "label": "M1", + "name": "M1 · Discovery", + "summary": "Discover + parse CHECKS.md into Vec<Requirement>" + }, + { + "description": "The hardcoded-but-injected configuration phase and the boxed agent-executor trait abstraction, plus the concrete `claude -p` executor. Linear milestone: M2 · Configuration & executor.", + "id": "m2-configuration-executor", + "label": "M2", + "name": "M2 · Configuration & executor", + "summary": "Injected config + boxed CheckExecutor + claude -p" + }, + { + "description": "Per-check copy-on-write filesystem sandboxing. macOS-only for the MVP, behind a `cfg`-gated boxed trait so other OSes can be added later. Linear milestone: M3 · Sandboxing.", + "id": "m3-sandboxing", + "label": "M3", + "name": "M3 · Sandboxing", + "summary": "Per-check CoW sandbox (macOS, cfg-gated trait)" + }, + { + "description": "The trustworthy result-reporting guardrail: one in-process `rmcp` server on a localhost port (dedicated tokio task, never a subprocess) exposing the `report-check-result` tool across N per-check endpoints. Built as a parent epic with sub-issues.
Linear milestone: M4 · MCP result server.", + "id": "m4-mcp-result-server", + "label": "M4", + "name": "M4 · MCP result server", + "summary": "In-process rmcp result server, N per-check endpoints" + }, + { + "description": "The parallel execution phase: for each check, create a sandbox, dispatch the agent against its MCP endpoint, reconcile the reported result, and aggregate checks into per-requirement verdicts via logical AND. Linear milestone: M5 · Execution.", + "id": "m5-execution", + "label": "M5", + "name": "M5 · Execution", + "summary": "Parallel execution + reconcile + AND aggregation" + }, + { + "description": "Render results and set the process exit code: green/red requirement titles, failing checks in red with evidence, passing checks omitted; exit 0 iff all requirements satisfied, else 1. Linear milestone: M6 · Reporting & exit.", + "id": "m6-reporting-exit", + "label": "M6", + "name": "M6 · Reporting & exit", + "summary": "Colored report + exit code (0 pass / 1 fail)" + }, + { + "description": "Cross-cutting test infrastructure (a fake executor so the pipeline can run without invoking `claude`), unit tests for AST extraction, an end-to-end pipeline test, and user-facing docs. Linear milestone: Tests & docs.", + "id": "tests-docs", + "label": "TESTS", + "name": "Tests & docs", + "summary": "Fake executor, AST unit tests, E2E test, docs" + }, + { + "description": "Post-MVP work explicitly out of scope for the initial release, captured so nothing is lost. These are real tickets but carry no roadmap label. 
Linear milestone: Future work (post-MVP).", + "id": "future-work", + "label": "FUTURE", + "name": "Future work (post-MVP)", + "summary": "Post-MVP backlog (shell checks, more OSes, hooks…)" + }, + { + "description": "Initial project setup.", + "id": "p0", + "label": "PHASE 0", + "name": "Setup" + } + ], + "title": "multitool" +} diff --git a/.story/tickets/T-001.json b/.story/tickets/T-001.json new file mode 100644 index 0000000..9575d15 --- /dev/null +++ b/.story/tickets/T-001.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Introduce the `multi check` subcommand and the top-level orchestration skeleton the rest of the project fills in. Add a `CheckSubcommand` (clap `Args`) under `src/config/check/`, exported from `src/config/mod.rs`, starting with a working-directory arg (default `.`). Land the configuration → discovery → execution → reporting phases as stubs; mirrors the existing `Run` command (`src/cmd/run`, `src/config/run`).\n\n———\nMirrors Linear **MULTI-1331** · git branch `robbie/multi-1331`\nhttps://linear.app/wack-incorporated/issue/MULTI-1331/wire-up-the-multi-check-subcommand-and-phase-orchestration-skeleton", + "id": "T-001", + "order": 10, + "phase": "m0-subcommand-skeleton", + "status": "open", + "title": "Wire up the `multi check` subcommand and phase-orchestration skeleton", + "type": "task" +} diff --git a/.story/tickets/T-002.json b/.story/tickets/T-002.json new file mode 100644 index 0000000..bd0f97e --- /dev/null +++ b/.story/tickets/T-002.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Define the in-memory representation produced by discovery and consumed by execution: `Requirement { filepath: PathBuf, title: String, checks: Vec<Check> }` and `Check { title: String, prompt: String }`. Titles are NOT unique across the set; requirements/checks are grouped by declaring file.
`checks` is guaranteed non-empty after M1 validation.\n\n———\nMirrors Linear **MULTI-1333** · git branch `robbie/multi-1333`\nhttps://linear.app/wack-incorporated/issue/MULTI-1333/define-the-requirement-and-check-domain-types", + "id": "T-002", + "order": 10, + "phase": "m1-discovery", + "status": "open", + "title": "Define the `Requirement` and `Check` domain types", + "type": "task" +} diff --git a/.story/tickets/T-003.json b/.story/tickets/T-003.json new file mode 100644 index 0000000..170732b --- /dev/null +++ b/.story/tickets/T-003.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Recursively scan the working directory (and subdirectories) for files named exactly `CHECKS.md`, producing the list of files to parse. Use the `ignore` crate (already a dependency — the ripgrep walker) for a fast, parallel, gitignore-aware walk rooted at the working directory. Default to respecting `.gitignore`; document the decision.\n\n———\nMirrors Linear **MULTI-1334** · git branch `robbie/multi-1334`\nhttps://linear.app/wack-incorporated/issue/MULTI-1334/recursively-discover-checksmd-files-from-the-working-directory", + "id": "T-003", + "order": 20, + "phase": "m1-discovery", + "status": "open", + "title": "Recursively discover `CHECKS.md` files from the working directory", + "type": "task" +} diff --git a/.story/tickets/T-004.json b/.story/tickets/T-004.json new file mode 100644 index 0000000..e636307 --- /dev/null +++ b/.story/tickets/T-004.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Parse each discovered `CHECKS.md` into a Markdown AST that the extraction step walks. **Decision: use `comrak`** (CommonMark, AST node tree) — NOT pulldown-cmark — because its node tree is the easier fit for finding H1/H2 headers and capturing the text between them. 
Add `comrak` as a dependency (none present today).\n\n———\nMirrors Linear **MULTI-1335** · git branch `robbie/multi-1335`\nhttps://linear.app/wack-incorporated/issue/MULTI-1335/parse-checksmd-into-a-markdown-ast", + "id": "T-004", + "order": 30, + "phase": "m1-discovery", + "status": "open", + "title": "Parse `CHECKS.md` into a Markdown AST", + "type": "task" +} diff --git a/.story/tickets/T-005.json b/.story/tickets/T-005.json new file mode 100644 index 0000000..0153bf8 --- /dev/null +++ b/.story/tickets/T-005.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Walk a single file's AST and produce its `Requirement`s with their `Check`s. Requirement = an H1 whose text matches `^(Requirement|Req) ` (`Req` is an alias); the remainder of the line is the title. Check = an H2 whose text matches `^Check `, associated with the nearest preceding requirement; the Markdown beneath it is the prompt.\n\n———\nMirrors Linear **MULTI-1336** · git branch `robbie/multi-1336`\nhttps://linear.app/wack-incorporated/issue/MULTI-1336/extract-requirements-and-checks-by-walking-the-markdown-ast", + "id": "T-005", + "order": 40, + "phase": "m1-discovery", + "status": "open", + "title": "Extract requirements and checks by walking the Markdown AST", + "type": "task" +} diff --git a/.story/tickets/T-006.json b/.story/tickets/T-006.json new file mode 100644 index 0000000..c8e70bb --- /dev/null +++ b/.story/tickets/T-006.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "When a requirement declares no `## Check`, its prose body becomes a single anonymous check that inherits the requirement's title. 
Produces one `Check { title: <requirement title>, prompt: <requirement prose> }`.\n\n———\nMirrors Linear **MULTI-1337** · git branch `robbie/multi-1337`\nhttps://linear.app/wack-incorporated/issue/MULTI-1337/infer-anonymous-checks-from-requirement-prose", + "id": "T-006", + "order": 50, + "phase": "m1-discovery", + "status": "open", + "title": "Infer anonymous checks from requirement prose", + "type": "task" +} diff --git a/.story/tickets/T-007.json b/.story/tickets/T-007.json new file mode 100644 index 0000000..0adee6c --- /dev/null +++ b/.story/tickets/T-007.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Validate the extracted model and assemble the final `Vec<Requirement>` discovery returns. Errors (not warnings): orphan check — a `## Check` with no preceding requirement in its file; checkless requirement — a requirement with no explicit checks AND no prose to promote to an anonymous check. Surface as miette diagnostics. Joins the parallel per-file parse results.\n\n———\nMirrors Linear **MULTI-1338** · git branch `robbie/multi-1338`\nhttps://linear.app/wack-incorporated/issue/MULTI-1338/validate-discovery-results-and-assemble-the-requirement-set", + "id": "T-007", + "order": 60, + "phase": "m1-discovery", + "status": "open", + "title": "Validate discovery results and assemble the requirement set", + "type": "task" +} diff --git a/.story/tickets/T-008.json b/.story/tickets/T-008.json new file mode 100644 index 0000000..edfee22 --- /dev/null +++ b/.story/tickets/T-008.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Define the abstraction over \"run one check's agent\" — the seam that lets us swap `claude -p` for a Claude Code SDK (or other providers) later without touching execution.
Per the spec it MUST be a boxed `#[async_trait]` trait object for dynamic dispatch, following the repo convention (`BoxedIngress`/`BoxedMonitor`/`BoxedPlatform`).\n\n———\nMirrors Linear **MULTI-1339** · git branch `robbie/multi-1339`\nhttps://linear.app/wack-incorporated/issue/MULTI-1339/define-the-boxed-agent-executor-trait-checkexecutor", + "id": "T-008", + "order": 10, + "phase": "m2-configuration-executor", + "status": "open", + "title": "Define the boxed agent-executor trait (`CheckExecutor`)", + "type": "task" +} diff --git a/.story/tickets/T-009.json b/.story/tickets/T-009.json new file mode 100644 index 0000000..3650e69 --- /dev/null +++ b/.story/tickets/T-009.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "The concrete `CheckExecutor` for the MVP: shell out to the Claude Code CLI via `claude -p` with the assembled prompt/instructions, the `haiku` model family, and `--mcp-config` pointing at this check's dedicated endpoint (payload from M4). Run with the sandbox directory (M3) as the working directory; capture exit status/stderr into an `AgentOutcome`.\n\n———\nMirrors Linear **MULTI-1340** · git branch `robbie/multi-1340`\nhttps://linear.app/wack-incorporated/issue/MULTI-1340/implement-the-claude-p-claude-code-executor", + "id": "T-009", + "order": 20, + "phase": "m2-configuration-executor", + "status": "open", + "title": "Implement the `claude -p` Claude Code executor", + "type": "task" +} diff --git a/.story/tickets/T-010.json b/.story/tickets/T-010.json new file mode 100644 index 0000000..d2b353b --- /dev/null +++ b/.story/tickets/T-010.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "The configuration phase: hardcoded for the MVP (no env/file loading) but its values are dependency-injected forward rather than read at point of use, so swapping providers later is a config change not a rewrite. 
Define a `Config` carrying provider/model-provider URL, model (default the `haiku` family), and effort; construct the concrete `BoxedExecutor` from it and inject into execution.\n\n———\nMirrors Linear **MULTI-1341** · git branch `robbie/multi-1341`\nhttps://linear.app/wack-incorporated/issue/MULTI-1341/implement-the-hardcoded-configuration-phase-and-dependency-injection", + "id": "T-010", + "order": 30, + "phase": "m2-configuration-executor", + "status": "open", + "title": "Implement the hardcoded configuration phase and dependency injection", + "type": "task" +} diff --git a/.story/tickets/T-011.json b/.story/tickets/T-011.json new file mode 100644 index 0000000..9d10e92 --- /dev/null +++ b/.story/tickets/T-011.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Each check runs inside a copy-on-write filesystem sandbox so an agent can read/modify the tree freely without corrupting the real working directory. Define a boxed `#[async_trait] Sandbox: Send + Sync` trait (mirroring `BoxedIngress` etc.) plus `cfg`-gated platform selection. The macOS implementation is a separate ticket.\n\n———\nMirrors Linear **MULTI-1342** · git branch `robbie/multi-1342`\nhttps://linear.app/wack-incorporated/issue/MULTI-1342/define-the-copy-on-write-sandbox-trait-with-cfg-gated-platform", + "id": "T-011", + "order": 10, + "phase": "m3-sandboxing", + "status": "open", + "title": "Define the copy-on-write sandbox trait with `cfg`-gated platform selection", + "type": "task" +} diff --git a/.story/tickets/T-012.json b/.story/tickets/T-012.json new file mode 100644 index 0000000..23c0e8e --- /dev/null +++ b/.story/tickets/T-012.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Implement the `Sandbox` trait on macOS using APFS copy-on-write cloning so each check gets a near-instant, space-efficient clone of the working tree. 
Prefer `libc::clonefile(2)` (`libc` already a dependency) for a direct `clonefile`/`clonefileat` call over `cp -c`. Place clones under a temp root; clean up on drop.\n\n———\nMirrors Linear **MULTI-1343** · git branch `robbie/multi-1343`\nhttps://linear.app/wack-incorporated/issue/MULTI-1343/implement-the-macos-apfs-copy-on-write-sandbox", + "id": "T-012", + "order": 20, + "phase": "m3-sandboxing", + "status": "open", + "title": "Implement the macOS APFS copy-on-write sandbox", + "type": "task" +} diff --git a/.story/tickets/T-013.json b/.story/tickets/T-013.json new file mode 100644 index 0000000..c801a97 --- /dev/null +++ b/.story/tickets/T-013.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Epic. The result-reporting guardrail. Because agents are nondeterministic, do NOT trust stdout or sentinel files — every agent reports its verdict by calling a single MCP tool, `report-check-result`, served by ONE in-process MCP server built with `rmcp`, bound to a localhost port, running on a dedicated tokio task within the CLI process (never a subprocess). Decomposed into the sub-tickets in this phase.\n\n———\nMirrors Linear **MULTI-1332** (parent epic) · git branch `robbie/multi-1332`\nhttps://linear.app/wack-incorporated/issue/MULTI-1332/in-process-mcp-result-reporting-server", + "id": "T-013", + "order": 10, + "phase": "m4-mcp-result-server", + "status": "open", + "title": "In-process MCP result-reporting server (epic)", + "type": "feature" +} diff --git a/.story/tickets/T-014.json b/.story/tickets/T-014.json new file mode 100644 index 0000000..a907b02 --- /dev/null +++ b/.story/tickets/T-014.json @@ -0,0 +1,13 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Define the single MCP tool every agent uses to report its verdict, and the Rust types behind it. 
`report-check-result`: `success: bool` (required — true = satisfied), `evidence?: String` (optional — explanation of how the agent concluded). Define the serde input schema and the corresponding server-side result type.\n\n———\nMirrors Linear **MULTI-1344** (sub-issue of MULTI-1332) · git branch `robbie/multi-1344`\nhttps://linear.app/wack-incorporated/issue/MULTI-1344/define-the-report-check-result-tool-contract-and-result-types", + "id": "T-014", + "order": 20, + "parentTicket": "T-013", + "phase": "m4-mcp-result-server", + "status": "open", + "title": "Define the `report-check-result` tool contract and result types", + "type": "task" +} diff --git a/.story/tickets/T-015.json b/.story/tickets/T-015.json new file mode 100644 index 0000000..f7396a8 --- /dev/null +++ b/.story/tickets/T-015.json @@ -0,0 +1,13 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Stand up the single MCP server that receives `report-check-result` calls, using the `rmcp` framework (add as a dependency; not present today). Bind ONE server to a localhost port (ephemeral/OS-assigned is fine; capture the chosen port for endpoint URLs). 
Run it on a dedicated tokio task spawned on the CLI's runtime.\n\n———\nMirrors Linear **MULTI-1345** (sub-issue of MULTI-1332) · git branch `robbie/multi-1345`\nhttps://linear.app/wack-incorporated/issue/MULTI-1345/stand-up-the-in-process-rmcp-server-on-a-localhost-port", + "id": "T-015", + "order": 30, + "parentTicket": "T-013", + "phase": "m4-mcp-result-server", + "status": "open", + "title": "Stand up the in-process `rmcp` server on a localhost port", + "type": "task" +} diff --git a/.story/tickets/T-016.json b/.story/tickets/T-016.json new file mode 100644 index 0000000..bc6ea0f --- /dev/null +++ b/.story/tickets/T-016.json @@ -0,0 +1,13 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "The one server hosts N endpoints — one per check — so each agent has a unique URL to write its singleton result to. Expose N distinct endpoints (e.g. URL paths like `/checks/{check_id}`) on the single server/port. Enforce single-call semantics (each endpoint expects exactly one `report-check-result` from its associated agent) and plumb each result back over a channel.\n\n———\nMirrors Linear **MULTI-1346** (sub-issue of MULTI-1332) · git branch `robbie/multi-1346`\nhttps://linear.app/wack-incorporated/issue/MULTI-1346/build-n-per-check-endpoints-with-single-call-semantics-and-a-result", + "id": "T-016", + "order": 40, + "parentTicket": "T-013", + "phase": "m4-mcp-result-server", + "status": "open", + "title": "Build N per-check endpoints with single-call semantics and a result channel", + "type": "task" +} diff --git a/.story/tickets/T-017.json b/.story/tickets/T-017.json new file mode 100644 index 0000000..6b9077e --- /dev/null +++ b/.story/tickets/T-017.json @@ -0,0 +1,13 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Own the server's lifetime so it runs only as long as needed, then returns control to the main task. 
Normal shutdown: once all N endpoints have received their tool call, signal completion, stop the server task, and hand the collected `CheckReport`s back. Missing reports: an agent may exit without ever calling `report-check-result` (crash/timeout) — treat as an unreported/failed check rather than hanging forever.\n\n———\nMirrors Linear **MULTI-1347** (sub-issue of MULTI-1332) · git branch `robbie/multi-1347`\nhttps://linear.app/wack-incorporated/issue/MULTI-1347/manage-server-lifecycle-shutdown-after-all-report-handle-missing", + "id": "T-017", + "order": 50, + "parentTicket": "T-013", + "phase": "m4-mcp-result-server", + "status": "open", + "title": "Manage server lifecycle: shutdown after all report, handle missing reports", + "type": "task" +} diff --git a/.story/tickets/T-018.json b/.story/tickets/T-018.json new file mode 100644 index 0000000..31d0db2 --- /dev/null +++ b/.story/tickets/T-018.json @@ -0,0 +1,13 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Produce, per check, the `--mcp-config` JSON payload that points each agent at ITS endpoint, so `claude -p` connects to the right per-check URL. Declare the `report-check-result` MCP server at that check's localhost endpoint (host + port from the server, path from the per-check endpoint). 
Decide delivery (a temp config file path passed to `--mcp-config`, or inline).\n\n———\nMirrors Linear **MULTI-1348** (sub-issue of MULTI-1332) · git branch `robbie/multi-1348`\nhttps://linear.app/wack-incorporated/issue/MULTI-1348/generate-the-per-check-mcp-config-payload-for-claude-p", + "id": "T-018", + "order": 60, + "parentTicket": "T-013", + "phase": "m4-mcp-result-server", + "status": "open", + "title": "Generate the per-check `--mcp-config` payload for `claude -p`", + "type": "task" +} diff --git a/.story/tickets/T-019.json b/.story/tickets/T-019.json new file mode 100644 index 0000000..0dbb9aa --- /dev/null +++ b/.story/tickets/T-019.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "The heart of the execution phase: run every check in parallel, each in its own sandbox, against its own MCP endpoint, and collect the reported results. Per check: (1) create a CoW sandbox of the working tree (M3 `BoxedSandbox`); (2) resolve the dedicated MCP endpoint + `--mcp-config` (M4); (3) dispatch the injected `BoxedExecutor` (M2) with instructions + prompt (M5).\n\n———\nMirrors Linear **MULTI-1349** · git branch `robbie/multi-1349`\nhttps://linear.app/wack-incorporated/issue/MULTI-1349/implement-the-parallel-execution-orchestrator", + "id": "T-019", + "order": 10, + "phase": "m5-execution", + "status": "open", + "title": "Implement the parallel execution orchestrator", + "type": "task" +} diff --git a/.story/tickets/T-020.json b/.story/tickets/T-020.json new file mode 100644 index 0000000..de32ea6 --- /dev/null +++ b/.story/tickets/T-020.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Build the instruction text handed to each agent: standing operating instructions (verify the requirement described by the prompt, working in the current directory — the sandbox — and you MUST call the `report-check-result` tool exactly once with your 
verdict) plus the check's prompt. The instructions are what make the unreliable agent reliably report.\n\n———\nMirrors Linear **MULTI-1350** · git branch `robbie/multi-1350`\nhttps://linear.app/wack-incorporated/issue/MULTI-1350/assemble-agent-instructions-and-bind-the-report-tool", + "id": "T-020", + "order": 20, + "phase": "m5-execution", + "status": "open", + "title": "Assemble agent instructions and bind the report tool", + "type": "task" +} diff --git a/.story/tickets/T-021.json b/.story/tickets/T-021.json new file mode 100644 index 0000000..7f86320 --- /dev/null +++ b/.story/tickets/T-021.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Turn raw signals into a per-requirement verdict. Reconcile each check: the authoritative verdict is the MCP-reported `success` (M4), NOT the agent exit code — reported true → satisfied (carry evidence); reported false → failed (carry evidence); no report (agent died) → failed. Then apply logical AND across the checks within each requirement to mark it satisfied/unsatisfied.\n\n———\nMirrors Linear **MULTI-1351** · git branch `robbie/multi-1351`\nhttps://linear.app/wack-incorporated/issue/MULTI-1351/reconcile-agent-results-and-aggregate-requirement-verdicts-logical-and", + "id": "T-021", + "order": 30, + "phase": "m5-execution", + "status": "open", + "title": "Reconcile agent results and aggregate requirement verdicts (logical AND)", + "type": "task" +} diff --git a/.story/tickets/T-022.json b/.story/tickets/T-022.json new file mode 100644 index 0000000..b7523b4 --- /dev/null +++ b/.story/tickets/T-022.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Render the run's results to the terminal per the spec. 
Print each requirement title green if satisfied, red if not; for a failed requirement also print its failing checks in red, including the agent's `evidence`; passing checks are omitted (only the green requirement title shows). Route output through the existing `Terminal` and honor global color settings.\n\n———\nMirrors Linear **MULTI-1352** · git branch `robbie/multi-1352`\nhttps://linear.app/wack-incorporated/issue/MULTI-1352/implement-the-reporting-phase-colored-requirementcheck-output", + "id": "T-022", + "order": 10, + "phase": "m6-reporting-exit", + "status": "open", + "title": "Implement the reporting phase (colored requirement/check output)", + "type": "task" +} diff --git a/.story/tickets/T-023.json b/.story/tickets/T-023.json new file mode 100644 index 0000000..a68a210 --- /dev/null +++ b/.story/tickets/T-023.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Map the aggregate result to the process exit code so `multi check` is CI-usable. One or more requirements unsatisfied → exit 1. All requirements satisfied (including the trivial empty-suite case) → exit 0. 
Thread the exit code out of the reporting phase through `Check::dispatch()` to the process (the CLI's dispatch currently returns `miette::Result<()>`; choose a clean mapping).\n\n———\nMirrors Linear **MULTI-1353** · git branch `robbie/multi-1353`\nhttps://linear.app/wack-incorporated/issue/MULTI-1353/set-the-process-exit-code-from-the-aggregate-result", + "id": "T-023", + "order": 20, + "phase": "m6-reporting-exit", + "status": "open", + "title": "Set the process exit code from the aggregate result", + "type": "task" +} diff --git a/.story/tickets/T-024.json b/.story/tickets/T-024.json new file mode 100644 index 0000000..6b4c89e --- /dev/null +++ b/.story/tickets/T-024.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Make the pipeline testable without invoking real agents or touching the real filesystem, and lock down the parser/extractor with unit tests. Fake executor: a `CheckExecutor` test double returning scripted `AgentOutcome`s / driving scripted MCP reports so M5/M6 can be tested deterministically (repo already depends on `mockall`). Unit tests for AST extraction: requirements, explicit + anonymous checks, and edge cases.\n\n———\nMirrors Linear **MULTI-1354** · git branch `robbie/multi-1354`\nhttps://linear.app/wack-incorporated/issue/MULTI-1354/fake-executor-unit-tests-for-ast-extraction", + "id": "T-024", + "order": 10, + "phase": "tests-docs", + "status": "open", + "title": "Fake executor + unit tests for AST extraction", + "type": "chore" +} diff --git a/.story/tickets/T-025.json b/.story/tickets/T-025.json new file mode 100644 index 0000000..263538e --- /dev/null +++ b/.story/tickets/T-025.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "A black-box test that drives `multi check` over a fixture project and asserts discovery → execution → reporting → exit code, using the fake executor (no real `claude`). 
Fixture tree with multiple `CHECKS.md` files exercising: a satisfied requirement, a failed requirement (one failing check), a multi-check requirement (AND), and an anonymous check. Inject the fake executor/sandbox so verdicts are scripted.\n\n———\nMirrors Linear **MULTI-1355** · git branch `robbie/multi-1355`\nhttps://linear.app/wack-incorporated/issue/MULTI-1355/end-to-end-pipeline-integration-test-fake-executor", + "id": "T-025", + "order": 20, + "phase": "tests-docs", + "status": "open", + "title": "End-to-end pipeline integration test (fake executor)", + "type": "chore" +} diff --git a/.story/tickets/T-026.json b/.story/tickets/T-026.json new file mode 100644 index 0000000..4955f66 --- /dev/null +++ b/.story/tickets/T-026.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "User-facing documentation for `multi check`. Usage: what it does, how to run it (working directory, exit codes for CI), and MVP constraints (macOS-only sandboxing, `prompt`-type checks via `claude -p` + Haiku, hardcoded config). Authoring `CHECKS.md`: the format with examples — `# Requirement <title>` / `# Req <title>`, `## Check <title>`, prompts, prose, and anonymous checks.\n\n———\nMirrors Linear **MULTI-1356** · git branch `robbie/multi-1356`\nhttps://linear.app/wack-incorporated/issue/MULTI-1356/docs-multi-check-usage-and-checksmd-authoring-guide", + "id": "T-026", + "order": 30, + "phase": "tests-docs", + "status": "open", + "title": "Docs: `multi check` usage and `CHECKS.md` authoring guide", + "type": "chore" +} diff --git a/.story/tickets/T-027.json b/.story/tickets/T-027.json new file mode 100644 index 0000000..d1d90f3 --- /dev/null +++ b/.story/tickets/T-027.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Post-MVP. Add a `shell` check `type` alongside the default `prompt` type. 
The check body is a shell script; a zero exit status = success, non-zero = failure; no agent involved. Introduces check `type` selection (a header attribute or fenced metadata) and a non-agent execution path that bypasses the MCP reporting guardrail (the script's exit status IS the report).\n\n———\nMirrors Linear **MULTI-1357** · git branch `robbie/multi-1357`\nhttps://linear.app/wack-incorporated/issue/MULTI-1357/support-the-shell-check-type", + "id": "T-027", + "order": 10, + "phase": "future-work", + "status": "open", + "title": "Support the `shell` check type", + "type": "feature" +} diff --git a/.story/tickets/T-028.json b/.story/tickets/T-028.json new file mode 100644 index 0000000..f1c08f5 --- /dev/null +++ b/.story/tickets/T-028.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Post-MVP. Let a user override the model (and effort) on a per-check basis, so an expensive check can use a stronger model while the rest stay cheap. Needs per-check metadata (e.g. an attribute on `## Check`) flowing into executor selection, building on the configuration seam (M2) and the `CheckExecutor` abstraction.\n\n———\nMirrors Linear **MULTI-1358** · git branch `robbie/multi-1358`\nhttps://linear.app/wack-incorporated/issue/MULTI-1358/per-check-model-overrides", + "id": "T-028", + "order": 20, + "phase": "future-work", + "status": "open", + "title": "Per-check model overrides", + "type": "feature" +} diff --git a/.story/tickets/T-029.json b/.story/tickets/T-029.json new file mode 100644 index 0000000..d610dd3 --- /dev/null +++ b/.story/tickets/T-029.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Post-MVP. Replace the hardcoded configuration with real configuration loading: adjust the global default model/provider/effort via environment variables and/or a config file. 
M2 deliberately hardcodes config behind the injection seam; this adds the loading layer — env (the CLI already uses clap's `env` feature) and/or a file (the repo already uses `toml`/manifest patterns) — feeding the same `Config` the executor consumes.\n\n———\nMirrors Linear **MULTI-1359** · git branch `robbie/multi-1359`\nhttps://linear.app/wack-incorporated/issue/MULTI-1359/global-model-configuration-environment-file-loading", + "id": "T-029", + "order": 30, + "phase": "future-work", + "status": "open", + "title": "Global model configuration (environment / file loading)", + "type": "feature" +} diff --git a/.story/tickets/T-030.json b/.story/tickets/T-030.json new file mode 100644 index 0000000..948127c --- /dev/null +++ b/.story/tickets/T-030.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Post-MVP. Add a Linux implementation of the `Sandbox` trait (M3) so checks can run sandboxed on Linux. The MVP ships macOS-only (APFS `clonefile`) behind the `cfg`-gated trait. Linux CoW options to evaluate: btrfs/xfs reflink (`FICLONE` ioctl / `cp --reflink`) or overlayfs, with a copy fallback where reflinks aren't supported. Slots in as a new `Sandbox` impl + platform selection; no execution-phase changes expected.\n\n———\nMirrors Linear **MULTI-1360** · git branch `robbie/multi-1360`\nhttps://linear.app/wack-incorporated/issue/MULTI-1360/linux-copy-on-write-sandbox-support", + "id": "T-030", + "order": 40, + "phase": "future-work", + "status": "open", + "title": "Linux copy-on-write sandbox support", + "type": "feature" +} diff --git a/.story/tickets/T-031.json b/.story/tickets/T-031.json new file mode 100644 index 0000000..66843a1 --- /dev/null +++ b/.story/tickets/T-031.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Post-MVP. Add a Windows implementation of the `Sandbox` trait (M3). The MVP is macOS-only. 
Windows CoW options to evaluate: ReFS block cloning, or a junction/copy-based fallback on NTFS. Slots in as a new `Sandbox` impl behind the existing platform gating.\n\n———\nMirrors Linear **MULTI-1361** · git branch `robbie/multi-1361`\nhttps://linear.app/wack-incorporated/issue/MULTI-1361/windows-copy-on-write-sandbox-support", + "id": "T-031", + "order": 50, + "phase": "future-work", + "status": "open", + "title": "Windows copy-on-write sandbox support", + "type": "feature" +} diff --git a/.story/tickets/T-032.json b/.story/tickets/T-032.json new file mode 100644 index 0000000..147cd54 --- /dev/null +++ b/.story/tickets/T-032.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Post-MVP. An MCP server (exposed to the user's own agent/editor) with tools for writing and updating requirements/checks in `CHECKS.md` files, so requirements can be authored conversationally. Distinct from the MVP's internal `report-check-result` guardrail (one tool, in-process): this is a separate, user-facing server whose tools mutate `CHECKS.md` on disk — different concerns and lifecycle.\n\n———\nMirrors Linear **MULTI-1362** · git branch `robbie/multi-1362`\nhttps://linear.app/wack-incorporated/issue/MULTI-1362/mcp-server-for-writingupdating-requirements", + "id": "T-032", + "order": 60, + "phase": "future-work", + "status": "open", + "title": "MCP server for writing/updating requirements", + "type": "feature" +} diff --git a/.story/tickets/T-033.json b/.story/tickets/T-033.json new file mode 100644 index 0000000..5c6e84c --- /dev/null +++ b/.story/tickets/T-033.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Post-MVP. Emit check/requirement results to a remote HTTP endpoint (webhooks/hooks) so runs can feed dashboards, audit logs, or CI systems. 
The MVP reports only to the terminal + exit code; this adds an outbound reporting sink (`reqwest` is already a dependency) with a payload schema, target/auth config, and failure handling that doesn't block the run.\n\n———\nMirrors Linear **MULTI-1363** · git branch `robbie/multi-1363`\nhttps://linear.app/wack-incorporated/issue/MULTI-1363/remote-http-result-hooks", + "id": "T-033", + "order": 70, + "phase": "future-work", + "status": "open", + "title": "Remote HTTP result hooks", + "type": "feature" +} diff --git a/.story/tickets/T-034.json b/.story/tickets/T-034.json new file mode 100644 index 0000000..12f668d --- /dev/null +++ b/.story/tickets/T-034.json @@ -0,0 +1,12 @@ +{ + "blockedBy": [], + "completedDate": null, + "createdDate": "2026-06-23", + "description": "Post-MVP. Run checks continuously as monitors during interactive development — re-evaluating on file changes and surfacing live pass/fail status, rather than a single batch run. The MVP is a one-shot batch invocation; this introduces a watch loop (file-change triggers), incremental/affected-check evaluation, and a live UI/status surface — a substantial mode shift on top of the batch pipeline.\n\n———\nMirrors Linear **MULTI-1364** · git branch `robbie/multi-1364`\nhttps://linear.app/wack-incorporated/issue/MULTI-1364/checks-as-monitors-for-interactive-development", + "id": "T-034", + "order": 80, + "phase": "future-work", + "status": "open", + "title": "Checks as monitors for interactive development", + "type": "feature" +} diff --git a/CHECKS.md b/CHECKS.md new file mode 100644 index 0000000..41c7f20 --- /dev/null +++ b/CHECKS.md @@ -0,0 +1,69 @@ +> **Self-validation suite for MultiTool Checks.** Each requirement below is a non-functional ("-ility") property of the `multi check` implementation, validated by a `prompt`-type check — instructions for an AI agent inspecting this repository. Place this file at the repository root and run `multi check` to dogfood the tool on itself. 
+ +# Requirement Provider-Agnostic Execution + +The agent executor must be swappable so we can evolve from shelling out to `claude -p` toward a Claude Code SDK (or another provider) without rewriting the execution phase. This extensibility is the whole point of the configuration/executor seam. + +## Check Boxed Executor Trait + +Inspect the Rust sources for the `multi check` feature. Confirm that agent execution is defined behind a trait (for example `CheckExecutor`) that is consumed as a boxed, dynamically-dispatched trait object — a type alias of the form `Box<dyn CheckExecutor + Send + Sync>`, mirroring the existing `BoxedIngress`, `BoxedMonitor`, and `BoxedPlatform` aliases. The check passes only if the execution phase depends on this trait rather than naming a concrete `claude -p` executor type. Report a failure if the execution path references a concrete executor struct directly instead of the trait object. + +# Requirement Trustworthy In-Process Reporting + +Check verdicts must arrive through the single `report-check-result` MCP tool, served from inside the CLI process. This guardrail is what makes results reliable despite agent nondeterminism; running the server out-of-process, or trusting agent stdout, would defeat it. + +## Check Server Runs In-Process + +Inspect how the result-reporting MCP server is started. Confirm it is built with the `rmcp` framework and run on a Tokio task within the CLI process — not spawned as a child process. Search for any use of `std::process::Command` or `tokio::process` that would launch the server externally; if the MCP server runs as a subprocess, the check fails. It passes only if the server runs on a task in the same process as the CLI. + +## Check Verdict From Tool Call + +Confirm that a check's pass/fail verdict is derived from the `success` boolean of the `report-check-result` tool call, and never from the agent's stdout, stderr, or process exit code. 
Verify that a check whose agent exits without ever calling the tool is treated as a failure or error rather than silently passing. The check fails if the verdict is obtained by parsing stdout or by reading the agent's exit status. + +# Requirement Stateful MCP Sessions + +The Claude Code MCP client connects to the result-reporting server over Streamable HTTP and expects the standard *stateful* session flow: it sends `initialize`, receives an `Mcp-Session-Id`, and issues subsequent requests under that session. A stateless server stalls this multi-step handshake, so the in-process MCP server must run in stateful mode. + +## Check Server Is Stateful + +Inspect how the `rmcp` result-reporting MCP server is configured (its `StreamableHttpServerConfig`). Confirm the server runs in stateful Streamable HTTP mode — that is, `stateful_mode` is true and is not set to `false`. The check passes only if the server is configured for stateful sessions; it fails if `stateful_mode` is set to `false` (stateless mode). + +# Requirement Checks Cannot Corrupt the Workspace + +Every check runs against a copy-on-write clone of the working tree, so a misbehaving agent cannot mutate the user's real files. Both checks below must pass for this requirement to be satisfied. + +## Check Sandbox Exists and Is Platform-Gated + +Inspect the sandboxing code. Confirm there is a sandbox abstraction (a trait such as `Sandbox`) with a macOS copy-on-write implementation built on APFS `clonefile`, and that operating-system-specific code is selected with `cfg` attributes so the crate still compiles on non-macOS targets. The check fails if sandboxing is compiled unconditionally for a single operating system in a way that would break the build on other targets. + +## Check Execution Uses the Sandbox + +Confirm that the execution phase creates a sandbox for each check and runs the agent with the sandbox directory as its working directory, rather than executing the agent against the real working directory. 
The check fails if any check is executed outside of a sandbox. + +# Req Concurrent Check Execution + +(`Req` is the short alias for `Requirement`; this requirement uses it on purpose.) Checks are independent and each one spawns a potentially slow agent, so they must run concurrently rather than one after another. + +## Check Concurrent Dispatch + +Inspect the execution phase. Confirm that checks are dispatched concurrently — for example via a `JoinSet`, `FuturesUnordered`, or per-check `tokio::spawn` — and awaited together, rather than run inside a blocking loop that starts and awaits one check before beginning the next. The check fails if check execution is strictly sequential. + +# Requirement Authoring Errors Are Actionable + +When a `CHECKS.md` file is malformed, the tool must tell the author exactly what is wrong and where, instead of failing opaquely. Clear diagnostics are what make the format usable. + +## Check Diagnostics Name the File + +Confirm that discovery-time validation produces `miette` diagnostics for both malformed cases — a check with no associated requirement, and a requirement with neither an explicit check nor prose to promote into an anonymous check — and that each diagnostic identifies the offending `CHECKS.md` file. The check fails if either condition is unhandled, or if the diagnostic does not name the source file. + +# Requirement CI-Friendly Exit Status + +`multi check` must work as a CI gate: a clean pass exits zero, and any unsatisfied requirement exits non-zero. + +## Check Exit Code Reflects Result + +Confirm that the command exits with status code 0 when every requirement is satisfied (including the trivial case of an empty suite), and with status code 1 when one or more requirements are unsatisfied. The check fails if a run containing an unsatisfied requirement can exit 0, or if an all-satisfied run exits with a non-zero code. 
+ +# Requirement No Hidden Configuration + +For the MVP, the model, model-provider URL, and effort level are hardcoded and injected into the pipeline. Inspect the configuration phase of `multi check` and confirm these values are hardcoded, and that the discovery and execution phases do not read them from environment variables or from a configuration file. The requirement is satisfied only if configuration is hardcoded for the MVP; it is not satisfied if any model, provider, or effort value is sourced from the environment or a configuration file. diff --git a/Cargo.lock b/Cargo.lock index 69a48fe..caeb630 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -657,6 +657,54 @@ dependencies = [ "tracing", ] +[[package]] +name = "axum" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31b698c5f9a010f6573133b09e0de5408834d0c82f8d7475a89fc1867a71cd90" +dependencies = [ + "axum-core", + "bytes", + "futures-util", + "http 1.3.1", + "http-body 1.0.1", + "http-body-util", + "hyper 1.7.0", + "hyper-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "serde_core", + "serde_json", + "serde_path_to_error", + "sync_wrapper 1.0.2", + "tokio", + "tower", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1" +dependencies = [ + "bytes", + "futures-core", + "http 1.3.1", + "http-body 1.0.1", + "http-body-util", + "mime", + "pin-project-lite", + "sync_wrapper 1.0.2", + "tower-layer", + "tower-service", +] + [[package]] name = "backtrace" version = "0.3.76" @@ -784,7 +832,7 @@ version = "3.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77e9d642a7e3a318e37c2c9427b5a6a48aa1ad55dcd986f3034ab2239045a645" dependencies = [ - "darling", + "darling 0.21.3", "ident_case", "prettyplease", 
"proc-macro2", @@ -872,6 +920,15 @@ dependencies = [ "either", ] +[[package]] +name = "caseless" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b6fd507454086c8edfd769ca6ada439193cdb209c7681712ef6275cccbfe5d8" +dependencies = [ + "unicode-normalization", +] + [[package]] name = "cc" version = "1.2.44" @@ -905,6 +962,17 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "chacha20" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "rand_core 0.10.1", +] + [[package]] name = "chrono" version = "0.4.42" @@ -1024,6 +1092,23 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "comrak" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aac0b255932a9cd52fbfd664b67957f9f2e095ae4711cb0e41b4e291edef94c2" +dependencies = [ + "caseless", + "entities", + "finl_unicode", + "jetscii", + "phf", + "phf_codegen", + "rustc-hash", + "smallvec", + "typed-arena", +] + [[package]] name = "console" version = "0.15.11" @@ -1072,6 +1157,15 @@ dependencies = [ "libc", ] +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + [[package]] name = "crc32fast" version = "1.5.0" @@ -1149,8 +1243,18 @@ version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" dependencies = [ - "darling_core", - 
"darling_macro", + "darling_core 0.21.3", + "darling_macro 0.21.3", +] + +[[package]] +name = "darling" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" +dependencies = [ + "darling_core 0.23.0", + "darling_macro 0.23.0", ] [[package]] @@ -1167,13 +1271,37 @@ dependencies = [ "syn 2.0.109", ] +[[package]] +name = "darling_core" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" +dependencies = [ + "ident_case", + "proc-macro2", + "quote", + "strsim 0.11.1", + "syn 2.0.109", +] + [[package]] name = "darling_macro" version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" dependencies = [ - "darling_core", + "darling_core 0.21.3", + "quote", + "syn 2.0.109", +] + +[[package]] +name = "darling_macro" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" +dependencies = [ + "darling_core 0.23.0", "quote", "syn 2.0.109", ] @@ -1310,6 +1438,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "entities" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5320ae4c3782150d900b79807611a59a99fc9a1d61d686faafc24b93fc8d7ca" + [[package]] name = "equivalent" version = "1.0.2" @@ -1350,6 +1484,12 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127" +[[package]] +name = "finl_unicode" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9844ddc3a6e533d62bba727eb6c28b5d360921d5175e9ff0f1e621a5c590a4d5" + [[package]] name = "flate2" version = 
"1.1.5" @@ -1530,11 +1670,23 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "r-efi", + "r-efi 5.3.0", "wasip2", "wasm-bindgen", ] +[[package]] +name = "getrandom" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "300e883d756b2e4ec94e02791f39b04b522276138852cfc41d9fb7e904106099" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "rand_core 0.10.1", +] + [[package]] name = "gimli" version = "0.32.3" @@ -1768,6 +1920,7 @@ dependencies = [ "http 1.3.1", "http-body 1.0.1", "httparse", + "httpdate", "itoa", "pin-project-lite", "pin-utils", @@ -2066,6 +2219,12 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +[[package]] +name = "jetscii" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47f142fe24a9c9944451e8349de0a56af5f3e7226dc46f3ed4d4ecc0b85af75e" + [[package]] name = "jobserver" version = "0.1.34" @@ -2094,9 +2253,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "libc" -version = "0.2.177" +version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" [[package]] name = "libloading" @@ -2197,6 +2356,12 @@ dependencies = [ "regex-automata", ] +[[package]] +name = "matchit" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" + [[package]] name = "memchr" version = "2.7.6" @@ -2340,11 +2505,13 @@ dependencies = [ "aws-sdk-lambda", "aws-smithy-runtime-api", "aws-smithy-types", + "axum", "base64 0.22.1", "bigdecimal", "bon", "chrono", "clap 4.5.51", + "comrak", "console", "derive-getters", "dialoguer", @@ -2354,6 
+2521,7 @@ dependencies = [ "futures-util", "ignore", "indexmap 2.12.0", + "libc", "miette", "mockall", "multi-core", @@ -2363,6 +2531,8 @@ dependencies = [ "pretty_assertions", "rand 0.9.2", "reqwest 0.12.24", + "rmcp", + "schemars 1.1.0", "serde", "serde_json", "serde_json5", @@ -2615,6 +2785,12 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "pastey" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ee67f1008b1ba2321834326597b8e186293b049a023cdef258527550b9935b4" + [[package]] name = "percent-encoding" version = "2.3.2" @@ -2664,6 +2840,45 @@ dependencies = [ "sha2", ] +[[package]] +name = "phf" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" +dependencies = [ + "phf_shared", + "serde", +] + +[[package]] +name = "phf_codegen" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" +dependencies = [ + "fastrand", + "phf_shared", +] + +[[package]] +name = "phf_shared" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project" version = "1.1.10" @@ -2852,7 +3067,7 @@ dependencies = [ "pingora-http", "pingora-ketama", "pingora-runtime", - "rand 0.9.2", + "rand 0.10.1", "tokio", ] @@ -2865,7 +3080,7 @@ dependencies = [ "arrayvec", 
"hashbrown 0.16.0", "parking_lot", - "rand 0.9.2", + "rand 0.10.1", ] [[package]] @@ -3089,7 +3304,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls 0.23.35", - "socket2 0.5.10", + "socket2 0.6.1", "thiserror 2.0.17", "tokio", "tracing", @@ -3126,9 +3341,9 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.5.10", + "socket2 0.6.1", "tracing", - "windows-sys 0.52.0", + "windows-sys 0.60.2", ] [[package]] @@ -3146,6 +3361,12 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + [[package]] name = "rand" version = "0.8.5" @@ -3167,6 +3388,17 @@ dependencies = [ "rand_core 0.9.3", ] +[[package]] +name = "rand" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" +dependencies = [ + "chacha20", + "getrandom 0.4.3", + "rand_core 0.10.1", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -3205,6 +3437,12 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rand_core" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" + [[package]] name = "redox_syscall" version = "0.5.18" @@ -3381,6 +3619,49 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rmcp" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d1f571c72940a19d9532fe52dbea8bc9912bf1d766c2970bb824056b86f3f59" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "http 1.3.1", + "http-body 1.0.1", + "http-body-util", + "pastey", + "pin-project-lite", + "rand 0.10.1", + 
"rmcp-macros", + "schemars 1.1.0", + "serde", + "serde_json", + "sse-stream", + "thiserror 2.0.17", + "tokio", + "tokio-stream", + "tokio-util", + "tower-service", + "tracing", + "uuid", +] + +[[package]] +name = "rmcp-macros" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aad0035b69380782d78ea95b508327e6deaa2235909053e596eea8f27b5e1d5" +dependencies = [ + "darling 0.23.0", + "proc-macro2", + "quote", + "serde_json", + "syn 2.0.109", +] + [[package]] name = "rmp" version = "0.8.14" @@ -3604,12 +3885,26 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9558e172d4e8533736ba97870c4b2cd63f84b382a3d6eb063da41b91cce17289" dependencies = [ + "chrono", "dyn-clone", "ref-cast", + "schemars_derive", "serde", "serde_json", ] +[[package]] +name = "schemars_derive" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "301858a4023d78debd2353c7426dc486001bddc91ae31a76fb1f55132f7e2633" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals", + "syn 2.0.109", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -3778,6 +4073,17 @@ dependencies = [ "syn 2.0.109", ] +[[package]] +name = "serde_derive_internals" +version = "0.29.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.109", +] + [[package]] name = "serde_json" version = "1.0.145" @@ -3802,6 +4108,17 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_path_to_error" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10a9ff822e371bb5403e391ecd83e182e0e77ba7f6fe0160b795797109d1b457" +dependencies = [ + "itoa", + "serde", + "serde_core", +] + [[package]] name = "serde_repr" version = "0.1.20" @@ -3859,7 +4176,7 @@ version = "3.15.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "b91a903660542fced4e99881aa481bdbaec1634568ee02e0b8bd57c64cb38955" dependencies = [ - "darling", + "darling 0.21.3", "proc-macro2", "quote", "syn 2.0.109", @@ -3895,7 +4212,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.2.17", "digest", ] @@ -3935,6 +4252,12 @@ version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" +[[package]] +name = "siphasher" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ee5873ec9cce0195efcb7a4e9507a04cd49aec9c83d0389df45b1ef7ba2e649" + [[package]] name = "slab" version = "0.4.11" @@ -3967,6 +4290,19 @@ dependencies = [ "windows-sys 0.60.2", ] +[[package]] +name = "sse-stream" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3962b63f038885f15bce2c6e02c0e7925c072f1ac86bb60fd44c5c6b762fb72" +dependencies = [ + "bytes", + "futures-util", + "http-body 1.0.1", + "http-body-util", + "pin-project-lite", +] + [[package]] name = "stable_deref_trait" version = "1.2.1" @@ -4622,6 +4958,12 @@ dependencies = [ "rand 0.9.2", ] +[[package]] +name = "typed-arena" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" + [[package]] name = "typenum" version = "1.19.0" @@ -4661,6 +5003,15 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b09c83c3c29d37506a3e260c08c03743a6bb66a9cd432c6934ab501a190571f" +[[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + [[package]] name = "unicode-width" version = "0.1.14" diff --git a/Cargo.toml b/Cargo.toml index 5d1c0eb..a1c366a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -76,6 +76,12 @@ uuid = { version = "1.9", features = ["serde", "v4"] } ignore.workspace = true base64.workspace = true tokio-util.workspace = true +comrak = { version = "0.52", default-features = false } +libc = "0.2.186" +rmcp = { version = "1.8", default-features = false, features = ["server", "macros", "schemars", "transport-streamable-http-server", "transport-streamable-http-server-session", "tower"] } +axum = { version = "0.8", default-features = false, features = ["http1", "tokio", "json"] } +schemars = "1" +tempfile = "3" [workspace.dependencies] pretty_assertions = "1.4" @@ -113,3 +119,34 @@ lto = "thin" [package.metadata.dist] formula = "multi" + +[package.metadata.bacon] +default_job = "fmt-check" +env.CARGO_TERM_COLOR = "always" + +[package.metadata.bacon.jobs.fmt-check] +command = ["cargo", "fmt", "--all", "--", "--check"] +need_stdout = false +on_success = "job:check" + +[package.metadata.bacon.jobs.check] +command = ["cargo", "check", "--all-targets", "--workspace"] +need_stdout = false +on_success = "job:clippy" + +[package.metadata.bacon.jobs.clippy] +command = ["cargo", "make", "clippy"] +need_stdout = false + +[package.metadata.bacon.jobs.nextest] +command = [ + "cargo", + "nextest", + "run", + "--workspace", + "--hide-progress-bar", + "--failure-output", + "final", +] +need_stdout = true +analyzer = "nextest" diff --git a/Makefile.toml b/Makefile.toml index 128bc19..6e50d48 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -2,7 +2,6 @@ default_to_workspace = false [env] -CARGO_MAKE_CLIPPY_ARGS = "-- --no-deps" CARGO_MAKE_COVERAGE_PROVIDER = "llvm-cov" CARGO_MAKE_CARGO_BUILD_TEST_FLAGS = "" # This is the directory where we generate the CLI's reference @@ -31,6 +30,19 @@ dependencies = [ 
[tasks.pre-build] dependencies = ["format-toml-conditioned-flow", "unused-dependencies-flow"] +# Canonical clippy command — the single source of truth shared by CI +# (ci-flow -> clippy-flow) and the bacon clippy job. Overrides cargo-make's +# built-in `clippy` task so both paths run the IDENTICAL command. `-- -D warnings` +# makes any clippy or rustc warning fail both CI and the bacon monitor, so a +# clean monitor run predicts a clean CI clippy run exactly. `workspace = false` +# is explicit (the command iterates the workspace itself via `--workspace`). +[tasks.clippy] +description = "Canonical clippy command — shared by CI (ci-flow) and bacon." +category = "Development" +workspace = false +command = "cargo" +args = ["clippy", "--all-targets", "--workspace", "--locked", "--", "-D", "warnings"] + [tasks.ci-flow] dependencies = [ "pre-ci-flow", @@ -109,3 +121,10 @@ args = [ "--workspace", "@@split(CARGO_MAKE_CARGO_BUILD_TEST_FLAGS, )", ] + +[tasks.monitor] +description = "Run bacon headless for Claude Code's Monitor tool (no TTY, summary output on each change)." +category = "Development" +workspace = false +command = "bacon" +args = ["--headless", "--summary", "--no-help-line"] diff --git a/guides/checks.md b/guides/checks.md new file mode 100644 index 0000000..f36544c --- /dev/null +++ b/guides/checks.md @@ -0,0 +1,165 @@ +# Validate requirements with `multi check` + +`multi check` lets you declare **requirements** for a repository and have an +automated program validate that they hold. It's like a test suite — run it from +the CLI, get a pass/fail exit code — but it targets **non-functional ("ility") +requirements** that have no direct, programmatic unit to test (e.g. "no serif +fonts", "no images over 5 MB", "every public function is documented"). + +Each requirement is validated by one or more **checks**. In the MVP a check is a +`prompt`: a natural-language instruction that a Claude Code agent carries out to +decide whether the requirement is satisfied. 
A requirement is satisfied only if +**all** of its checks pass (logical AND). + +## ✅ Prerequisites + +- [ ] A working [`claude`](https://docs.claude.com/en/docs/claude-code) CLI on + your `PATH` (checks shell out to `claude -p`). +- [ ] **macOS** — the MVP sandboxes each check with an APFS copy-on-write clone. + Other operating systems are not yet supported. + +## 🏃 Running it + +From the root of your project: + +```bash +multi check +``` + +`multi check` recursively scans the working directory for files named exactly +`CHECKS.md`, runs every check it finds, and prints a report. You can point it at +a different directory: + +```bash +multi check path/to/project +``` + +### Exit codes (for CI) + +| Exit code | Meaning | +| --------- | ------- | +| `0` | Every requirement is satisfied (an empty project — no `CHECKS.md` — also exits `0`). | +| `1` | One or more requirements are unsatisfied. | +| non-zero | An operational error (e.g. a malformed `CHECKS.md`), printed as a diagnostic. | + +Because a clean failure is exit `1`, you can gate CI on it directly: + +```bash +multi check || { echo "requirements not met"; exit 1; } +``` + +## ✍️ Authoring `CHECKS.md` + +`CHECKS.md` files are ordinary Markdown. Two header patterns carry metadata; +everything else is prose. + +### Requirements + +An **H1** whose text begins with `Requirement ` (or the alias `Req `) declares a +requirement. The rest of the line is its **title**. + +```markdown +# Requirement No Yellow Text +``` + +```markdown +# Req No Yellow Text +``` + +Both declare a requirement titled `No Yellow Text`. + +### Checks + +An **H2** whose text begins with `Check ` declares a check. The text after +`Check ` is the check's title, and the Markdown **beneath** it (up to the next +requirement or check) is the **prompt** handed to the agent. A check belongs to +the nearest requirement above it. + +```markdown +# Requirement No Yellow Text +I don't like the color yellow.
+ +## Check Confirm No Yellow Text +Scan each CSS file in this directory. For every rule that sets a `color`, +ensure the value is not `yellow`. +``` + +Here, `I don't like the color yellow.` is **prose** — optional metadata that is +ignored when a requirement has explicit `## Check` headers. + +### Anonymous checks (prose-as-check) + +If a requirement declares **no** `## Check`, its prose body becomes a single +**anonymous check** that **inherits the requirement's title**: + +```markdown +# Requirement No Serif Fonts +Scan the CSS files in this directory. Check each font. +Ensure none of the named fonts are serif fonts. +``` + +This is equivalent to one requirement `No Serif Fonts` with one check, also +titled `No Serif Fonts`, whose prompt is the prose above. + +A requirement with **neither** a `## Check` **nor** prose is an error, as is a +`## Check` with no requirement above it. + +### Multiple checks (ANDed) + +A requirement may declare several checks. It is satisfied only if **all** of +them pass: + +```markdown +# Requirement Images must be under 5 MB +To keep downloads snappy, no image in this folder may exceed 5 MB. + +## Check JPEGs +List the `.jpg` files with `ls`/`grep`, `stat` each one, and flag any larger +than 5 MB. + +## Check SVGs +List the `.svg` files with `ls`/`grep`, `stat` each one, and flag any larger +than 5 MB. +``` + +### Multiple files + +You can keep more than one `CHECKS.md` in a project — colocate requirements with +the code they describe, or split a long suite into pieces. Every `CHECKS.md` +under the working directory is discovered recursively. (`.gitignore` rules are +respected, so generated and vendored trees are skipped.) + +Titles are **not** unique — two requirements (or two checks) may share a title, +and both are kept. + +## 📊 How results are reported + +Each requirement's title is printed in **green** if it passed and **red** if it +failed. 
For a failed requirement, its **failing checks** are listed in red along +with the agent's *evidence* explaining why. Passing checks are omitted to keep +the output focused. Color follows the global `--enable-colors` flag and degrades +to a clear plain-text form when disabled. + +## 🔒 The trust model + +Agents are nondeterministic, so `multi check` does **not** trust their stdout or +any sentinel file. Instead, each agent reports its verdict by calling a single +MCP tool, `report-check-result(success, evidence?)`, served by an in-process MCP +server the CLI runs on `localhost` (one dedicated endpoint per check). An agent +that finishes **without** calling the tool fails its check. This keeps results +trustworthy despite agent nondeterminism. + +## ⚠️ MVP constraints + +- **macOS only** — copy-on-write sandboxing uses APFS `clonefile`. Linux and + Windows support is planned. +- **`prompt`-type checks only** — checks run via `claude -p` against the `sonnet` + model family. A `shell` check type is planned. +- **Hardcoded configuration** — the model/provider are fixed for the MVP; there + is no environment or file-based configuration yet. + +## 📬 Need help? + +If you have questions, ideas, or bugs to report: + +👉 [support@multitool.run](mailto:support@multitool.run) diff --git a/rust-toolchain b/rust-toolchain deleted file mode 100644 index 2bf5ad0..0000000 --- a/rust-toolchain +++ /dev/null @@ -1 +0,0 @@ -stable diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000..2da6190 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,9 @@ +[toolchain] +channel = "1.96.0" +# The "default" profile adds the components: +# "clippy", "rustfmt", "rust-docs", "rustc", "cargo", and "rust-std". +# "llvm-tools-preview" is required by cargo-llvm-cov (the `coverage` job in +# on-merge.yml). Declaring it here provisions it onto this pinned toolchain so +# cargo-llvm-cov doesn't have to `rustup component add` it lazily at build time.
+components = ["rust-analyzer", "rust-src", "llvm-tools-preview"] +profile = "default" diff --git a/src/adapters/cloudflare/deployments.rs b/src/adapters/cloudflare/deployments.rs index 02825a8..fba2b66 100644 --- a/src/adapters/cloudflare/deployments.rs +++ b/src/adapters/cloudflare/deployments.rs @@ -253,7 +253,7 @@ mod tests { // Test deserialization let response: CloudflareResponse<TestResult> = serde_json::from_str(json_str).unwrap(); - assert_eq!(response.success, true); + assert!(response.success); // Test errors let errors = response.errors.as_ref().unwrap(); diff --git a/src/checks/config.rs b/src/checks/config.rs new file mode 100644 index 0000000..9488692 --- /dev/null +++ b/src/checks/config.rs @@ -0,0 +1,95 @@ +//! The configuration phase (M2 #1341). +//! +//! For the MVP this is **hardcoded** (no env/file loading), but its values are +//! **dependency-injected** forward — execution receives a [`BoxedExecutor`] built +//! from this [`Config`] rather than reading provider details at point of use. So +//! swapping providers later is a config change, not a rewrite. + +use std::time::Duration; + +use crate::checks::executor::BoxedExecutor; +use crate::checks::executor::claude::ClaudeExecutor; + +/// Effort level for the agent. Carried through configuration and logged by the +/// executor; not yet mapped to a concrete `claude -p` flag for the MVP +/// (see TODO in the executor). `Medium`/`High` are reserved for richer providers. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(dead_code)] // Medium/High are reserved for future provider wiring. +pub enum Effort { + Low, + Medium, + High, +} + +/// The resolved configuration for a `multi check` run. +#[derive(Debug, Clone)] +pub struct Config { + /// Optional model-provider base URL. `None` uses the `claude` CLI default. + pub provider_url: Option<String>, + /// The model family to run (default: the `sonnet` family). + pub model: String, + /// The effort level (default: low). 
+ pub effort: Effort, + /// Maximum number of checks executed concurrently. + pub concurrency: usize, + /// Per-agent wall-clock timeout (reaps an agent that hangs before reporting). + pub agent_timeout: Duration, + /// How many times to (re)run a check whose agent fails to report. Agents are + /// nondeterministic and occasionally hang or finish without calling the + /// report tool; a fresh attempt against the same endpoint usually succeeds. + /// A check only resolves as errored after all attempts are exhausted. + pub max_attempts: usize, +} + +impl Config { + /// Construct the concrete [`BoxedExecutor`] from this configuration. This is + /// the injection point: execution is handed the boxed executor, never a + /// concrete type or a global. + pub fn build_executor(&self) -> BoxedExecutor { + Box::new(ClaudeExecutor::new( + self.model.clone(), + self.provider_url.clone(), + self.effort, + self.agent_timeout, + )) + } +} + +/// The configuration phase: produce the hardcoded MVP [`Config`]. +/// +/// Hardcoded to `claude -p` + the `sonnet` family. Environment/file loading is +/// explicitly out of scope (see *Future work: global model configuration*). +pub fn configuration() -> Config { + Config { + provider_url: None, + // The `sonnet` family. (The original MVP target was `haiku`, but on the + // multi-file *reasoning* checks this feature exists for, haiku reliably + // spins in a runaway exploration loop and never reaches a verdict; + // sonnet reasons efficiently and reports in well under a minute.) + model: "sonnet".to_string(), + effort: Effort::Low, + // Each check is a full `claude` agent process, killed the instant it + // reports (see execution::run_one), so they don't linger. A small fan-out + // gives each (CPU-heavy) reasoning agent enough cores to finish promptly. + concurrency: 2, + // Reaps an agent that hangs *before* reporting so the check can be + // retried. 
Generous: the heaviest reasoning checks can take a few minutes + // under contention before they report. + agent_timeout: Duration::from_secs(240), + max_attempts: 3, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn defaults_are_hardcoded_and_bounded_concurrency() { + let cfg = configuration(); + assert_eq!(cfg.model, "sonnet"); + assert!(cfg.concurrency >= 1); + // The executor is constructible (DI seam works). + let _exec = cfg.build_executor(); + } +} diff --git a/src/checks/discovery/mod.rs b/src/checks/discovery/mod.rs new file mode 100644 index 0000000..bdd5dba --- /dev/null +++ b/src/checks/discovery/mod.rs @@ -0,0 +1,94 @@ +//! The discovery phase: find `CHECKS.md` files, parse each to an AST, extract +//! the requirement/check model (including anonymous checks), validate, and +//! assemble the final `Vec<Requirement>` consumed by execution. +//! +//! File loading + parsing run in parallel (one blocking task per file); this +//! module joins the results in deterministic (sorted-file) order and aggregates +//! all validation errors into a single diagnostic rather than failing fast, for +//! authoring ergonomics. + +mod parse; +mod walk; + +use std::path::Path; + +use miette::{IntoDiagnostic, Result}; +use multi_core::ManyError; + +use crate::checks::model::Requirement; + +/// Run the full discovery phase rooted at `root`. +/// +/// Returns the validated requirement set (every requirement guaranteed to have +/// ≥1 check), or an aggregated diagnostic naming every offending file/line. An +/// empty tree yields an empty set (the pipeline then succeeds with exit 0). +pub async fn discover(root: &Path) -> Result<Vec<Requirement>> { + let files = walk::find_checks_files(root)?; + + // Parse + extract each file in parallel on blocking tasks (file IO + CPU). 
+ let handles: Vec<_> = files + .into_iter() + .map(|path| tokio::task::spawn_blocking(move || parse::extract_file(&path))) + .collect(); + + let mut requirements = Vec::new(); + let mut errors = ManyError::default(); + for handle in handles { + let extraction = handle.await.into_diagnostic()?; + for err in extraction.errors { + errors.append(err); + } + requirements.extend(extraction.requirements); + } + + if !errors.is_empty() { + return Err(errors.into()); + } + Ok(requirements) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use tempfile::TempDir; + + #[tokio::test] + async fn valid_multi_file_tree_returns_all_requirements() { + let dir = TempDir::new().unwrap(); + fs::write(dir.path().join("CHECKS.md"), "# Requirement A\ndo a\n").unwrap(); + fs::create_dir_all(dir.path().join("sub")).unwrap(); + fs::write( + dir.path().join("sub/CHECKS.md"), + "# Requirement B\n## Check B1\nb1\n## Check B2\nb2\n", + ) + .unwrap(); + + let reqs = discover(dir.path()).await.unwrap(); + assert_eq!(reqs.len(), 2); + // Each requirement has at least one check. + assert!(reqs.iter().all(|r| !r.checks.is_empty())); + // Sorted-file order: root CHECKS.md ("A") before sub/CHECKS.md ("B"). + assert_eq!(reqs[0].title, "A"); + assert_eq!(reqs[1].checks.len(), 2); + } + + #[tokio::test] + async fn empty_tree_is_ok_and_empty() { + let dir = TempDir::new().unwrap(); + let reqs = discover(dir.path()).await.unwrap(); + assert!(reqs.is_empty()); + } + + #[tokio::test] + async fn orphan_check_aborts_discovery() { + let dir = TempDir::new().unwrap(); + fs::write( + dir.path().join("CHECKS.md"), + "## Check Orphan\nno req above\n", + ) + .unwrap(); + let err = discover(dir.path()).await.unwrap_err(); + assert!(format!("{err:?}").contains("orphan")); + } +} diff --git a/src/checks/discovery/parse.rs b/src/checks/discovery/parse.rs new file mode 100644 index 0000000..f005b30 --- /dev/null +++ b/src/checks/discovery/parse.rs @@ -0,0 +1,401 @@ +//! 
Parse a single `CHECKS.md` into a Markdown AST (via `comrak`) and walk it to +//! extract [`Requirement`]s and their [`Check`]s. +//! +//! Sentinel rules (header-encoded metadata): +//! +//! * An `H1` whose text matches `^(Requirement|Req)\s+` declares a **requirement**; +//! the remainder is the title. +//! * An `H2` whose text matches `^Check(\s|$)` declares a **check**, associated with +//! the nearest preceding requirement. The Markdown beneath it (until the next +//! sentinel heading) is the check **prompt**. +//! * Anything else is **prose**: optional metadata. When a requirement declares +//! no explicit `## Check`, its prose body is promoted to a single **anonymous +//! check** that inherits the requirement's title. +//! +//! Validation errors (orphan check, checkless requirement) are collected rather +//! than thrown so the caller can aggregate them across files. + +use std::path::Path; + +use comrak::{Arena, Options, nodes::NodeValue, parse_document}; +use miette::{Diagnostic, miette}; +use thiserror::Error; + +use crate::checks::model::{Check, Requirement}; + +/// The result of extracting one file: the requirements it declared plus any +/// validation errors (each a ready-to-render `miette` diagnostic).
+pub struct FileExtraction { + pub requirements: Vec<Requirement>, + pub errors: Vec<miette::Error>, +} + +#[derive(Debug, Error, Diagnostic)] +#[error("orphan check in {file}: a `## Check` at line {line} has no preceding requirement")] +#[diagnostic(help("move this `## Check` beneath a `# Requirement` / `# Req` heading"))] +struct OrphanCheck { + file: String, + line: usize, +} + +#[derive(Debug, Error, Diagnostic)] +#[error( + "checkless requirement in {file}: requirement {title:?} (line {line}) declares no `## Check` and has no prose to infer an anonymous check from" +)] +#[diagnostic(help( + "add a `## Check ...` beneath it, or write prose describing how to validate it" +))] +struct ChecklessRequirement { + file: String, + title: String, + line: usize, +} + +/// A structural sentinel found while scanning the AST. +enum Marker { + Requirement { title: String, line: usize }, + Check { title: String, line: usize }, +} + +/// Parse `source` (the contents of `path`) and extract its requirements/checks. +/// +/// This is the AST-walking heart of discovery (MULTI-1335/1336/1337/1338). It +/// never fails outright — structural problems are returned as `errors` so the +/// caller can aggregate across all files. +pub fn extract(path: &Path, source: &str) -> FileExtraction { + let file = path.display().to_string(); + let line_starts = line_byte_starts(source); + + // Parse to a comrak AST. The arena owns the nodes for this scope. + let arena = Arena::new(); + let root = parse_document(&arena, source, &Options::default()); + + // Pass 1: collect the structural markers (Requirement/Check headings) in + // document order, with the 1-based line each heading starts and ends on. 
+ let mut markers: Vec<(Marker, usize)> = Vec::new(); // (marker, heading_end_line) + for node in root.children() { + let (level, start_line, end_line) = { + let data = node.data(); + match &data.value { + NodeValue::Heading(h) => { + (h.level, data.sourcepos.start.line, data.sourcepos.end.line) + } + _ => continue, + } + }; + let text = heading_text(node); + let marker = match level { + 1 => requirement_title(&text).map(|title| Marker::Requirement { + title, + line: start_line, + }), + 2 => check_title(&text).map(|title| Marker::Check { + title, + line: start_line, + }), + _ => None, + }; + if let Some(marker) = marker { + markers.push((marker, end_line)); + } + } + + // Pass 2: assemble requirements. Each marker's body is the raw Markdown from + // the line after its heading up to (but not including) the next marker. + let total_lines = line_starts.len(); + let mut errors: Vec<miette::Error> = Vec::new(); + let mut builders: Vec<ReqBuilder> = Vec::new(); + + for (idx, (marker, heading_end_line)) in markers.iter().enumerate() { + let next_start = markers + .get(idx + 1) + .map(|(m, _)| marker_line(m)) + .unwrap_or(total_lines + 1); + let body = slice_lines(source, &line_starts, heading_end_line + 1, next_start); + + match marker { + Marker::Requirement { title, line } => { + builders.push(ReqBuilder { + title: title.clone(), + line: *line, + prose: body.to_string(), + checks: Vec::new(), + }); + } + Marker::Check { title, line } => match builders.last_mut() { + Some(req) => { + let title = if title.is_empty() { + req.title.clone() // inherit when `## Check` has no title text + } else { + title.clone() + }; + req.checks.push(Check { + title, + prompt: body.to_string(), + }); + } + None => errors.push( + OrphanCheck { + file: file.clone(), + line: *line, + } + .into(), + ), + }, + } + } + + // Pass 3: anonymous-check inference + checkless validation. 
+ let mut requirements = Vec::new(); + for builder in builders { + let ReqBuilder { + title, + line, + prose, + mut checks, + } = builder; + if checks.is_empty() { + let prose = prose.trim(); + if prose.is_empty() { + errors.push( + ChecklessRequirement { + file: file.clone(), + title: title.clone(), + line, + } + .into(), + ); + continue; + } + // Promote prose to a single anonymous check inheriting the req title. + checks.push(Check { + title: title.clone(), + prompt: prose.to_string(), + }); + } + requirements.push(Requirement { + filepath: path.to_path_buf(), + title, + checks, + }); + } + + FileExtraction { + requirements, + errors, + } +} + +/// A requirement under construction during extraction. +struct ReqBuilder { + title: String, + line: usize, + prose: String, + checks: Vec<Check>, +} + +fn marker_line(m: &Marker) -> usize { + match m { + Marker::Requirement { line, .. } | Marker::Check { line, .. } => *line, + } +} + +/// If `text` declares a requirement (`Requirement <title>` / `Req <title>`), +/// return the title. Requires whitespace after the keyword, per the spec's +/// `^(Requirement|Req) ` sentinel. +fn requirement_title(text: &str) -> Option<String> { + for kw in ["Requirement", "Req"] { + if let Some(rest) = text.strip_prefix(kw) + && rest.starts_with(char::is_whitespace) + { + return Some(rest.trim().to_string()); + } + } + None +} + +/// If `text` declares a check (`Check` or `Check <title>`), return the title +/// (empty string for a bare `## Check`, which inherits the requirement title). +fn check_title(text: &str) -> Option<String> { + let rest = text.strip_prefix("Check")?; + if rest.is_empty() { + return Some(String::new()); + } + if rest.starts_with(char::is_whitespace) { + return Some(rest.trim().to_string()); + } + None +} + +/// Concatenate the inline text of a heading node into a plain string. 
+fn heading_text<'a>(node: &'a comrak::nodes::AstNode<'a>) -> String { + let mut out = String::new(); + for d in node.descendants() { + match &d.data().value { + NodeValue::Text(s) => out.push_str(s.as_ref()), + NodeValue::Code(c) => out.push_str(&c.literal), + NodeValue::SoftBreak | NodeValue::LineBreak => out.push(' '), + _ => {} + } + } + out +} + +/// Byte offset of the start of each 1-based line in `source`. +fn line_byte_starts(source: &str) -> Vec<usize> { + let mut starts = vec![0usize]; + for (i, b) in source.bytes().enumerate() { + if b == b'\n' { + starts.push(i + 1); + } + } + starts +} + +/// Slice the raw source spanning 1-based lines `[start_line, end_line_excl)`, +/// trimmed. Out-of-range bounds clamp to the document. +fn slice_lines<'a>( + source: &'a str, + line_starts: &[usize], + start_line: usize, + end_line_excl: usize, +) -> &'a str { + let byte_at = |line_1based: usize| -> usize { + if line_1based == 0 { + 0 + } else if line_1based - 1 < line_starts.len() { + line_starts[line_1based - 1] + } else { + source.len() + } + }; + let start = byte_at(start_line).min(source.len()); + let end = byte_at(end_line_excl).min(source.len()); + if start >= end { + return ""; + } + source[start..end].trim() +} + +/// Convenience: read + extract, surfacing read errors as a one-off diagnostic. 
+pub fn extract_file(path: &Path) -> FileExtraction { + match std::fs::read_to_string(path) { + Ok(source) => extract(path, &source), + Err(e) => FileExtraction { + requirements: Vec::new(), + errors: vec![miette!("failed to read {}: {e}", path.display())], + }, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::path::PathBuf; + + fn extract_str(src: &str) -> FileExtraction { + extract(&PathBuf::from("CHECKS.md"), src) + } + + #[test] + fn requirement_with_two_checks_keeps_order_and_verbatim_prompts() { + let src = "# Requirement Images small\nintro prose\n\n## Check JPegs\nstat the jpgs\nflag big ones\n\n## Check SVGs\nstat the svgs\n"; + let out = extract_str(src); + assert!(out.errors.is_empty(), "errors: {:?}", out.errors); + assert_eq!(out.requirements.len(), 1); + let req = &out.requirements[0]; + assert_eq!(req.title, "Images small"); + assert_eq!(req.checks.len(), 2); + assert_eq!(req.checks[0].title, "JPegs"); + assert_eq!(req.checks[0].prompt, "stat the jpgs\nflag big ones"); + assert_eq!(req.checks[1].title, "SVGs"); + assert_eq!(req.checks[1].prompt, "stat the svgs"); + } + + #[test] + fn req_alias_parses_same_as_requirement() { + let a = extract_str("# Requirement Foo\n## Check C\ndo it\n"); + let b = extract_str("# Req Foo\n## Check C\ndo it\n"); + assert_eq!(a.requirements[0].title, "Foo"); + assert_eq!(b.requirements[0].title, "Foo"); + } + + #[test] + fn anonymous_check_inherits_title_and_uses_prose() { + let src = "# Requirement No Serif Fonts\nScan the CSS files in this directory. 
Check each font.\nEnsure none of the named fonts are serif fonts.\n"; + let out = extract_str(src); + assert!(out.errors.is_empty()); + let req = &out.requirements[0]; + assert_eq!(req.checks.len(), 1); + assert_eq!(req.checks[0].title, "No Serif Fonts"); + assert!(req.checks[0].prompt.starts_with("Scan the CSS files")); + assert!(req.checks[0].prompt.ends_with("serif fonts.")); + } + + #[test] + fn explicit_checks_do_not_promote_prose() { + let src = "# Requirement R\nthis prose is metadata\n## Check C\nthe real prompt\n"; + let out = extract_str(src); + let req = &out.requirements[0]; + assert_eq!(req.checks.len(), 1); + assert_eq!(req.checks[0].prompt, "the real prompt"); + } + + #[test] + fn orphan_check_is_an_error() { + let out = extract_str("## Check Bar\ndo something\n"); + assert_eq!(out.requirements.len(), 0); + assert_eq!(out.errors.len(), 1); + assert!(format!("{:?}", out.errors[0]).contains("orphan")); + } + + #[test] + fn checkless_requirement_is_an_error() { + let out = extract_str("# Requirement Empty\n\n# Requirement Other\ndo a thing\n"); + // "Empty" has no prose and no checks -> error; "Other" is a valid anon check. + assert_eq!(out.requirements.len(), 1); + assert_eq!(out.requirements[0].title, "Other"); + assert_eq!(out.errors.len(), 1); + assert!(format!("{:?}", out.errors[0]).contains("checkless")); + } + + #[test] + fn non_sentinel_headings_are_prose() { + // `# Overview` is not a requirement; it should be ignored as prose and + // not yield a requirement. 
+ let out = extract_str("# Overview\nsome words\n"); + assert_eq!(out.requirements.len(), 0); + assert!(out.errors.is_empty()); + } + + #[test] + fn multiple_requirements_in_one_file() { + let src = "# Requirement No JSON\nno json files\n\n# Requirement Small Images\n## Check J\njpgs\n"; + let out = extract_str(src); + assert!(out.errors.is_empty()); + assert_eq!(out.requirements.len(), 2); + assert_eq!(out.requirements[0].title, "No JSON"); + assert_eq!(out.requirements[0].checks[0].title, "No JSON"); + assert_eq!(out.requirements[1].title, "Small Images"); + } + + #[test] + fn duplicate_titles_are_preserved_not_deduped() { + // Two requirements share a title; one of them has two checks with the + // same title. None of these are unique — all must survive verbatim. + let src = + "# Requirement Dup\ndo a\n\n# Requirement Dup\n## Check Same\nb1\n## Check Same\nb2\n"; + let out = extract_str(src); + assert!(out.errors.is_empty(), "errors: {:?}", out.errors); + assert_eq!(out.requirements.len(), 2); + assert_eq!(out.requirements[0].title, "Dup"); + assert_eq!(out.requirements[1].title, "Dup"); + // Both same-titled checks are kept, in order. + let checks = &out.requirements[1].checks; + assert_eq!(checks.len(), 2); + assert_eq!(checks[0].title, "Same"); + assert_eq!(checks[1].title, "Same"); + assert_eq!(checks[0].prompt, "b1"); + assert_eq!(checks[1].prompt, "b2"); + } +} diff --git a/src/checks/discovery/walk.rs b/src/checks/discovery/walk.rs new file mode 100644 index 0000000..1bb0d9b --- /dev/null +++ b/src/checks/discovery/walk.rs @@ -0,0 +1,61 @@ +//! Recursively discover `CHECKS.md` files beneath the working directory. +//! +//! Uses the `ignore` crate (the ripgrep walker) for a fast, gitignore-aware +//! walk. Ignore files (`.gitignore`, etc.) are **respected** by default so that +//! generated / vendored trees are skipped. 
+ +use std::path::{Path, PathBuf}; + +use ignore::WalkBuilder; +use miette::{IntoDiagnostic, Result}; + +/// The exact filename that declares checks. +pub const CHECKS_FILENAME: &str = "CHECKS.md"; + +/// Recursively find every file named exactly `CHECKS.md` under `root`. +/// +/// Results are sorted lexicographically so downstream parsing and reporting are +/// deterministic regardless of filesystem iteration order. +pub fn find_checks_files(root: &Path) -> Result<Vec<PathBuf>> { + let mut files = Vec::new(); + for entry in WalkBuilder::new(root).standard_filters(true).build() { + let entry = entry.into_diagnostic()?; + let is_file = entry.file_type().is_some_and(|ft| ft.is_file()); + if is_file && entry.file_name() == CHECKS_FILENAME { + files.push(entry.path().to_path_buf()); + } + } + files.sort(); + Ok(files) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use tempfile::TempDir; + + #[test] + fn finds_nested_checks_and_ignores_others() { + let dir = TempDir::new().unwrap(); + let root = dir.path(); + fs::write(root.join("CHECKS.md"), "# Requirement A\ndo a\n").unwrap(); + fs::create_dir_all(root.join("sub/deep")).unwrap(); + fs::write(root.join("sub/CHECKS.md"), "# Requirement B\ndo b\n").unwrap(); + fs::write(root.join("sub/deep/CHECKS.md"), "# Requirement C\ndo c\n").unwrap(); + // Non-matching files are ignored. + fs::write(root.join("README.md"), "nope").unwrap(); + fs::write(root.join("sub/checks.md"), "wrong case").unwrap(); + + let found = find_checks_files(root).unwrap(); + assert_eq!(found.len(), 3, "found: {found:?}"); + assert!(found.iter().all(|p| p.file_name().unwrap() == "CHECKS.md")); + } + + #[test] + fn empty_tree_yields_empty_set() { + let dir = TempDir::new().unwrap(); + let found = find_checks_files(dir.path()).unwrap(); + assert!(found.is_empty()); + } +} diff --git a/src/checks/e2e.rs b/src/checks/e2e.rs new file mode 100644 index 0000000..c6120c7 --- /dev/null +++ b/src/checks/e2e.rs @@ -0,0 +1,152 @@ +//! 
End-to-end pipeline test (MULTI-1355). +//! +//! Drives the real discovery → execution → reporting → exit-code pipeline over a +//! fixture tree, with the fake executor and a no-op sandbox injected so verdicts +//! are scripted. Runs deterministically in CI with no `claude`, network, or APFS +//! dependency, guarding the end-to-end contract against regressions. + +use std::fs; +use std::sync::Arc; + +use clap::Parser; +use tempfile::TempDir; + +use crate::checks::config::configuration; +use crate::checks::discovery::discover; +use crate::checks::execution::execute; +use crate::checks::executor::FakeExecutor; +use crate::checks::reporting::report; +use crate::checks::sandbox::NoopSandbox; +use crate::{Cli, Terminal}; + +/// A terminal with color forced off, so reporting emits deterministic text. +fn plain_terminal() -> Terminal { + let cli = Cli::parse_from(["multi", "--enable-colors", "never"]); + Terminal::new(&cli) +} + +/// A fixture tree spanning two files: a satisfied anonymous-check requirement, a +/// two-check (AND) requirement, and — nested — a failing requirement. +fn write_fixture() -> TempDir { + let dir = TempDir::new().unwrap(); + fs::write( + dir.path().join("CHECKS.md"), + "# Requirement Anon Satisfied\n\ + scan and confirm this holds\n\ + \n\ + # Requirement Multi And\n\ + ## Check First\n\ + first prompt\n\ + ## Check Second\n\ + second prompt\n", + ) + .unwrap(); + fs::create_dir_all(dir.path().join("nested")).unwrap(); + fs::write( + dir.path().join("nested/CHECKS.md"), + "# Requirement Failing\n\ + ## Check Bad\n\ + this one fails\n", + ) + .unwrap(); + dir +} + +#[tokio::test] +async fn pipeline_satisfied_failed_multi_and_anonymous() { + let dir = write_fixture(); + let reqs = discover(dir.path()).await.unwrap(); + + // Discovery: three requirements across two files, sorted (root before nested). 
+ assert_eq!(reqs.len(), 3); + assert_eq!(reqs[0].title, "Anon Satisfied"); + assert_eq!(reqs[1].title, "Multi And"); + assert_eq!(reqs[2].title, "Failing"); + // The anonymous check inherits the requirement's title. + assert_eq!(reqs[0].checks.len(), 1); + assert_eq!(reqs[0].checks[0].title, "Anon Satisfied"); + // The multi-check requirement carries two checks (ANDed). + assert_eq!(reqs[1].checks.len(), 2); + + // Script the fake by check id (ids are assigned in requirement/check order). + // Everything passes except the "Failing" requirement's check. + let mut fake = FakeExecutor::new(); + let mut id = 0; + let mut total = 0; + for req in &reqs { + for _ in &req.checks { + let pass = req.title != "Failing"; + fake = fake.with_report(id, pass, Some(if pass { "ok" } else { "nope" })); + id += 1; + total += 1; + } + } + let fake = Arc::new(fake); + + let cfg = configuration(); + let outcomes = execute(&cfg, fake.clone(), Arc::new(NoopSandbox), dir.path(), &reqs) + .await + .unwrap(); + + // Aggregated verdicts: satisfied / satisfied(AND) / failed. + assert!( + outcomes[0].satisfied, + "anonymous-check requirement should pass" + ); + assert!( + outcomes[1].satisfied, + "multi-check AND requirement should pass" + ); + assert!(!outcomes[2].satisfied, "failing requirement should fail"); + assert_eq!(outcomes[2].failing_checks().count(), 1); + assert_eq!( + outcomes[2].check_outcomes[0].evidence.as_deref(), + Some("nope") + ); + + // Every check was dispatched exactly once. + assert_eq!(fake.seen().len(), total); + + // Reporting renders without error and yields exit 1 (one requirement failed). 
+ let code = report(&plain_terminal(), &outcomes).unwrap(); + assert_eq!(code, 1); +} + +#[tokio::test] +async fn all_satisfied_exits_zero() { + let dir = TempDir::new().unwrap(); + fs::write(dir.path().join("CHECKS.md"), "# Requirement Ok\ndo it\n").unwrap(); + let reqs = discover(dir.path()).await.unwrap(); + + let fake = Arc::new(FakeExecutor::new().with_report(0, true, None)); + let cfg = configuration(); + let outcomes = execute(&cfg, fake, Arc::new(NoopSandbox), dir.path(), &reqs) + .await + .unwrap(); + assert!(outcomes[0].satisfied); + + let code = report(&plain_terminal(), &outcomes).unwrap(); + assert_eq!(code, 0); +} + +#[tokio::test] +async fn empty_tree_exits_zero() { + let dir = TempDir::new().unwrap(); + let reqs = discover(dir.path()).await.unwrap(); + assert!(reqs.is_empty()); + + let cfg = configuration(); + let outcomes = execute( + &cfg, + Arc::new(FakeExecutor::new()), + Arc::new(NoopSandbox), + dir.path(), + &reqs, + ) + .await + .unwrap(); + assert!(outcomes.is_empty()); + + let code = report(&plain_terminal(), &outcomes).unwrap(); + assert_eq!(code, 0); +} diff --git a/src/checks/execution.rs b/src/checks/execution.rs new file mode 100644 index 0000000..5ddd615 --- /dev/null +++ b/src/checks/execution.rs @@ -0,0 +1,396 @@ +//! The execution phase (M5). +//! +//! Run every check **in parallel** (bounded), each in its own CoW sandbox and +//! against its own MCP endpoint; then reconcile each check's verdict (the +//! MCP-reported `success` is authoritative) and aggregate checks into +//! per-requirement outcomes via logical AND. 
+ +use std::collections::HashMap; +use std::path::Path; +use std::sync::Arc; + +use miette::{IntoDiagnostic, Result}; +use tokio::sync::{Notify, Semaphore}; +use tokio::task::JoinSet; + +use crate::checks::config::Config; +use crate::checks::executor::{ + AgentOutcome, AgentRunRequest, BoxedExecutor, CheckExecutor, assemble_instructions, +}; +use crate::checks::mcp::{CheckReport, ReportStore, ResultServer, mcp_config_json}; +use crate::checks::model::{ + Check, CheckId, CheckOutcome, Requirement, RequirementOutcome, Verdict, +}; +use crate::checks::sandbox::Sandbox; + +/// A single check flattened out of the requirement set for execution, tagged +/// with its run-unique id and the requirement it rolls up into. +struct PlannedCheck { + id: CheckId, + req_index: usize, + check: Check, +} + +/// Convenience wrapper used by the pipeline orchestrator: build the executor and +/// sandbox from configuration (DI) and run [`execute`]. +pub async fn execution_phase( + cfg: &Config, + executor: BoxedExecutor, + working_dir: &Path, + requirements: &[Requirement], +) -> Result<Vec<RequirementOutcome>> { + let executor: Arc<dyn CheckExecutor + Send + Sync> = Arc::from(executor); + let sandbox: Arc<dyn Sandbox + Send + Sync> = + Arc::from(crate::checks::sandbox::select_sandbox()); + execute(cfg, executor, sandbox, working_dir, requirements).await +} + +/// Run all checks and produce per-requirement outcomes. +/// +/// `executor` and `sandbox` are injected so tests can substitute fakes. +pub async fn execute( + cfg: &Config, + executor: Arc<dyn CheckExecutor + Send + Sync>, + sandbox: Arc<dyn Sandbox + Send + Sync>, + working_dir: &Path, + requirements: &[Requirement], +) -> Result<Vec<RequirementOutcome>> { + // Flatten checks, assigning each a run-unique id. 
+ let mut planned: Vec<PlannedCheck> = Vec::new(); + for (req_index, req) in requirements.iter().enumerate() { + for check in &req.checks { + planned.push(PlannedCheck { + id: planned.len(), + req_index, + check: check.clone(), + }); + } + } + + // Nothing to run: every requirement trivially aggregates (empty AND = true, + // though validation guarantees ≥1 check in practice). + if planned.is_empty() { + return Ok(aggregate(requirements, HashMap::new())); + } + + // Stand up the one MCP server with an endpoint per check. + let ids: Vec<CheckId> = planned.iter().map(|p| p.id).collect(); + let server = ResultServer::start(&ids).await?; + + // The most recent agent outcome per check. The MCP-reported verdict is folded + // into `AgentOutcome::reported` by `run_one` (which kills the agent the + // instant it reports). + let mut last_outcome: HashMap<CheckId, Result<AgentOutcome>> = HashMap::new(); + + // Re-run any check whose agent fails to report, up to `max_attempts`. Agents + // are nondeterministic and occasionally hang or stop without calling the + // tool; a fresh attempt usually succeeds. The per-check endpoint's + // single-call flag stays unset until a real report arrives, so a retry + // reports to the same endpoint. + let mut pending: Vec<&PlannedCheck> = planned.iter().collect(); + let attempts = cfg.max_attempts.max(1); + for attempt in 1..=attempts { + if pending.is_empty() { + break; + } + if attempt > 1 { + tracing::info!( + attempt, + checks = pending.len(), + "retrying checks whose agent did not report" + ); + } + + // Dispatch the pending checks concurrently, bounded by the limit. 
+ let permits = Arc::new(Semaphore::new(cfg.concurrency.max(1))); + let mut set: JoinSet<(CheckId, Result<AgentOutcome>)> = JoinSet::new(); + for p in &pending { + let permit = permits.clone().acquire_owned().await.into_diagnostic()?; + let executor = executor.clone(); + let sandbox = sandbox.clone(); + let id = p.id; + let instructions = assemble_instructions(&p.check); + let endpoint_url = server.endpoint_url(id); + let (notify, reports) = server.report_handle(id); + let working_dir = working_dir.to_path_buf(); + set.spawn(async move { + let _permit = permit; + let outcome = run_one( + executor, + sandbox, + id, + instructions, + &endpoint_url, + &working_dir, + notify, + reports, + ) + .await; + (id, outcome) + }); + } + while let Some(joined) = set.join_next().await { + let (id, outcome) = joined.into_diagnostic()?; + last_outcome.insert(id, outcome); + } + + // Whatever still has no reported verdict is retried in the next round. + pending = planned + .iter() + .filter(|p| !has_report(last_outcome.get(&p.id))) + .collect(); + } + + server.shutdown().await; + + // Reconcile each check, then aggregate per requirement. + let mut outcomes: HashMap<CheckId, CheckOutcome> = HashMap::new(); + for p in &planned { + let outcome = reconcile(last_outcome.get(&p.id), &p.check.title); + outcomes.insert(p.id, outcome); + } + Ok(aggregate_planned(requirements, &planned, outcomes)) +} + +/// Whether an agent outcome carries a reported verdict. +fn has_report(outcome: Option<&Result<AgentOutcome>>) -> bool { + matches!(outcome, Some(Ok(o)) if o.reported.is_some()) +} + +/// Drive one check: sandbox → mcp-config → dispatch the agent, racing the +/// agent's MCP report against its process. The agent's job is done the instant +/// it reports, so on a report we drop the run future — which kills the agent +/// (`kill_on_drop`) and avoids the post-report cleanup hangs some agents +/// exhibit. 
If the process exits (or the executor's timeout fires) first, we +/// fold in any report that landed alongside. +#[allow(clippy::too_many_arguments)] +async fn run_one( + executor: Arc<dyn CheckExecutor + Send + Sync>, + sandbox: Arc<dyn Sandbox + Send + Sync>, + id: CheckId, + instructions: String, + endpoint_url: &str, + working_dir: &Path, + notify: Arc<Notify>, + reports: ReportStore, +) -> Result<AgentOutcome> { + let handle = sandbox.create(working_dir).await?; + let config_file = write_mcp_config(&mcp_config_json(endpoint_url))?; + + let request = AgentRunRequest { + check_id: id, + instructions, + working_dir: handle.path().to_path_buf(), + mcp_config_path: config_file.path().to_path_buf(), + }; + + let report_for = |reports: &ReportStore| reports.lock().unwrap().get(&id).cloned(); + // `run_check` already returns a boxed (Unpin) future, so `&mut run` is fine. + let mut run = executor.run_check(request); + let outcome = tokio::select! { + // The agent reported: take the verdict. `run` is dropped when this + // function returns (just below), which kills the now-redundant agent. + _ = notify.notified() => { + AgentOutcome { + exited_cleanly: true, + exit_code: None, + stderr: String::new(), + reported: report_for(&reports), + } + } + // The process finished (clean exit, error, or the executor's timeout). + result = &mut run => { + let mut o = result?; + if o.reported.is_none() { + o.reported = report_for(&reports); + } + o + } + }; + + // Drop the run future first so a still-running agent is killed before we + // tear down its sandbox and mcp-config file. + drop(run); + drop(config_file); + drop(handle); + Ok(outcome) +} + +/// Write the `--mcp-config` JSON to a temp file the agent can read. 
/// Format captured stderr as a `": <snippet>"` suffix for an evidence message.
///
/// Leading and trailing whitespace is trimmed first. Blank input yields an
/// empty string, so callers can append the result unconditionally. Output longer
/// than 200 characters is cut on a character boundary and marked with a trailing
/// `…`, so a reader can tell the text was shortened rather than mistaking the
/// snippet for the complete message.
fn stderr_suffix(stderr: &str) -> String {
    /// Maximum number of characters of stderr echoed into the evidence text.
    const MAX_SNIPPET_CHARS: usize = 200;

    let trimmed = stderr.trim();
    if trimmed.is_empty() {
        return String::new();
    }

    // Count characters, not bytes, so multi-byte text is never split mid-char.
    let mut chars = trimmed.chars();
    let snippet: String = chars.by_ref().take(MAX_SNIPPET_CHARS).collect();
    // Anything left in the iterator is text that did not fit.
    let marker = if chars.next().is_some() { "…" } else { "" };
    format!(": {snippet}{marker}")
}
+fn aggregate_planned( + requirements: &[Requirement], + planned: &[PlannedCheck], + mut outcomes: HashMap<CheckId, CheckOutcome>, +) -> Vec<RequirementOutcome> { + let mut buckets: Vec<Vec<CheckOutcome>> = vec![Vec::new(); requirements.len()]; + for p in planned { + if let Some(outcome) = outcomes.remove(&p.id) { + buckets[p.req_index].push(outcome); + } + } + requirements + .iter() + .zip(buckets) + .map(|(req, checks)| { + RequirementOutcome::aggregate(req.title.clone(), req.filepath.clone(), checks) + }) + .collect() +} + +/// Aggregate when there are no checks to run (degenerate suites). +fn aggregate( + requirements: &[Requirement], + _outcomes: HashMap<CheckId, CheckOutcome>, +) -> Vec<RequirementOutcome> { + requirements + .iter() + .map(|req| { + RequirementOutcome::aggregate(req.title.clone(), req.filepath.clone(), Vec::new()) + }) + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::checks::config::configuration; + use crate::checks::executor::FakeExecutor; + use crate::checks::sandbox::NoopSandbox; + use std::path::PathBuf; + + fn req(title: &str, checks: Vec<(&str, &str)>) -> Requirement { + Requirement { + filepath: PathBuf::from("CHECKS.md"), + title: title.to_string(), + checks: checks + .into_iter() + .map(|(t, p)| Check { + title: t.to_string(), + prompt: p.to_string(), + }) + .collect(), + } + } + + #[tokio::test] + async fn requirement_satisfied_only_when_all_checks_pass() { + // Two requirements: first has two passing checks (ids 0,1); second has + // one failing check (id 2). 
+ let reqs = vec![ + req("Both pass", vec![("a", "pa"), ("b", "pb")]), + req("One fails", vec![("c", "pc")]), + ]; + let executor = Arc::new( + FakeExecutor::new() + .with_report(0, true, Some("a ok")) + .with_report(1, true, None) + .with_report(2, false, Some("c failed")), + ); + let cfg = configuration(); + let out = execute( + &cfg, + executor.clone(), + Arc::new(NoopSandbox), + &PathBuf::from("."), + &reqs, + ) + .await + .unwrap(); + + assert_eq!(out.len(), 2); + assert!(out[0].satisfied); + assert!(!out[1].satisfied); + assert_eq!(out[1].failing_checks().count(), 1); + assert_eq!( + out[1].check_outcomes[0].evidence.as_deref(), + Some("c failed") + ); + // All three checks were dispatched. + assert_eq!(executor.seen().len(), 3); + } + + #[tokio::test] + async fn missing_report_errors_the_check_without_hanging() { + let reqs = vec![req("Silent", vec![("c", "prompt")])]; + let executor = FakeExecutor::new().with_silent(0); + let mut cfg = configuration(); + cfg.max_attempts = 1; + let out = execute( + &cfg, + Arc::new(executor), + Arc::new(NoopSandbox), + &PathBuf::from("."), + &reqs, + ) + .await + .unwrap(); + assert!(!out[0].satisfied); + assert_eq!(out[0].check_outcomes[0].verdict, Verdict::Errored); + } +} diff --git a/src/checks/executor/claude.rs b/src/checks/executor/claude.rs new file mode 100644 index 0000000..8343640 --- /dev/null +++ b/src/checks/executor/claude.rs @@ -0,0 +1,103 @@ +//! The concrete [`CheckExecutor`] for the MVP: shell out to the Claude Code CLI +//! via `claude -p`. The shell specifics live behind the trait so a future Claude +//! Code SDK executor can replace this without touching execution. + +use std::time::Duration; + +use async_trait::async_trait; +use miette::{IntoDiagnostic, Result}; +use tokio::process::Command; + +use super::{AgentOutcome, AgentRunRequest, CheckExecutor}; +use crate::checks::config::Effort; + +/// Runs each check by invoking `claude -p` non-interactively. 
Model/provider/ +/// effort come from injected configuration (see [`crate::checks::config::Config`]), +/// never hardcoded here. +pub struct ClaudeExecutor { + /// The model family to run (e.g. the `sonnet` family for the MVP). + model: String, + /// Optional model-provider base URL; when set, passed as `ANTHROPIC_BASE_URL`. + provider_url: Option<String>, + /// The effort level (logged for now; see TODO in `run_check`). + effort: Effort, + /// Per-agent wall-clock timeout; on expiry the child is killed and the check + /// resolves as errored (no report). + timeout: Duration, + /// The CLI program to invoke (`claude`). + program: String, +} + +impl ClaudeExecutor { + pub fn new( + model: String, + provider_url: Option<String>, + effort: Effort, + timeout: Duration, + ) -> Self { + Self { + model, + provider_url, + effort, + timeout, + program: "claude".to_string(), + } + } +} + +#[async_trait] +impl CheckExecutor for ClaudeExecutor { + async fn run_check(&self, req: AgentRunRequest) -> Result<AgentOutcome> { + // TODO: "effort level" has no clean `claude -p` flag yet; for + // now it is recorded for diagnostics and wired through when a richer + // provider lands. + tracing::debug!( + check_id = req.check_id, + model = %self.model, + effort = ?self.effort, + "dispatching claude -p check", + ); + + let mut cmd = Command::new(&self.program); + cmd.arg("-p") + .arg(&req.instructions) + .arg("--model") + .arg(&self.model) + .arg("--mcp-config") + .arg(&req.mcp_config_path) + // The sandbox is a throwaway CoW clone, so skip interactive + // permission prompts (the agent runs non-interactively). + .arg("--dangerously-skip-permissions") + .current_dir(&req.working_dir) + .stdin(std::process::Stdio::null()) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + // Ensure the child is reaped if we drop it on timeout. 
+ .kill_on_drop(true); + if let Some(url) = &self.provider_url { + cmd.env("ANTHROPIC_BASE_URL", url); + } + + let child = cmd.spawn().into_diagnostic()?; + match tokio::time::timeout(self.timeout, child.wait_with_output()).await { + Ok(result) => { + let output = result.into_diagnostic()?; + Ok(AgentOutcome { + exited_cleanly: output.status.success(), + exit_code: output.status.code(), + stderr: String::from_utf8_lossy(&output.stderr).into_owned(), + reported: None, // authoritative verdict arrives via MCP + }) + } + Err(_elapsed) => { + // The wait future is dropped here; `kill_on_drop` reaps the child. + Ok(AgentOutcome { + exited_cleanly: false, + exit_code: None, + stderr: format!("agent timed out after {:?}", self.timeout), + reported: None, + }) + } + } + } +} diff --git a/src/checks/executor/fake.rs b/src/checks/executor/fake.rs new file mode 100644 index 0000000..4f4b658 --- /dev/null +++ b/src/checks/executor/fake.rs @@ -0,0 +1,72 @@ +//! A test-only [`CheckExecutor`] double: returns scripted inline verdicts per +//! check id without spawning any process or touching the MCP server, so the +//! execution → reconciliation → reporting pipeline can be driven +//! deterministically. (Tests & docs, MULTI-1354) + +use std::collections::{HashMap, HashSet}; +use std::sync::Mutex; + +use async_trait::async_trait; +use miette::Result; + +use super::{AgentOutcome, AgentRunRequest, CheckExecutor}; +use crate::checks::mcp::CheckReport; +use crate::checks::model::CheckId; + +#[derive(Default)] +pub struct FakeExecutor { + scripted: HashMap<CheckId, CheckReport>, + /// Check ids that should simulate an agent crashing without reporting. + silent: HashSet<CheckId>, + seen: Mutex<Vec<CheckId>>, +} + +impl FakeExecutor { + pub fn new() -> Self { + Self::default() + } + + /// Script a verdict for `id`. 
+ pub fn with_report(mut self, id: CheckId, success: bool, evidence: Option<&str>) -> Self { + self.scripted.insert( + id, + CheckReport { + success, + evidence: evidence.map(str::to_string), + }, + ); + self + } + + /// Make `id` simulate an agent that crashes/exits without reporting. + pub fn with_silent(mut self, id: CheckId) -> Self { + self.silent.insert(id); + self + } + + /// The check ids the fake was asked to run, in call order. + pub fn seen(&self) -> Vec<CheckId> { + self.seen.lock().unwrap().clone() + } +} + +#[async_trait] +impl CheckExecutor for FakeExecutor { + async fn run_check(&self, req: AgentRunRequest) -> Result<AgentOutcome> { + self.seen.lock().unwrap().push(req.check_id); + if self.silent.contains(&req.check_id) { + return Ok(AgentOutcome { + exited_cleanly: false, + exit_code: Some(1), + stderr: "fake: agent crashed".into(), + reported: None, + }); + } + Ok(AgentOutcome { + exited_cleanly: true, + exit_code: Some(0), + stderr: String::new(), + reported: self.scripted.get(&req.check_id).cloned(), + }) + } +} diff --git a/src/checks/executor/mod.rs b/src/checks/executor/mod.rs new file mode 100644 index 0000000..fe97700 --- /dev/null +++ b/src/checks/executor/mod.rs @@ -0,0 +1,107 @@ +//! The agent-executor seam (M2). [`CheckExecutor`] abstracts "run one check's +//! agent" so the concrete `claude -p` executor can later be swapped for a Claude +//! Code SDK (or other provider) without touching the execution phase. Per the +//! spec it is a boxed trait object for dynamic dispatch, mirroring the repo's +//! `BoxedIngress` / `BoxedMonitor` / `BoxedPlatform` convention. + +pub mod claude; +#[cfg(test)] +mod fake; + +use std::path::PathBuf; + +use async_trait::async_trait; +use miette::Result; + +use crate::checks::mcp::{CheckReport, REPORT_TOOL}; +use crate::checks::model::{Check, CheckId}; + +#[cfg(test)] +pub use fake::FakeExecutor; + +/// Everything an executor needs to run one check's agent. 
/// Everything an executor needs to run one check's agent.
///
/// The request is passed to [`CheckExecutor::run_check`] by value, so the
/// executor owns the paths and text for the duration of the run.
pub struct AgentRunRequest {
    /// The check this request runs, for routing/labelling.
    pub check_id: CheckId,
    /// The assembled instructions + check prompt (see [`assemble_instructions`]).
    pub instructions: String,
    /// The sandbox directory to run the agent in (its working directory).
    pub working_dir: PathBuf,
    /// Path to the per-check `--mcp-config` JSON file (points at this check's
    /// dedicated MCP endpoint).
    pub mcp_config_path: PathBuf,
}

/// Process-level signal from running an agent.
///
/// **Note:** the authoritative verdict is the MCP-reported result, not this.
/// [`AgentOutcome::reported`] is an *optional* inline verdict for executors that
/// capture the tool call directly (a future in-process SDK, or the test fake);
/// the shell-out `claude -p` executor always leaves it `None`.
///
/// The `Default` value is "did not exit cleanly, no exit code, empty stderr, no
/// verdict".
#[derive(Debug, Clone, Default)]
pub struct AgentOutcome {
    /// Whether the agent process exited with a success status.
    pub exited_cleanly: bool,
    /// The process exit code, if one was produced.
    pub exit_code: Option<i32>,
    /// Captured stderr, for surfacing execution errors (distinct from a check
    /// merely *failing*).
    pub stderr: String,
    /// An inline verdict obtained by the executor itself, if any.
    pub reported: Option<CheckReport>,
}

/// The abstraction over running a single check's agent.
///
/// Implementations must be `Send + Sync` so one executor can be shared by
/// checks running concurrently.
#[async_trait]
pub trait CheckExecutor: Send + Sync {
    /// Run a single check's agent against its dedicated MCP endpoint.
    ///
    /// Returns the process-level outcome; an `Err` means the run itself could
    /// not be carried out, which is distinct from the check failing.
    async fn run_check(&self, req: AgentRunRequest) -> Result<AgentOutcome>;
}

/// A boxed [`CheckExecutor`] for dynamic dispatch (DI seam).
pub type BoxedExecutor = Box<dyn CheckExecutor + Send + Sync>;
(MULTI-1350) +pub fn assemble_instructions(check: &Check) -> String { + format!( + "You are validating a single requirement for the MultiTool Checks tool.\n\ +Your current working directory is a sandboxed, throwaway copy of the user's repository; \ +you may read it and run commands against it freely.\n\ +\n\ +Carry out the check described below. When — and only when — you have reached a conclusion, \ +you MUST call the `{REPORT_TOOL}` tool EXACTLY ONCE:\n\ + - set `success` to true if the check passes, or false if it fails;\n\ + - optionally set `evidence` to a short explanation of how you concluded.\n\ +Report your result ONLY through `{REPORT_TOOL}` — not via stdout, not via a file — and do not \ +call it more than once. After calling it, stop. If you finish without calling `{REPORT_TOOL}`, \ +the check is treated as a FAILURE.\n\ +\n\ +--- CHECK: {title} ---\n\ +{prompt}\n", + title = check.title, + prompt = check.prompt, + ) +} + +#[cfg(test)] +mod tests { + use super::*; + use static_assertions::assert_obj_safe; + + assert_obj_safe!(CheckExecutor); + + #[test] + fn instructions_embed_prompt_and_demand_single_report() { + let check = Check { + title: "No yellow".into(), + prompt: "scan for yellow text".into(), + }; + let text = assemble_instructions(&check); + assert!(text.contains("scan for yellow text")); + assert!(text.contains(REPORT_TOOL)); + assert!(text.contains("EXACTLY ONCE")); + assert!(text.contains("No yellow")); + } +} diff --git a/src/checks/mcp/mod.rs b/src/checks/mcp/mod.rs new file mode 100644 index 0000000..0182bae --- /dev/null +++ b/src/checks/mcp/mod.rs @@ -0,0 +1,302 @@ +//! The in-process MCP result-reporting server — the trustworthy guardrail. +//! +//! Because agents are nondeterministic, we do **not** trust stdout or sentinel +//! files. Every agent reports its verdict by calling a single MCP tool, +//! [`REPORT_TOOL`] (`report-check-result`), served by **one** in-process `rmcp` +//! 
server bound to a localhost port and run on a dedicated tokio task within +//! this process (never a subprocess). +//! +//! The single server hosts **N endpoints — one per check** (`/checks/{id}`), so +//! each agent has a unique URL to write its singleton result to. Each check has +//! a [`tokio::sync::Notify`] and a slot in a shared map; when an agent reports, +//! the handler records the verdict and notifies, so execution can wake the +//! instant a check reports (and kill that agent — its job is done). + +use std::collections::HashMap; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex}; + +use miette::{IntoDiagnostic, Result}; +use rmcp::handler::server::wrapper::Parameters; +use rmcp::model::{CallToolResult, Content}; +use rmcp::transport::streamable_http_server::session::local::LocalSessionManager; +use rmcp::transport::streamable_http_server::{StreamableHttpServerConfig, StreamableHttpService}; +use rmcp::{ErrorData, ServerHandler, tool, tool_handler, tool_router}; +use schemars::JsonSchema; +use serde::Deserialize; +use tokio::sync::Notify; +use tokio::task::JoinHandle; +use tokio_util::sync::CancellationToken; + +use crate::checks::model::CheckId; + +/// The exact MCP tool name agents call to report a verdict. Referenced verbatim +/// by the agent instructions (M5) and the `--mcp-config` payload. +pub const REPORT_TOOL: &str = "report-check-result"; + +/// The MCP server name advertised in the `--mcp-config` payload. +pub const SERVER_NAME: &str = "multitool-checks"; + +/// Shared store of reported verdicts, keyed by check id. +pub type ReportStore = Arc<Mutex<HashMap<CheckId, CheckReport>>>; + +/// The arguments of a `report-check-result` tool call (the wire contract). +#[derive(Debug, Clone, Deserialize, JsonSchema)] +pub struct ReportCheckResult { + /// The check's verdict — `true` means the requirement is satisfied. + pub success: bool, + /// Optional explanation of how the agent reached its conclusion. 
+ #[serde(default)] + pub evidence: Option<String>, +} + +/// A verdict recorded by the server for one check. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CheckReport { + pub success: bool, + pub evidence: Option<String>, +} + +/// The per-check MCP handler. One is constructed per session by the service +/// factory; all sessions for a given check share the same `reported` flag, +/// result store, and notifier, so single-call semantics hold across reconnects +/// and the report is observable to execution. +#[derive(Clone)] +struct ReportServer { + check_id: CheckId, + reported: Arc<AtomicBool>, + reports: ReportStore, + notify: Arc<Notify>, +} + +#[tool_router] +impl ReportServer { + #[tool( + name = "report-check-result", + description = "Report whether this check passed. Call exactly once: set success=true if the check passes or false if it fails, with optional evidence explaining your reasoning." + )] + async fn report_check_result( + &self, + params: Parameters<ReportCheckResult>, + ) -> Result<CallToolResult, ErrorData> { + let Parameters(input) = params; + tracing::debug!( + check_id = self.check_id, + success = input.success, + "report-check-result received" + ); + let recorded = self.record(CheckReport { + success: input.success, + evidence: input.evidence, + }); + let msg = if recorded { + "result recorded" + } else { + "result already recorded for this check; ignoring duplicate" + }; + Ok(CallToolResult::success(vec![Content::text(msg)])) + } +} + +impl ReportServer { + /// Record the report with single-call semantics. Returns `true` if this was + /// the first (and only honored) call for the check, `false` for a duplicate. 
+ fn record(&self, report: CheckReport) -> bool { + if self.reported.swap(true, Ordering::SeqCst) { + tracing::warn!( + check_id = self.check_id, + "duplicate report-check-result call ignored" + ); + return false; + } + self.reports.lock().unwrap().insert(self.check_id, report); + // `notify_one` stores a permit if no one is waiting yet, so a waiter that + // arrives after the report still wakes immediately (no lost wakeups). + self.notify.notify_one(); + true + } +} + +#[tool_handler] +impl ServerHandler for ReportServer {} + +/// A handle to the running result server: the bound port, the per-check result +/// store + notifiers, and the means to shut the server task down. +pub struct ResultServer { + base_url: String, + cancel: CancellationToken, + join: JoinHandle<()>, + reports: ReportStore, + notifiers: HashMap<CheckId, Arc<Notify>>, +} + +/// The URL path hosting the endpoint for `check_id`. +fn endpoint_path(check_id: CheckId) -> String { + format!("/checks/{check_id}") +} + +impl ResultServer { + /// Stand up the single server with one endpoint per check id, bound to an + /// OS-assigned localhost port, on a dedicated tokio task. + pub async fn start(check_ids: &[CheckId]) -> Result<Self> { + let cancel = CancellationToken::new(); + let session_manager = Arc::new(LocalSessionManager::default()); + let reports: ReportStore = Arc::new(Mutex::new(HashMap::new())); + let mut notifiers: HashMap<CheckId, Arc<Notify>> = HashMap::new(); + + let mut router = axum::Router::new(); + for &id in check_ids { + let notify = Arc::new(Notify::new()); + notifiers.insert(id, notify.clone()); + let reported = Arc::new(AtomicBool::new(false)); + let reports_for_check = reports.clone(); + // `StreamableHttpServerConfig` is `#[non_exhaustive]`, so build it + // from `default()` and override the fields we care about. 
We run in + // *stateful* Streamable HTTP mode: the Claude Code MCP client expects + // the standard session flow (initialize → `Mcp-Session-Id` → + // subsequent requests), and a stateless server stalls its multi-step + // handshake. + let mut config = StreamableHttpServerConfig::default(); + config.stateful_mode = true; + config.cancellation_token = cancel.clone(); + let factory = move || { + Ok::<_, std::io::Error>(ReportServer { + check_id: id, + reported: reported.clone(), + reports: reports_for_check.clone(), + notify: notify.clone(), + }) + }; + let service = StreamableHttpService::new(factory, session_manager.clone(), config); + router = router.nest_service(&endpoint_path(id), service); + } + + let listener = tokio::net::TcpListener::bind(("127.0.0.1", 0)) + .await + .into_diagnostic()?; + let port = listener.local_addr().into_diagnostic()?.port(); + + let shutdown = cancel.clone(); + let join = tokio::spawn(async move { + let server = axum::serve(listener, router) + .with_graceful_shutdown(async move { shutdown.cancelled().await }); + if let Err(e) = server.await { + tracing::error!("MCP result server error: {e}"); + } + }); + + Ok(Self { + base_url: format!("http://127.0.0.1:{port}"), + cancel, + join, + reports, + notifiers, + }) + } + + /// The full endpoint URL an agent should connect to for `check_id`. + pub fn endpoint_url(&self, check_id: CheckId) -> String { + format!("{}{}", self.base_url, endpoint_path(check_id)) + } + + /// The notifier + result store for `check_id`, so a caller can await the + /// check's report and read it once it arrives. + pub fn report_handle(&self, check_id: CheckId) -> (Arc<Notify>, ReportStore) { + (self.notifiers[&check_id].clone(), self.reports.clone()) + } + + /// The verdict recorded for `check_id`, if any. + pub fn report_for(&self, check_id: CheckId) -> Option<CheckReport> { + self.reports.lock().unwrap().get(&check_id).cloned() + } + + /// Signal the server task to stop and wait for it to wind down. 
+ pub async fn shutdown(self) { + self.cancel.cancel(); + let _ = self.join.await; + } +} + +/// Build the `--mcp-config` JSON payload pointing an agent at `endpoint_url`, +/// declaring the `report-check-result` server under [`SERVER_NAME`]. (M4 #1348) +pub fn mcp_config_json(endpoint_url: &str) -> String { + let server = serde_json::json!({ "type": "http", "url": endpoint_url }); + let mut servers = serde_json::Map::new(); + servers.insert(SERVER_NAME.to_string(), server); + let mut root = serde_json::Map::new(); + root.insert("mcpServers".to_string(), serde_json::Value::Object(servers)); + serde_json::Value::Object(root).to_string() +} + +#[cfg(test)] +mod tests { + use super::*; + + fn report_server(check_id: CheckId) -> (ReportServer, ReportStore, Arc<Notify>) { + let reports: ReportStore = Arc::new(Mutex::new(HashMap::new())); + let notify = Arc::new(Notify::new()); + let server = ReportServer { + check_id, + reported: Arc::new(AtomicBool::new(false)), + reports: reports.clone(), + notify: notify.clone(), + }; + (server, reports, notify) + } + + #[test] + fn record_enforces_single_call_and_delivers() { + let (server, reports, _notify) = report_server(7); + + assert!(server.record(CheckReport { + success: true, + evidence: Some("ok".into()) + })); + // Duplicate is ignored and does not overwrite. + assert!(!server.record(CheckReport { + success: false, + evidence: None + })); + + let stored = reports.lock().unwrap().get(&7).cloned().expect("recorded"); + assert!(stored.success); + assert_eq!(stored.evidence.as_deref(), Some("ok")); + assert_eq!(reports.lock().unwrap().len(), 1); + } + + #[tokio::test] + async fn record_wakes_a_waiter() { + let (server, _reports, notify) = report_server(0); + // A report that lands before the wait still wakes it (notify_one permit). + server.record(CheckReport { + success: true, + evidence: None, + }); + // Should return promptly rather than hang. 
#[test]
fn mcp_config_targets_the_endpoint_and_tool_server() {
    // Parse the generated payload back and inspect the single server entry it
    // declares.
    let endpoint = "http://127.0.0.1:5050/checks/3";
    let parsed: serde_json::Value = serde_json::from_str(&mcp_config_json(endpoint)).unwrap();
    let entry = &parsed["mcpServers"]["multitool-checks"];

    assert_eq!(entry["url"], endpoint);
    assert_eq!(entry["type"], "http");
}
+ +pub mod config; +mod discovery; +mod execution; +pub mod executor; +pub mod mcp; +pub mod model; +mod reporting; +pub mod sandbox; + +#[cfg(test)] +mod e2e; + +use std::path::Path; + +use miette::Result; + +use crate::Terminal; + +pub use config::configuration; + +/// Run the full `multi check` pipeline rooted at `working_dir`. +/// +/// Returns the process exit code: `0` if every requirement is satisfied (an +/// empty tree counts as success), `1` if any requirement is unsatisfied. +/// Operational errors (e.g. an invalid `CHECKS.md`) surface as `Err` diagnostics +/// rather than an exit code, so CI can tell "checks failed" from "tool errored". +pub async fn run(terminal: &Terminal, working_dir: &Path) -> Result<i32> { + // Phase 1: configuration (hardcoded for the MVP, injected forward). + let cfg = configuration(); + + // Phase 2: discovery. + let requirements = discovery::discover(working_dir).await?; + + // Phase 3: execution — the executor is built from config and injected. + let executor = cfg.build_executor(); + let outcomes = execution::execution_phase(&cfg, executor, working_dir, &requirements).await?; + + // Phase 4: reporting + exit code. + reporting::report(terminal, &outcomes) +} diff --git a/src/checks/model.rs b/src/checks/model.rs new file mode 100644 index 0000000..26043d7 --- /dev/null +++ b/src/checks/model.rs @@ -0,0 +1,157 @@ +//! The in-memory domain model produced by discovery and consumed by execution. +//! +//! The two primary objects are the [`Requirement`] (a statement of fact about +//! the repository that must hold) and the [`Check`] (one of the steps used to +//! decide whether a requirement is satisfied). A requirement is satisfied iff +//! *all* of its checks pass (logical AND). +//! +//! Titles are **not** unique across the set — requirements and checks are +//! grouped by the file that declares them, never keyed on their titles. + +use std::path::PathBuf; + +/// A stable identifier for a check within a single `multi check` run. 
/// How a single check turned out once execution and reconciliation are done.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Verdict {
    /// The agent reported `success: true`.
    Satisfied,
    /// The agent reported `success: false`.
    Failed,
    /// No report arrived from the agent (crash/timeout), or the executor itself
    /// errored. Non-satisfying, yet kept apart from a clean `Failed` so
    /// reporting can tell the two cases apart.
    Errored,
}

impl Verdict {
    /// Whether this verdict satisfies the check.
    ///
    /// [`Verdict::Satisfied`] is the only satisfying verdict; [`Verdict::Failed`]
    /// and [`Verdict::Errored`] both count as unsatisfied.
    pub fn is_satisfied(self) -> bool {
        // Spelled out exhaustively so adding a variant forces a decision here.
        match self {
            Verdict::Satisfied => true,
            Verdict::Failed | Verdict::Errored => false,
        }
    }
}
+ pub fn failing_checks(&self) -> impl Iterator<Item = &CheckOutcome> { + self.check_outcomes.iter().filter(|c| !c.is_satisfied()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn requirement_satisfied_only_when_all_checks_pass() { + let pass = CheckOutcome { + title: "a".into(), + verdict: Verdict::Satisfied, + evidence: None, + }; + let fail = CheckOutcome { + title: "b".into(), + verdict: Verdict::Failed, + evidence: Some("nope".into()), + }; + + let all_pass = RequirementOutcome::aggregate( + "r".into(), + "CHECKS.md".into(), + vec![pass.clone(), pass.clone()], + ); + assert!(all_pass.satisfied); + + let one_fail = + RequirementOutcome::aggregate("r".into(), "CHECKS.md".into(), vec![pass, fail]); + assert!(!one_fail.satisfied); + assert_eq!(one_fail.failing_checks().count(), 1); + } + + #[test] + fn errored_check_does_not_satisfy() { + assert!(!Verdict::Errored.is_satisfied()); + assert!(!Verdict::Failed.is_satisfied()); + assert!(Verdict::Satisfied.is_satisfied()); + } +} diff --git a/src/checks/reporting.rs b/src/checks/reporting.rs new file mode 100644 index 0000000..d2c4fdf --- /dev/null +++ b/src/checks/reporting.rs @@ -0,0 +1,98 @@ +//! The reporting phase + exit code (M6). +//! +//! Renders each requirement's verdict and returns the process exit code: +//! +//! * requirement title in **green** (satisfied) / **red** (not); +//! * failing checks printed in **red** with their evidence; +//! * passing checks omitted; +//! * exit `0` iff every requirement is satisfied (empty suite included), else `1`. +//! +//! Output is routed through the existing [`Terminal`] and honors the global +//! `--enable-colors` setting; with color disabled it degrades to plain text. + +use miette::Result; + +use crate::Terminal; +use crate::checks::model::{CheckOutcome, RequirementOutcome}; + +/// Render `outcomes` and return the exit code (0 = all satisfied, 1 = any not). 
+pub fn report(terminal: &Terminal, outcomes: &[RequirementOutcome]) -> Result<i32> { + if outcomes.is_empty() { + terminal.write_stdout_line("No requirements found.")?; + return Ok(0); + } + + let color = terminal.stdout_allows_color(); + let mut all_satisfied = true; + + for outcome in outcomes { + terminal.write_stdout_line(&format_requirement(outcome, color))?; + if !outcome.satisfied { + all_satisfied = false; + for check in outcome.failing_checks() { + terminal.write_stdout_line(&format_failing_check(check, color))?; + } + } + } + + Ok(if all_satisfied { 0 } else { 1 }) +} + +fn format_requirement(outcome: &RequirementOutcome, color: bool) -> String { + if color { + let styled = console::style(outcome.title.clone()).bold(); + let styled = if outcome.satisfied { + styled.green() + } else { + styled.red() + }; + styled.to_string() + } else { + let mark = if outcome.satisfied { "PASS" } else { "FAIL" }; + format!("[{mark}] {}", outcome.title) + } +} + +fn format_failing_check(check: &CheckOutcome, color: bool) -> String { + let evidence = check.evidence.as_deref().unwrap_or("no evidence provided"); + let body = format!(" ✗ {}: {}", check.title, evidence); + if color { + console::style(body).red().to_string() + } else { + body + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::checks::model::{RequirementOutcome, Verdict}; + use std::path::PathBuf; + + fn outcome(title: &str, satisfied: bool, checks: Vec<CheckOutcome>) -> RequirementOutcome { + RequirementOutcome { + title: title.into(), + filepath: PathBuf::from("CHECKS.md"), + satisfied, + check_outcomes: checks, + } + } + + #[test] + fn exit_code_reflects_aggregate_satisfaction() { + // Plain-text formatting is exercised here (color = false). 
+ let failing = CheckOutcome { + title: "c".into(), + verdict: Verdict::Failed, + evidence: Some("bad".into()), + }; + let line = format_failing_check(&failing, false); + assert!(line.contains("bad")); + assert!(line.contains('c')); + + let pass = outcome("ok", true, vec![]); + assert_eq!(format_requirement(&pass, false), "[PASS] ok"); + let fail = outcome("nope", false, vec![failing]); + assert_eq!(format_requirement(&fail, false), "[FAIL] nope"); + } +} diff --git a/src/checks/sandbox/fallback.rs b/src/checks/sandbox/fallback.rs new file mode 100644 index 0000000..0cb4292 --- /dev/null +++ b/src/checks/sandbox/fallback.rs @@ -0,0 +1,24 @@ +//! Non-macOS sandbox stub. +//! +//! The MVP only implements copy-on-write sandboxing on macOS (APFS). On other +//! platforms this stub keeps the crate building but fails fast with a clear +//! diagnostic. Linux/Windows CoW support is tracked under *Future work*. + +use std::path::Path; + +use async_trait::async_trait; +use miette::{Result, miette}; + +use super::{Sandbox, SandboxHandle}; + +/// A sandbox that is not available on this platform. +pub struct UnsupportedSandbox; + +#[async_trait] +impl Sandbox for UnsupportedSandbox { + async fn create(&self, _source: &Path) -> Result<SandboxHandle> { + Err(miette!( + "copy-on-write sandboxing is only implemented on macOS in this MVP; this platform is not yet supported" + )) + } +} diff --git a/src/checks/sandbox/macos.rs b/src/checks/sandbox/macos.rs new file mode 100644 index 0000000..b61b025 --- /dev/null +++ b/src/checks/sandbox/macos.rs @@ -0,0 +1,68 @@ +//! macOS APFS copy-on-write sandbox via `clonefile(2)`. +//! +//! `clonefile` produces a near-instant, space-efficient (metadata-only) clone of +//! a whole tree on APFS. The clone lives under a fresh temp directory and is +//! removed when the [`SandboxHandle`] is dropped. 
+ +use std::ffi::CString; +use std::os::unix::ffi::OsStrExt; +use std::path::Path; + +use async_trait::async_trait; +use miette::{IntoDiagnostic, Result, miette}; + +use super::{Sandbox, SandboxHandle}; + +/// APFS `clonefile`-backed sandbox. +pub struct ApfsSandbox; + +impl ApfsSandbox { + pub fn new() -> Self { + Self + } +} + +#[async_trait] +impl Sandbox for ApfsSandbox { + async fn create(&self, source: &Path) -> Result<SandboxHandle> { + let source = source.to_path_buf(); + // `clonefile` is a blocking syscall — run it off the async runtime. + tokio::task::spawn_blocking(move || clone_tree(&source)) + .await + .into_diagnostic()? + } +} + +fn clone_tree(source: &Path) -> Result<SandboxHandle> { + let temp = tempfile::Builder::new() + .prefix("multi-check-") + .tempdir() + .into_diagnostic()?; + // `clonefile` requires the destination NOT to exist and its parent to exist. + let dest = temp.path().join("sandbox"); + + let src_c = cstring(source)?; + let dst_c = cstring(&dest)?; + + // SAFETY: both pointers are valid NUL-terminated C strings that outlive the + // call; `clonefile` does not retain them. + let ret = unsafe { libc::clonefile(src_c.as_ptr(), dst_c.as_ptr(), 0) }; + if ret != 0 { + let err = std::io::Error::last_os_error(); + return Err(miette!( + "clonefile({}, {}) failed: {err}. APFS copy-on-write requires the source and destination to be on the same APFS volume.", + source.display(), + dest.display(), + )); + } + + Ok(SandboxHandle { + root: dest, + _temp: Some(temp), + }) +} + +fn cstring(path: &Path) -> Result<CString> { + CString::new(path.as_os_str().as_bytes()) + .map_err(|e| miette!("path {} contains an interior NUL byte: {e}", path.display())) +} diff --git a/src/checks/sandbox/mod.rs b/src/checks/sandbox/mod.rs new file mode 100644 index 0000000..83fe3d6 --- /dev/null +++ b/src/checks/sandbox/mod.rs @@ -0,0 +1,121 @@ +//! Per-check copy-on-write filesystem sandboxing (M3). +//! +//! 
Each check runs inside a CoW clone of the working tree so an agent can read +//! and modify files freely without corrupting the real working directory. The +//! abstraction is a boxed trait object (mirroring `BoxedIngress` etc.); the +//! concrete implementation is selected per platform via `cfg`. macOS ships an +//! APFS `clonefile` implementation; other targets get a stub that errors, so the +//! crate still builds everywhere. + +use std::path::{Path, PathBuf}; + +use async_trait::async_trait; +use miette::Result; + +#[cfg(target_os = "macos")] +mod macos; + +#[cfg(not(target_os = "macos"))] +mod fallback; + +/// A copy-on-write sandbox factory. +#[async_trait] +pub trait Sandbox: Send + Sync { + /// Create a CoW clone of `source` and return a handle whose path is the + /// sandbox root. + async fn create(&self, source: &Path) -> Result<SandboxHandle>; +} + +/// A boxed [`Sandbox`] for dynamic dispatch (the OS-injection seam). +pub type BoxedSandbox = Box<dyn Sandbox + Send + Sync>; + +/// A live sandbox. Its [`SandboxHandle::path`] is an independent clone root; the +/// clone is removed when the handle is dropped (RAII teardown). +pub struct SandboxHandle { + root: PathBuf, + /// Owns the temp directory containing the clone; dropping it removes the + /// clone. `None` only for hand-constructed handles in tests. + _temp: Option<tempfile::TempDir>, +} + +impl SandboxHandle { + /// The sandbox root directory (use as the agent's working directory). + pub fn path(&self) -> &Path { + &self.root + } +} + +/// Select the platform sandbox implementation. +/// +/// macOS → the real APFS CoW sandbox. Other platforms → an unsupported stub +/// that fails with a clear diagnostic (CoW support for Linux/Windows is tracked +/// under *Future work*). 
/// Test-only sandbox that performs no cloning at all: the handle it returns
/// points straight at the source path (test agents are fakes that never touch
/// the filesystem).
#[cfg(test)]
pub struct NoopSandbox;

#[cfg(test)]
#[async_trait]
impl Sandbox for NoopSandbox {
    async fn create(&self, source: &Path) -> Result<SandboxHandle> {
        // No temp directory is involved, so the handle owns nothing to clean up.
        let root = source.to_path_buf();
        Ok(SandboxHandle { root, _temp: None })
    }
}
+ let clone_path = handle.path().to_path_buf(); + drop(handle); + assert!(!clone_path.exists()); + } +} diff --git a/src/cmd/check.rs b/src/cmd/check.rs new file mode 100644 index 0000000..0b347ed --- /dev/null +++ b/src/cmd/check.rs @@ -0,0 +1,33 @@ +use miette::{IntoDiagnostic, Result}; +use tokio::runtime::Runtime; + +use crate::Terminal; +use crate::config::CheckSubcommand; + +/// The `multi check` command handler. Mirrors `Run`/`Login`/`Init`: a +/// synchronous `dispatch()` that builds a `tokio` runtime and `block_on`s the +/// async pipeline. The runtime is established here because the MCP result server +/// (M4) must run on a task *within this process* (never a subprocess). +pub struct Check { + terminal: Terminal, + args: CheckSubcommand, +} + +impl Check { + pub fn new(terminal: Terminal, args: CheckSubcommand) -> Result<Self> { + Ok(Self { terminal, args }) + } + + pub fn dispatch(self) -> Result<()> { + let rt = Runtime::new().into_diagnostic()?; + let _guard = rt.enter(); + // The pipeline returns a process exit code distinct from operational + // errors: check *failures* exit 1 cleanly; invalid input surfaces as a + // `miette` diagnostic (non-zero) instead. + let code = rt.block_on(crate::checks::run(&self.terminal, self.args.directory()))?; + if code != 0 { + std::process::exit(code); + } + Ok(()) + } +} diff --git a/src/cmd/mod.rs b/src/cmd/mod.rs index 031d3e8..9599661 100644 --- a/src/cmd/mod.rs +++ b/src/cmd/mod.rs @@ -1,3 +1,4 @@ +pub use check::Check; pub use init::Init; pub use login::Login; pub use logout::Logout; @@ -7,6 +8,7 @@ pub use version::Version; #[cfg(feature = "proxy")] pub use proxy::Proxy; +mod check; mod init; mod login; mod logout; diff --git a/src/config/check/mod.rs b/src/config/check/mod.rs new file mode 100644 index 0000000..4e94e59 --- /dev/null +++ b/src/config/check/mod.rs @@ -0,0 +1,21 @@ +use std::path::{Path, PathBuf}; + +use clap::Args; + +/// `multi check`: validate the requirements declared in `CHECKS.md` files. 
+/// +/// Model/provider flags are intentionally absent — configuration is hardcoded +/// for the MVP (see M2). Only the working directory is configurable. +#[derive(Args, Clone)] +pub struct CheckSubcommand { + /// The directory to recursively scan for `CHECKS.md` files. + #[arg(default_value = ".")] + directory: PathBuf, +} + +impl CheckSubcommand { + /// The directory to scan (defaults to the current directory). + pub fn directory(&self) -> &Path { + &self.directory + } +} diff --git a/src/config/command.rs b/src/config/command.rs index ed52d4b..baadd0f 100644 --- a/src/config/command.rs +++ b/src/config/command.rs @@ -3,10 +3,10 @@ use miette::Result; #[cfg(feature = "proxy")] use crate::cmd::Proxy; -use crate::cmd::{Init, Login, Logout, Run, Version}; +use crate::cmd::{Check, Init, Login, Logout, Run, Version}; use crate::terminal::Terminal; -use super::{InitSubcommand, LoginSubcommand, RunSubcommand}; +use super::{CheckSubcommand, InitSubcommand, LoginSubcommand, RunSubcommand}; #[cfg(feature = "proxy")] use super::ProxySubcommand; @@ -26,6 +26,8 @@ pub enum MultiCommand { /// Run will execute `multi` in "runner mode", where it will /// immediately deploy the provided artifact and start canarying. Run(RunSubcommand), + /// Validate the requirements declared in `CHECKS.md` files using AI-agent checks. 
+ Check(CheckSubcommand), /// Print the CLI version and exit Version, } @@ -40,6 +42,7 @@ impl MultiCommand { #[cfg(feature = "proxy")] Self::Proxy(flags) => Proxy::new(console, flags).dispatch(), Self::Run(flags) => Run::new(console, flags)?.dispatch(), + Self::Check(flags) => Check::new(console, flags)?.dispatch(), Self::Version => Version::new(console).dispatch(), } } diff --git a/src/config/mod.rs b/src/config/mod.rs index 4aca83e..1816b49 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -1,8 +1,12 @@ +pub use check::CheckSubcommand; pub use cli::Cli; pub use init::InitSubcommand; pub use login::LoginSubcommand; +#[cfg(feature = "proxy")] +pub use proxy::ProxySubcommand; pub use run::RunSubcommand; +mod check; mod cli; mod colors; mod command; diff --git a/src/lib.rs b/src/lib.rs index 98547da..d90d346 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,12 @@ +// The pre-existing (largely abandoned) legacy modules carry dead code and a few +// non-idiomatic API shapes that the Rust 1.96 toolchain refresh surfaced as +// warnings. Allow those categories crate-wide rather than churn abandoned code; +// the `checks` feature added in this PR is clippy-clean on its own. +#![allow(dead_code)] +#![allow(private_interfaces)] +#![allow(clippy::wrong_self_convention)] +#![allow(clippy::large_enum_variant)] + pub use config::Cli; pub use fs::manifest; pub use terminal::Terminal; @@ -15,6 +24,9 @@ mod adapters; /// Currently, we expect all artifacts to be zipped /// lambda functions. pub mod artifacts; +/// The `multi check` feature: discover `CHECKS.md` requirements and validate +/// them with AI-agent checks, reporting through an in-process MCP server. +mod checks; /// Contains the dispatch logic for running individual CLI subcommands. /// The CLI's main function calls into these entrypoints for each subcommand. 
#[test]
fn coin_maps_to_its_category() {
    #[derive(PartialEq, Eq, Debug, Hash)]
    enum Coin {
        Heads,
        Tails,
    }

    impl Categorical<2> for Coin {
        fn category(&self) -> usize {
            match self {
                Self::Heads => 0,
                Self::Tails => 1,
            }
        }
    }

    // Each face must land in its own fixed category index.
    for (coin, expected) in [(Coin::Heads, 0), (Coin::Tails, 1)] {
        assert_eq!(coin.category(), expected);
    }
}
+ pub fn write_stdout_line(&self, line: &str) -> Result<()> { + self.stdout.term().write_line(line).into_diagnostic() + } + + /// Whether stdout may emit color, honoring the global `--enable-colors` flag. + pub fn stdout_allows_color(&self) -> bool { + self.stdout.allow_color() + } + pub fn print_version(&self, version: &'static str) -> Result<()> { let msg = format!("v{version}"); self.stdout