diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 2250a50..359dfc0 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -392,6 +392,17 @@ "url": "https://github.com/trailofbits" }, "source": "./plugins/dimensional-analysis" + }, + { + "name": "adversarial-verification", + "version": "1.0.0", + "description": "Verify claims, designs, and bug findings by dispatching isolated advocate/skeptic sub-agents and synthesizing their arguments into a structured verdict. Counters sycophancy and single-agent agreement bias.", + "author": { + "name": "Julius Alexandre", + "email": "opensource@trailofbits.com", + "url": "https://github.com/trailofbits" + }, + "source": "./plugins/adversarial-verification" } ] } diff --git a/.codex/skills/adversarial-verification b/.codex/skills/adversarial-verification new file mode 120000 index 0000000..eef6da4 --- /dev/null +++ b/.codex/skills/adversarial-verification @@ -0,0 +1 @@ +../../plugins/adversarial-verification/skills/adversarial-verification \ No newline at end of file diff --git a/CODEOWNERS b/CODEOWNERS index 9e2ecad..ce00c59 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -2,6 +2,7 @@ * @dguido # Plugin-specific owners (alphabetical) +/plugins/adversarial-verification/ @wizardengineer @dguido /plugins/agentic-actions-auditor/ @BuffaloWill @elopez @dguido /plugins/ask-questions-if-underspecified/ @kevin-valerio @dguido /plugins/audit-context-building/ @omarinuwa @dguido diff --git a/README.md b/README.md index 1752f5b..9426805 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,7 @@ cd /path/to/parent # e.g., if repo is at ~/projects/skills, be in ~/projects | Plugin | Description | |--------|-------------| +| [adversarial-verification](plugins/adversarial-verification/) | Stress-test claims, designs, and bug findings with isolated advocate/skeptic sub-agents | | [constant-time-analysis](plugins/constant-time-analysis/) | Detect compiler-induced timing side-channels in 
cryptographic code | | [mutation-testing](plugins/mutation-testing/) | Configure mewt/muton mutation testing campaigns — scope targets, tune timeouts, optimize long runs | | [property-based-testing](plugins/property-based-testing/) | Property-based testing guidance for multiple languages and smart contracts | diff --git a/plugins/adversarial-verification/.claude-plugin/plugin.json b/plugins/adversarial-verification/.claude-plugin/plugin.json new file mode 100644 index 0000000..097fc81 --- /dev/null +++ b/plugins/adversarial-verification/.claude-plugin/plugin.json @@ -0,0 +1,10 @@ +{ + "name": "adversarial-verification", + "version": "1.0.0", + "description": "Verify claims, designs, and bug findings by dispatching isolated advocate/skeptic sub-agents and synthesizing their arguments into a structured verdict. Counters sycophancy and single-agent agreement bias.", + "author": { + "name": "Julius Alexandre", + "email": "opensource@trailofbits.com", + "url": "https://github.com/trailofbits" + } +} diff --git a/plugins/adversarial-verification/README.md b/plugins/adversarial-verification/README.md new file mode 100644 index 0000000..cff044c --- /dev/null +++ b/plugins/adversarial-verification/README.md @@ -0,0 +1,29 @@ +# adversarial-verification + +Stress-test claims, designs, and bug findings by dispatching two isolated sub-agents — one advocate, one skeptic — and synthesizing their arguments into a structured verdict. + +## When to Use + +- Choosing between competing technical approaches +- Verifying a bug finding is real (not a false positive) +- Reviewing a design decision before commit +- Any claim you're inclined to agree with by default +- Stress-testing your own reasoning when you suspect it may be one-sided + +## What It Does + +Counters sycophancy and single-agent agreement bias by forcing maximal disagreement before committing. Each sub-agent runs in isolated context — the advocate never sees the skeptic's arguments and vice versa. 
After both return, the caller synthesizes a verdict table that picks winners per dimension and produces a concrete recommendation. + +### Two modes + +| Mode | Claim type | Structure | +|------|-----------|-----------| +| **Decision mode** | "X is the best approach" | Free-form arguments organized by evaluation dimensions | +| **Proof mode** | "X is a real bug/finding" | N null hypotheses — skeptic proves, advocate refutes | + +## Installation + +``` +/plugin marketplace add trailofbits/skills +/plugin install adversarial-verification +``` diff --git a/plugins/adversarial-verification/skills/adversarial-verification/SKILL.md b/plugins/adversarial-verification/skills/adversarial-verification/SKILL.md new file mode 100644 index 0000000..86e984b --- /dev/null +++ b/plugins/adversarial-verification/skills/adversarial-verification/SKILL.md @@ -0,0 +1,133 @@ +--- +name: adversarial-verification +description: Verify a claim, idea, approach, design, or finding by dispatching two isolated sub-agents — an advocate (argues the claim is correct/best) and a skeptic (argues it is wrong/inferior) — then synthesize their arguments into a structured verdict. Counters sycophancy and agreement bias by forcing maximal disagreement before the caller commits. Use when making technical decisions ("should we use X or Y?"), verifying bug findings ("is this a real bug?"), reviewing system designs ("is this architecture sound?"), evaluating strategic claims, or whenever the caller suspects their own reasoning may be one-sided. Triggers on phrases like "verify this claim", "adversarial verification", "is this right?", "prove this is a real bug", "which approach is best", "stress-test this idea", "get a second opinion on", "argue against this", "devil's advocate", or whenever pattern-breaking from agreement is needed. 
+--- + +# Adversarial Verification + +## Overview + +Dispatch two sub-agents with isolated context — one **advocate**, one **skeptic** — to argue opposite sides of a claim as strongly as possible. Then synthesize their arguments into a structured verdict table. This breaks the pattern of single-agent reasoning converging toward agreement and surfaces the strongest objections and the strongest supports in one pass. + +**Core principle:** Independent isolated context is non-negotiable. An agent that has read the other side's arguments will soften to accommodate them. The adversarial value comes from each agent arguing without knowledge of the counter-argument. + +**Announce at start:** "I'm using the adversarial-verification skill to stress-test this claim." + +## When to Use + +- Choosing between 2+ technical approaches +- Verifying a bug finding is real (not a false positive) +- Reviewing a design decision before commit +- User asks "is this correct?" about a non-trivial claim +- Any claim you're inclined to agree with by default — that's the tell + +## When NOT to Use + +- Simple factual lookup ("what version is X?") +- Obvious syntax error fix +- User has already made the decision and is executing + +## The Process + +### Step 1: State the claim precisely + +Before dispatching agents, state the claim in a single sentence. Ambiguous claims produce worthless verifications. + +**Bad:** "Should we use yarpgen?" +**Good:** "YARPGen program-level differential testing is the best strategy for finding semantic translation bugs in Rosetta 2, better than grammar-aware x86 mutation or a Cascade-style oracle." + +The claim must be **falsifiable** — something the skeptic could in principle prove wrong. + +### Step 2: Select the mode + +Two modes, chosen by the claim type: + +| Claim type | Mode | Details | +|-----------|------|---------| +| Bug finding / security claim | **Proof mode** | Structured N-proof hypotheses (e.g., P1-P5). 
See [references/proof-mode.md](references/proof-mode.md) | +| Approach / design decision | **Decision mode** | Free-form arguments with evidence. See [references/decision-mode.md](references/decision-mode.md) | + +If unsure, default to decision mode. + +### Step 3: Dispatch both agents in parallel + +Use the Agent tool with TWO tool calls in a SINGLE message (parallel dispatch). Each agent is a fresh context with no knowledge of the other. + +Load prompt templates from [references/prompt-templates.md](references/prompt-templates.md). The templates enforce: +- Each agent argues ONE side maximally, not balanced +- Each agent is told explicitly "do not be balanced" and "argue as hard as possible" +- Each agent cites specific evidence (files, line numbers, facts) +- Each agent anticipates and pre-refutes the obvious counter-arguments + +Give each agent the **same claim**, the **same background context**, but **opposite instructions**. Never mention the other agent's existence or arguments in either prompt. + +### Step 4: Synthesize with a verdict table + +After both agents return, produce a verdict table. For each significant point raised by either side: + +| Point | Advocate position | Skeptic position | Verdict | +|-------|-------------------|------------------|---------| + +Verdict values: +- **Survives** — one side's position held up; the other failed to counter it +- **Weakens** — partially rebutted; position should be qualified +- **Falls** — cleanly refuted by the other side + +Then write a one-paragraph **recommendation**: which overall position won, which specific claims survived, and what the caller should actually do. + +See [references/synthesis.md](references/synthesis.md) for the full synthesis template. + +### Step 5: Report to the caller + +Present three things: +1. The claim (one sentence, as stated in Step 1) +2. The verdict table +3. The recommendation (what action follows from the verdict) + +Do NOT dump the raw agent outputs unless the user asks. 
The verdict is the product. + +## Reference Guide + +- Mode selection quick reference: [references/decision-mode.md](references/decision-mode.md) and [references/proof-mode.md](references/proof-mode.md) +- Prompt templates for advocate and skeptic dispatch: [references/prompt-templates.md](references/prompt-templates.md) +- Verdict table and recommendation shape: [references/synthesis.md](references/synthesis.md) +- Failure modes and recovery patterns: [references/anti-patterns.md](references/anti-patterns.md) + +## Anti-patterns + +See [references/anti-patterns.md](references/anti-patterns.md) for full failure modes. The three most important: + +1. **False symmetry** — treating both sides as equally valid when one is clearly stronger. The verdict must pick a winner, not split the difference. +2. **Hedged agents** — agents that softened their argument. If an agent returns a balanced view, re-dispatch with a stronger prompt. Real adversarial value requires real adversarial arguments. +3. **Shared context leakage** — mentioning the other agent's arguments in either prompt. This collapses independence. Each prompt must be written as if that agent is the only one you've asked. + +## Examples + +### Example A — approach decision (decision mode) + +Claim: *"Using YARPGen to generate C programs is the fastest path to finding semantic translation bugs in Rosetta 2."* + +Dispatch: +- Advocate prompt: "Make the strongest case FOR this claim. Cite known bugs YARPGen would catch, expected exec/s, why compiler-emitted code is the right attack surface. Do not be balanced." +- Skeptic prompt: "Make the strongest case AGAINST. YARPGen has no FP support, known Rosetta bugs are in FP/SIMD/implicit registers, oracle problem without Intel hardware. Do not be balanced." + +Result: verdict table shows skeptic's "no FP support" and "oracle problem" survive; advocate's "fastest to set up" survives. Recommendation: use YARPGen as complement, not primary strategy. 
+ +### Example B — bug verification (proof mode) + +Claim: *"FINDING-001 (pcmpestrm register allocator abort) is a real translation bug, not a false positive."* + +Dispatch with 5 proofs, each tests one null hypothesis: +- P1: "This is just normal input rejection (exit -302)." +- P2: "This is a harness artifact (doesn't reproduce in clean env)." +- P3: "This is a benign assertion (SIGABRT in validation code)." +- P4: "The input is unreachable in practice (no compiler emits it)." +- P5: "Already fixed in a newer macOS." + +Skeptic tries to prove each null; advocate tries to refute each. If all 5 fail to prove = CONFIRMED bug. + +## Integration + +**Called by:** +- User directly via explicit request +- Any skill that needs to verify a claim before acting on it — e.g., when brainstorming an approach choice, when evaluating a code review suggestion that seems technically questionable, or when verifying a proposed root cause before applying a fix diff --git a/plugins/adversarial-verification/skills/adversarial-verification/references/anti-patterns.md b/plugins/adversarial-verification/skills/adversarial-verification/references/anti-patterns.md new file mode 100644 index 0000000..7b6e93e --- /dev/null +++ b/plugins/adversarial-verification/skills/adversarial-verification/references/anti-patterns.md @@ -0,0 +1,81 @@ +# Anti-patterns + +Common failure modes that destroy the adversarial value of this skill. Each has a diagnosis and a fix. + +## 1. False symmetry + +**Symptom:** The verdict says "both sides have merit" and splits the difference. No clear winner. Vague recommendation like "consider both approaches." + +**Why it happens:** Reluctance to pick a side. Treating the adversarial structure as performative rather than decisional. + +**Fix:** Pick a winner. The dimensions in the verdict table each have one winner, not a tie. 
If truly 3/3 split across dimensions, the winner is whichever dimensions matter most for the caller's actual decision — name those dimensions explicitly in the recommendation. + +## 2. Hedged agents + +**Symptom:** Agent returns things like "I see merits on both sides," "this is a nuanced question," "both approaches have tradeoffs." No strong adversarial position. + +**Why it happens:** Weak prompt. The agent defaulted to balanced reasoning because nothing stopped it. + +**Fix:** Re-dispatch with a stronger prompt. The key phrase: "Do not acknowledge merit in the opposing position. Do not hedge." Do not accept a hedged response as valid output. If you need the full templates, return to [SKILL.md](../SKILL.md) Step 3. + +## 3. Shared context leakage + +**Symptom:** Advocate mentions the skeptic's arguments (or vice versa), softening the tone to accommodate. Arguments collapse toward agreement. + +**Why it happens:** Prompt mentioned the other agent's existence or previewed their arguments. + +**Fix:** Each prompt must be written as if that agent is the ONLY agent you've asked. Do not mention the other side. Do not say "another agent will argue X." Do not give hints about counter-arguments. The synthesis happens separately after both return. + +## 4. Unfalsifiable claim + +**Symptom:** The skeptic can't argue against the claim because it's too vague or too broad. + +**Why it happens:** Claim wasn't stated precisely in Step 1. + +**Fix:** Return to Step 1. Rewrite the claim as a specific, falsifiable sentence. "YARPGen is good" is unfalsifiable. "YARPGen will catch more Rosetta 2 semantic bugs than grammar-aware mutation in 7 days of fuzzing" is falsifiable. + +## 5. Missing evidence + +**Symptom:** Agents make claims but don't cite specific files, line numbers, CVEs, benchmarks. Arguments are plausible but unverifiable. + +**Why it happens:** Prompt didn't require citations. 
+ +**Fix:** Every prompt includes: "Cite specific files, line numbers, facts, CVEs, benchmarks where possible." Reject outputs that make unsupported claims on critical dimensions. + +## 6. Wrong mode + +**Symptom:** Trying to prove a bug is real with "what do you think is best" prompts, or trying to pick between approaches with proof-style null hypotheses. + +**Fix:** Reread [SKILL.md](../SKILL.md) Step 2. Decision mode = approach/design choice. Proof mode = bug/finding/security claim verification. Pick the right one. + +## 7. Synthesis dump + +**Symptom:** Presenting both agents' full outputs as the result. No verdict table. No recommendation. + +**Why it happens:** Skipping the synthesis step. + +**Fix:** The verdict table is the product, not the raw arguments. Always produce the table + recommendation. Only dump raw agent outputs if the user explicitly asks for them. + +## 8. Confirmation bias in prompt + +**Symptom:** Advocate wins trivially because the prompt was stacked in its favor. The skeptic has nothing to work with. + +**Why it happens:** Caller's preferred answer leaks into the prompt framing. + +**Fix:** Both prompts should frame the claim neutrally. "Make the case FOR/AGAINST {CLAIM}" not "Make the case FOR the obviously correct {CLAIM}". Both agents get the SAME background. If the advocate gets more context than the skeptic, the test is rigged. + +## 9. Too many dimensions + +**Symptom:** Verdict table has 15 rows. Each row is shallow. Recommendation is unclear because so many points survived/fell. + +**Fix:** Pick 3-5 dimensions. The dimensions should be the ones that actually determine the decision. Cut dimensions that don't move the verdict either way. + +## 10. Ignoring UNCERTAIN + +**Symptom:** Proof-mode verdict marks every null as REFUTED/PROVED when some were actually UNCERTAIN. Finding is reported as CONFIRMED when one null is still plausible. + +**Fix:** UNCERTAIN is a valid outcome. 
If P4 ("input unreachable") has evidence on both sides, the finding is NOT confirmed. Either: +- Gather more evidence on that specific P before concluding, OR +- Report the finding WITH the caveat that P4 is uncertain + +Do not round UNCERTAIN up to CONFIRMED. diff --git a/plugins/adversarial-verification/skills/adversarial-verification/references/decision-mode.md b/plugins/adversarial-verification/skills/adversarial-verification/references/decision-mode.md new file mode 100644 index 0000000..1d71e96 --- /dev/null +++ b/plugins/adversarial-verification/skills/adversarial-verification/references/decision-mode.md @@ -0,0 +1,112 @@ +# Decision Mode + +Use for approach selection, architecture choice, tool selection, strategy debates — any case where the claim is "X is the best way to do Y." + +## When to pick decision mode + +- "Should we use X or Y?" +- "Is this the right architecture?" +- "Is strategy A better than strategy B?" +- Any comparison between 2+ alternatives +- Any design decision before implementation + +If the claim is "this bug is real" or "this vulnerability exists," return to [SKILL.md](../SKILL.md) Step 2 and pick Proof mode instead. + +## Required inputs before dispatching + +1. **The claim** — one sentence, falsifiable, names the preferred option +2. **The alternatives** — explicit list of competing options (2-4 is ideal) +3. **Evaluation dimensions** — 3-5 axes the agents should address +4. **Evidence available** — known facts, prior findings, constraints both agents can cite + +## Structure of decision-mode arguments + +Each agent should produce arguments organized by dimension: + +**Advocate output shape:** +``` +## Why {CLAIM} is the best approach + +### Dimension 1: {e.g., "Bug-finding effectiveness"} +- Specific claim with evidence +- Specific claim with evidence +- Pre-refutation of obvious counter + +### Dimension 2: {e.g., "Implementation effort"} +- ... + +### Dimension 3: {e.g., "Speed / throughput"} +- ... 
+ +## Summary: why the alternatives fail +- Alternative A fails at ... because ... +- Alternative B fails at ... because ... +``` + +**Skeptic output shape:** +``` +## Why {CLAIM} is wrong + +### Structural flaw +- The core reason the claim's logic doesn't hold +- Evidence + +### Dimension 1: {e.g., "Oracle problem"} +- Specific objection with evidence +- Pre-refutation of obvious counter + +### Dimension 2: {e.g., "False positive rate"} +- ... + +## Summary: why one of the alternatives is actually best +- Alternative X wins at ... because ... +``` + +## Common dimensions to include + +Pick 3-5 that fit your claim: + +| Dimension | What to argue | +|-----------|---------------| +| **Effectiveness** | Does it actually find/solve the target problem? | +| **Effort / cost** | Time, lines of code, dependencies added | +| **Speed / throughput** | Execution time, iterations per second, latency | +| **False positive / false negative rate** | Signal-to-noise ratio | +| **Maintenance burden** | Who owns it, how brittle | +| **Precedent** | Has it worked before, in what domains, at what scale | +| **Oracle / correctness** | How do you know the answer is right | +| **Composability** | Does it combine well with existing systems | +| **Reversibility** | If wrong, how hard to back out | +| **Risk** | Worst-case outcome | + +## Synthesis for decision mode + +The verdict table should have one row per dimension. For each dimension, pick which side made the stronger argument and explain why. + +Example verdict row: +| Dimension | Advocate says | Skeptic says | Verdict | +|-----------|---------------|--------------|---------| +| Oracle correctness | Native ARM64 compile is a valid reference | Clang can miscompile; need Intel HW to disambiguate | **Skeptic wins** — advocate didn't address the compiler-bug-vs-translator-bug ambiguity. 
| + +The **recommendation** paragraph should: +- State which overall approach wins +- List the 2-3 skeptic points that matter for the winner (qualifications) +- State what the caller should actually do next + +## Example: full decision-mode pass + +Claim: "Track 2 (persistent harness + comparator fix) is the best next step for the Rosetta 2 differential fuzzer." + +Advocate dimensions: speed gain (10x), unblocks semantic bug detection, low effort (days not weeks) + +Skeptic dimensions: still no ARM64 execution, still only finds crashes, Track 3 has higher ceiling + +Verdict: +| Dimension | Verdict | +|-----------|---------| +| Short-term speed | Advocate wins — 10x is 10x | +| Semantic bug detection | Skeptic wins — Track 2 doesn't add ARM64 execution | +| Long-term ceiling | Skeptic wins — Track 3 is required eventually | +| Effort-to-value ratio | Advocate wins — Track 2 is days, Track 3 is weeks | + +Recommendation: Do Track 2 now because speed gain is real and blocking. Plan Track 3 as the next major work item. diff --git a/plugins/adversarial-verification/skills/adversarial-verification/references/prompt-templates.md b/plugins/adversarial-verification/skills/adversarial-verification/references/prompt-templates.md new file mode 100644 index 0000000..0830730 --- /dev/null +++ b/plugins/adversarial-verification/skills/adversarial-verification/references/prompt-templates.md @@ -0,0 +1,153 @@ +# Prompt Templates + +Prompts are the most important part of this skill. A weak prompt produces a hedged agent, and a hedged agent produces a worthless verdict. These templates enforce maximal adversarial stance. 
+ +## Table of contents + +- [Template: Advocate (decision mode)](#template-advocate-decision-mode) +- [Template: Skeptic (decision mode)](#template-skeptic-decision-mode) +- [Template: Advocate (proof mode)](#template-advocate-proof-mode) +- [Template: Skeptic (proof mode)](#template-skeptic-proof-mode) +- [Required elements](#required-elements) +- [How to fill in context](#how-to-fill-in-context) + +## Required elements + +Every prompt MUST include all of these. Missing any produces a weak agent. + +1. **Role statement** — "You are the ADVOCATE/SKEPTIC for X" +2. **Explicit side assignment** — "argue FOR / argue AGAINST" +3. **Background context** — claim, alternatives, prior findings, file paths the agent can read +4. **Anti-balance directive** — "Do NOT be balanced. Argue as hard as possible for your side" +5. **Evidence requirement** — "Cite specific files, line numbers, facts, CVEs, benchmarks" +6. **Counter-refutation requirement** — "Anticipate and pre-refute the obvious counter-arguments" +7. **Output shape** — what the agent should produce (arguments with headers, specific claims) +8. **Research-only marker** (if applicable) — "RESEARCH ONLY — do not write code or edit files" + +## Template: Advocate (decision mode) + +``` +RESEARCH ONLY — do not write any code or edit files. + +You are the ADVOCATE for {CLAIM}. + +Context: {BACKGROUND — 2-5 sentences of project state, what's been tried, why this matters} + +The competing alternatives are: {LIST_ALTERNATIVES} + +Make the STRONGEST possible case FOR {CLAIM}. Specifically address: +1. {KEY_POINT_1 — e.g., "Why it will find bugs the alternatives miss"} +2. {KEY_POINT_2 — e.g., "Why it's the fastest path to results"} +3. {KEY_POINT_3 — e.g., "Counter the strongest expected objection to the claim"} +4. {KEY_POINT_4 — e.g., "Concrete expected metrics"} +5. {KEY_POINT_5 — e.g., "Cite specific prior bugs this would catch"} + +Relevant files to read: {FILE_PATHS} + +Be specific. Cite papers, CVEs, file:line references where possible. 
+ +DO NOT BE BALANCED. Argue as hard as possible FOR the claim. Synthesis of opposing views happens separately after you return; your job is to produce the strongest possible pro-claim argument, not a balanced view. +``` + +## Template: Skeptic (decision mode) + +``` +RESEARCH ONLY — do not write any code or edit files. + +You are the SKEPTIC arguing AGAINST {CLAIM}. You should argue for one of the alternatives instead. + +Context: {BACKGROUND — same as advocate prompt} + +The competing alternatives are: {LIST_ALTERNATIVES} + +Make the STRONGEST possible case AGAINST {CLAIM} and FOR one of the alternatives. Specifically address: +1. {KEY_POINT_1 — e.g., "Why the claim's reasoning has a structural flaw"} +2. {KEY_POINT_2 — e.g., "The oracle/measurement problem"} +3. {KEY_POINT_3 — e.g., "Why the alternatives are fundamentally better"} +4. {KEY_POINT_4 — e.g., "Specific scenarios where the claim fails"} +5. {KEY_POINT_5 — e.g., "Expected false positive/false negative rate"} + +Relevant files to read: {FILE_PATHS} + +Be specific. Cite papers, CVEs, file:line references where possible. + +DO NOT BE BALANCED. Argue as hard as possible AGAINST the claim. Synthesis of opposing views happens separately after you return; your job is to produce the strongest possible anti-claim argument, not a balanced view. +``` + +## Template: Advocate (proof mode) + +``` +RESEARCH ONLY — do not write any code or edit files. + +You are the ADVOCATE proving {FINDING/BUG/CLAIM} is real. + +Context: {BACKGROUND — what the finding is, how it was discovered, initial evidence} + +Your job: REFUTE each of the N null hypotheses below. Each null hypothesis, if true, would dismiss the finding. Show each one is false. 
+ +Null hypotheses to refute: +- P1: {e.g., "This is just normal input rejection"} +- P2: {e.g., "This is a harness artifact"} +- P3: {e.g., "This is a benign assertion"} +- P4: {e.g., "The input is unreachable in practice"} +- P5: {e.g., "Already fixed"} + +For each P, provide: +- Evidence against the null (files, logs, reproducers) +- Concrete reproduction in a clean environment if applicable +- Verdict: REFUTED (with evidence) vs CANNOT REFUTE (with reason) + +Relevant files to read: {FILE_PATHS} + +DO NOT BE BALANCED. Your job is to defend the finding. Counter-arguments are weighed separately after you return. +``` + +## Template: Skeptic (proof mode) + +``` +RESEARCH ONLY — do not write any code or edit files. + +You are the SKEPTIC trying to prove {FINDING/BUG/CLAIM} is NOT a real finding. + +Context: {BACKGROUND — same as advocate} + +Your job: PROVE each of the N null hypotheses below. If any is true, the finding should be dismissed. + +Null hypotheses to prove: +- P1: {e.g., "This is just normal input rejection (exit -302)"} +- P2: {e.g., "This is a harness artifact (doesn't reproduce in clean env)"} +- P3: {e.g., "This is a benign assertion in validation code"} +- P4: {e.g., "The input is unreachable by any real attacker"} +- P5: {e.g., "Already fixed in a newer version"} + +For each P, provide: +- Evidence FOR the null hypothesis +- What would confirm it (repro steps, logs) +- Verdict: PROVED (with evidence) vs CANNOT PROVE (with reason) + +Relevant files to read: {FILE_PATHS} + +DO NOT BE BALANCED. Your job is to dismiss the finding if at all possible. Counter-arguments are weighed separately after you return. +``` + +## How to fill in context + +**Keep context tight (≤200 words).** Each agent should have enough to reason, not so much that the adversarial assignment gets diluted. 
Include: + +- **Claim/finding** — one sentence, verbatim +- **What's already known** — 2-3 sentences; prior findings, constraints +- **What the agent should read** — 3-5 specific file paths with line numbers if relevant +- **Known-buggy scenarios** (for claim verification) — a concrete list the agent can cross-reference + +Do NOT include: +- The other agent's arguments (even hints) +- Your own inclination ("I think this is probably...") +- The expected answer + +## Re-dispatching on hedged agents + +If an agent returns something like "I see merits on both sides" or "this is a nuanced question" or "both approaches have tradeoffs" — that is a failure. The prompt was too weak. + +Re-dispatch with stronger phrasing: + +> "Your previous response was too balanced. You are the ADVOCATE/SKEPTIC. Do not acknowledge merit in the opposing position. Do not hedge. Argue ONE side as hard as possible — the synthesis step happens separately. Return ONLY the strongest pro-{SIDE} argument. If you cannot make a strong case for your side, say so explicitly, but do not substitute a balanced view." diff --git a/plugins/adversarial-verification/skills/adversarial-verification/references/proof-mode.md b/plugins/adversarial-verification/skills/adversarial-verification/references/proof-mode.md new file mode 100644 index 0000000..0bf2b84 --- /dev/null +++ b/plugins/adversarial-verification/skills/adversarial-verification/references/proof-mode.md @@ -0,0 +1,74 @@ +# Proof Mode + +Use for bug findings, security claims, and any assertion of the form "X is real" or "X exists" or "X is a vulnerability." + +## When to pick proof mode + +- "This crash is a real bug, not a false positive" +- "This is exploitable" +- "This vulnerability affects version N" +- "This finding is valid" +- "This behavior is a bug, not by design" + +If the claim is "X is the best approach," return to [SKILL.md](../SKILL.md) Step 2 and pick Decision mode instead. 
+ +## The structure: N null hypotheses + +Proof mode works by enumerating the **null hypotheses** — every way the finding could be wrong. The skeptic tries to prove each null; the advocate tries to refute each. If all null hypotheses fail to be proved, the finding is CONFIRMED. + +Default set of 5 null hypotheses for security findings: + +| # | Null hypothesis | What proves it | +|---|----------------|----------------| +| P1 | This is normal error handling, not a crash | Exit code matches spec; stderr shows intentional rejection | +| P2 | This is a harness artifact | Doesn't reproduce in a clean environment (different shell, fresh binary) | +| P3 | This is a benign assertion | SIGABRT in validation code with no exploitability path | +| P4 | The input is unreachable by a real attacker | Requires privileged access or an artificial construction | +| P5 | Already fixed in a newer version | Crash doesn't reproduce on current release | + +Adapt the set to the claim. For non-security claims you might use different P values — e.g., for a performance regression claim: P1 = measurement noise, P2 = cold cache, P3 = unrelated background load, etc. + +## How advocate and skeptic interact in proof mode + +Both agents get the SAME list of null hypotheses. 
They argue the SAME Ps from opposite sides: + +**Advocate (defends the finding):** +- For each P, produces evidence REFUTING the null +- Reproduces in clean env (refutes P2) +- Shows crash is in register allocator, not validation (refutes P3) +- Shows a real compiler can emit the triggering input (refutes P4) + +**Skeptic (attacks the finding):** +- For each P, produces evidence FOR the null +- Cites similar crashes that were dismissed as config issues (attacks P2) +- Cites Apple's stance on similar bugs (attacks P3) +- Shows the input requires a non-standard construction (attacks P4) + +## Verdict rule for proof mode + +| Condition | Verdict | +|-----------|---------| +| All Ps REFUTED by advocate, NOT PROVED by skeptic | **CONFIRMED** — the finding is real | +| Any P clearly PROVED by skeptic | **DISMISSED** — the finding is a false positive | +| Any P in dispute (both sides plausible) | **UNCERTAIN** — gather more evidence for that specific P before committing | + +## Structured output + +Verdict table columns differ from decision mode: + +| Null hypothesis | Skeptic says | Advocate says | Outcome | +|----------------|--------------|---------------|---------| +| P1: normal rejection | Exit codes match rejection pattern | Exit is SIGABRT -6, not rejection (-302) | REFUTED | +| P2: harness artifact | Setup script may interfere | Reproduces with clean `env -i` and direct binary call | REFUTED | +| P3: benign assertion | Assertion is in `check_bounds` | Assertion is in `AllocTempGPRByIndex`, not bounds check | REFUTED | +| P4: unreachable input | Input requires crafted OOB memory ref | Real compilers emit RIP-relative addressing to any offset | REFUTED | +| P5: already fixed | Untested on newer versions | Tested on current macOS, reproduces | REFUTED | + +**Final verdict:** CONFIRMED — all 5 null hypotheses refuted with reproducible evidence. + +## Common mistakes in proof mode + +1. **Vague nulls** — "this might not be real" is not a falsifiable null. 
Nulls must be specific and testable. +2. **Shifting the burden** — advocate must provide REFUTATIONS, not just "this is clearly real." Each P requires specific evidence. +3. **Missing Ps** — if the skeptic raises a null not in your list, add it. The Ps are the structure of the proof, not a fixed ritual. +4. **Treating UNCERTAIN as CONFIRMED** — if P4 is in dispute, the finding is NOT confirmed. Either close out P4 with more evidence or report the finding with that specific caveat. diff --git a/plugins/adversarial-verification/skills/adversarial-verification/references/synthesis.md b/plugins/adversarial-verification/skills/adversarial-verification/references/synthesis.md new file mode 100644 index 0000000..ed85df2 --- /dev/null +++ b/plugins/adversarial-verification/skills/adversarial-verification/references/synthesis.md @@ -0,0 +1,87 @@ +# Synthesis Template + +After both agents return, the caller produces the synthesis. This is where the adversarial value is extracted. Raw dumps of both sides are not the product — the verdict table is the product. + +## The verdict table + +### For decision mode + +| Dimension | Advocate position | Skeptic position | Verdict | +|-----------|-------------------|------------------|---------| +| {dim 1} | {advocate claim} | {skeptic claim} | **Survives / Weakens / Falls** | +| {dim 2} | ... | ... | ... | + +### For proof mode + +| Null hypothesis | Skeptic proof attempt | Advocate refutation | Outcome | +|----------------|----------------------|--------------------|---------| +| P1: {null} | {skeptic evidence} | {advocate evidence} | **PROVED / REFUTED / UNCERTAIN** | +| P2: ... | ... | ... | ... | + +## Verdict values + +- **Survives** (or **REFUTED** in proof mode) — the side whose point held up. The counter-argument did not land. 
+- **Weakens** — partial rebuttal; position needs qualification
+- **Falls** (or **PROVED** in proof mode) — the side whose point was cleanly defeated
+- **UNCERTAIN** (proof mode only) — both sides made plausible cases; more evidence needed
+
+## The recommendation paragraph
+
+After the table, write ONE paragraph (3-5 sentences) that:
+
+1. States the overall winner
+2. Lists the 2-3 opposing points that survived (these become qualifications, caveats, or follow-ups)
+3. Specifies what the caller should actually do next
+
+**Template:**
+
+> {Winner position} wins on {main dimensions}. However, {opposing side}'s points about {surviving objection 1} and {surviving objection 2} are valid and must be addressed. Recommended action: {specific next step}.
+
+## Worked examples
+
+### Example 1 — decision mode (YARPGen claim)
+
+Claim: "YARPGen is the best primary strategy for finding Rosetta 2 semantic bugs." (Per-dimension verdicts use the Survives / Weakens / Falls vocabulary: Survives = the claim held on that dimension, Falls = the skeptic defeated it.)
+
+| Dimension | Advocate | Skeptic | Verdict |
+|-----------|----------|---------|---------|
+| End-to-end execution | Tests entire pipeline including translation | Current fuzzer can't execute — true gap | Survives |
+| FP / SIMD coverage | Will exercise via auto-vectorization | YARPGen has no FP support; auto-vec rarely triggers | **Falls** |
+| Oracle quality | Native ARM64 compile = ground truth | Clang can miscompile; can't disambiguate without Intel HW | **Falls** |
+| Known-bug coverage | Would catch Bug 2.2, 2.3 | 0 of 6 known Rosetta bug classes reachable | **Falls** |
+| Setup effort | 1 day, 50-line script | True | Survives |
+| FP rate estimate | ~1 per 10k programs genuine | 80-95% will be compiler bugs | **Falls** |
+
+Recommendation: **Skeptic wins.** YARPGen is a weak PRIMARY strategy because it cannot reach the instruction classes where Rosetta bugs actually live (FP, SIMD, implicit registers) and has a fatal oracle problem at -O2. However, the advocate's point about low setup effort is real. 
Use YARPGen as a CHEAP COMPLEMENT running in the background, not as the primary direction. Primary direction should be Track 2 + Track 3 (fix current fuzzer, add ARM64 execution harness). + +### Example 2 — proof mode (FINDING-001) + +Claim: "FINDING-001 (pcmpestrm register allocator abort) is a real translation bug." + +| Null | Skeptic | Advocate | Outcome | +|------|---------|----------|---------| +| P1: normal rejection | Exit codes in rejection range | Exit is SIGABRT -6, not rejection (-302) | **REFUTED** | +| P2: harness artifact | Test setup may interfere | Reproduces under `env -i` with fresh binary | **REFUTED** | +| P3: benign assertion | All aborts are assertions | Abort is in `AllocTempGPRByIndex` register allocator, not validation | **REFUTED** | +| P4: unreachable input | Requires crafted OOB memory | Real compilers emit RIP-relative addressing; any OOB offset is reachable | **REFUTED** | +| P5: already fixed | Untested on newer | Reproduces on macOS 26.4 (current) | **REFUTED** | + +Recommendation: **CONFIRMED.** All 5 null hypotheses refuted with reproducible evidence. The crash is a real translation-path register allocator exhaustion in `libRosettaAot.dylib`. Severity: Low-Medium (local DoS, controlled SIGABRT, not memory corruption). Report as finding; do not pursue as critical vuln. + +## When the verdict is mixed + +Sometimes the skeptic wins some dimensions and the advocate wins others. That's not a failure — it's the most common outcome. 
The recommendation should: + +- Pick the overall winner by weighing the most important dimensions for the caller's actual decision +- List surviving opposing points as required qualifications or follow-up work +- Not split the difference into "do neither" — pick a direction and note the caveats + +## When to re-run + +Re-dispatch one or both agents if: +- An agent returned a balanced / hedged response +- An agent didn't address a critical dimension +- Both agents agreed on something you think is wrong +- A key piece of evidence wasn't considered + +Re-run with a tighter prompt. Use the prompt template reference linked from [SKILL.md](../SKILL.md) Step 3.