From ca2e872c5840d9c01a7673f163361f7309a69fc4 Mon Sep 17 00:00:00 2001 From: nicolascukas Date: Mon, 8 Jun 2026 10:42:14 +0200 Subject: [PATCH 1/4] fix(cli): ungag agy as acting-Cesar + fix code-block render wrapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two unrelated Cesar/TUI bugs surfaced from a real session: 1. agy could not use tools as acting-Cesar. When the configured Cesar returns empty, agon falls back to acting-Cesar dispatched in 'exec' mode. The adapter injects an agy-specific OUTPUT RULES block (engine.id === 'agy' && mode !== 'agent') that forbids file edits / tool use and forces a single-pass text answer — so agy refused every tool and confabulated a "system harness" excuse. Acting-Cesar (and the brain recovery path) are agentic leading roles, so dispatch them in 'agent' mode when the engine supports it (agy's agent and exec modes are the same agentic CLI), falling back to 'exec' otherwise. Matches the injection's own intent ("agent mode is left agentic on purpose"). 2. CodeBlockView rendered every line wrapped with stray border pipes on blank rows. The width math was inconsistent: the box was sized to the longest line while code rows padded to the terminal width, so each row overflowed the box and Ink wrapped it. Rewrote all rows (border, header, code, overflow) around one coherent inner `body` width so each row is exactly rowWidth; over-long lines truncate instead of wrapping. Compile + build clean; 221/221 cesar/adapter/render tests pass. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/generated/blocks/rendering.entry.tsx | 4 +- .../cli/src/generated/blocks/rendering.tsx | 38 ++++++++++++------- packages/cli/src/generated/cesar/brain.ts | 8 +++- .../signals/dispatch/cesar-router.ts | 10 ++++- packages/cli/src/kern/blocks/rendering.kern | 26 +++++++++---- packages/cli/src/kern/cesar/brain.kern | 8 +++- .../kern/signals/dispatch/cesar-router.kern | 8 +++- 7 files changed, 73 insertions(+), 29 deletions(-) diff --git a/packages/cli/src/generated/blocks/rendering.entry.tsx b/packages/cli/src/generated/blocks/rendering.entry.tsx index 9f5d5889..087f99e4 100644 --- a/packages/cli/src/generated/blocks/rendering.entry.tsx +++ b/packages/cli/src/generated/blocks/rendering.entry.tsx @@ -1,7 +1,7 @@ #!/usr/bin/env node -// @generated by kern v3.5.7 — DO NOT EDIT. Source: src/kern/blocks/rendering.kern +// @generated by kern v3.5.8 — DO NOT EDIT. Source: src/kern/blocks/rendering.kern -// @kern-source: rendering:428 +// @kern-source: rendering:438 import React from 'react'; import { render } from 'ink'; diff --git a/packages/cli/src/generated/blocks/rendering.tsx b/packages/cli/src/generated/blocks/rendering.tsx index 1c176d6f..cf026dd9 100644 --- a/packages/cli/src/generated/blocks/rendering.tsx +++ b/packages/cli/src/generated/blocks/rendering.tsx @@ -71,11 +71,20 @@ export function CodeBlockView({ segment, borderColor }: { segment:ContentSegment const bc = borderColor || '#585858'; const maxLineLen = capped.reduce((m: number, l: string) => Math.max(m, l.length), 0); const headerLen = (segment.language || 'code').length + (segment.index !== undefined ? ` [${segment.index}]`.length : 0); - const innerWidth = Math.max(maxLineLen, headerLen); - const boxWidth = innerWidth + 4; - const rule = '\u2500'.repeat(boxWidth); + // Single coherent inner-content width for every row. Clamp to the terminal + // budget so a long line truncates instead of forcing the box wider than the + // screen. 
Every row is built as: `\u2502 \u258c \u2502` + // left frame `\u2502 \u258c ` = 5 cols, right frame ` \u2502` = 3 cols \u2192 rowWidth = body + 8. + // border row `\u2502 ` + rule + ` \u2502` = rule.length + 4, so rule = body + 4. + // Previously the box was sized to body+2 while rows rendered at body+4..body+11, + // so every row overflowed and Ink wrapped it \u2014 dropping the trailing `\u2502` onto a + // blank row (the stray pipes + huge gaps between lines). + const body = Math.min(Math.max(maxLineLen, headerLen), codeWidth); + const rowWidth = body + 8; + const rule = '\u2500'.repeat(body + 4); + const overflowLabel = `\u2026 ${overflow} more lines`; return ( - + {'\u2502 '}{rule}{' \u2502'} {'\u2502 '} @@ -83,7 +92,7 @@ export function CodeBlockView({ segment, borderColor }: { segment:ContentSegment {segment.language || 'code'} {segment.index !== undefined && {` [${segment.index}]`}} - {' '.repeat(Math.max(0, boxWidth - headerLen - 1))} + {' '.repeat(Math.max(0, body - headerLen))} {' \u2502'} {capped.map((line: string, i: number) => ( @@ -91,8 +100,8 @@ export function CodeBlockView({ segment, borderColor }: { segment:ContentSegment {'\u2502 '} {CODE_RAIL} - {isDiff ? : } - {' '.repeat(Math.max(0, codeWidth - line.length - 4))} + {isDiff ? 
: } + {' '.repeat(Math.max(0, body - line.length))} {' \u2502'} ))} @@ -101,7 +110,8 @@ export function CodeBlockView({ segment, borderColor }: { segment:ContentSegment {'\u2502 '} {CODE_RAIL} - {'\u2026 '}{overflow}{' more lines'} + {overflowLabel} + {' '.repeat(Math.max(0, body - overflowLabel.length))} {' \u2502'} )} @@ -110,7 +120,7 @@ export function CodeBlockView({ segment, borderColor }: { segment:ContentSegment ); } -// @kern-source: rendering:254 +// @kern-source: rendering:264 export function RichSpanView({ span }: { span:InlineSpan }) { if (span.style.code) { return {span.text}; @@ -127,7 +137,7 @@ export function RichSpanView({ span }: { span:InlineSpan }) { return el; } -// @kern-source: rendering:275 +// @kern-source: rendering:285 export function RichLineView({ line, borderColor }: { line:RichLine; borderColor?:string }) { const border = borderColor ? {'\u2502 '} : null; const indent = line.indent > 0 ? ' '.repeat(line.indent) : ''; @@ -149,7 +159,7 @@ export function RichLineView({ line, borderColor }: { line:RichLine; borderColor return {border}{indent}{listIndent}{marker}{line.spans.map((s: InlineSpan, i: number) => )}; } -// @kern-source: rendering:302 +// @kern-source: rendering:312 export function MarkdownTableView({ headers, rows, alignments, borderColor }: { headers:string[]; rows:string[][]; alignments:('left' | 'center' | 'right')[]; borderColor:string }) { const colWidths = headers.map((h: string, i: number) => { let max = h.length; @@ -185,7 +195,7 @@ export function MarkdownTableView({ headers, rows, alignments, borderColor }: { ); } -// @kern-source: rendering:345 +// @kern-source: rendering:355 export function RenderedSegments({ segments, borderColor, wrapWidth }: { segments:ContentSegment[]; borderColor:string; wrapWidth:number }) { return ( <> @@ -246,7 +256,7 @@ export function RenderedSegments({ segments, borderColor, wrapWidth }: { segment ); } -// @kern-source: rendering:412 +// @kern-source: rendering:422 export function 
GradientLine({ text, colors }: { text:string; colors:readonly string[] }) { const step = Math.max(1, Math.ceil(text.length / colors.length)); return ( @@ -259,7 +269,7 @@ export function GradientLine({ text, colors }: { text:string; colors:readonly st ); } -// @kern-source: rendering:428 +// @kern-source: rendering:438 export function AnsiLine({ text, maxWidth, fallbackDim }: { text:string; maxWidth:number; fallbackDim?:boolean }) { if (!hasAnsiCodes(text)) { const display = text.length > maxWidth ? text.slice(0, maxWidth - 4) + '\u2026' : text; diff --git a/packages/cli/src/generated/cesar/brain.ts b/packages/cli/src/generated/cesar/brain.ts index 85de85f6..d105a08e 100644 --- a/packages/cli/src/generated/cesar/brain.ts +++ b/packages/cli/src/generated/cesar/brain.ts @@ -319,8 +319,14 @@ export async function handleCesarBrain(input: string, dispatch: Dispatch, ctx: H const outputDir = join(RUNS_DIR, `cesar-fallback-${Date.now()}`); mkdirSync(outputDir, { recursive: true }); const primedPrompt = buildHistoryPrimedPrompt(ctx.chatSession, input); + // Cesar is an agentic leading role, so dispatch in 'agent' mode when the + // engine supports it. 'exec' triggers agy's OUTPUT-RULES gag (adapter-helpers: + // engine.id === 'agy' && mode !== 'agent') which forbids file edits / tool use + // and forces a single-pass text answer — exactly why agy could not run tools + // as Cesar. Fall back to 'exec' for engines without an agent mode (no regression). + const fallbackMode = ((engine as any)?.agent ? 'agent' : 'exec') as any; const freshResult = await ctx.adapter.dispatch({ - engine, prompt: primedPrompt, cwd: resolveWorkingDir(), mode: 'exec' as any, + engine, prompt: primedPrompt, cwd: resolveWorkingDir(), mode: fallbackMode, timeout: config.timeout ?? 
120, outputDir, signal: abort.signal, systemPrompt: buildCesarSystemPrompt(ctx), }); dispatch({ type: 'spinner-stop' }); diff --git a/packages/cli/src/generated/signals/dispatch/cesar-router.ts b/packages/cli/src/generated/signals/dispatch/cesar-router.ts index 16e6b299..a13ff8ca 100644 --- a/packages/cli/src/generated/signals/dispatch/cesar-router.ts +++ b/packages/cli/src/generated/signals/dispatch/cesar-router.ts @@ -948,11 +948,17 @@ export async function runCesarBrainFallback(input: string, cb: DispatchCallbacks const outDir = join(RUNS_DIR, `acting-cesar-${Date.now()}`); mkdirSync(outDir, { recursive: true }); if (!_silentMode) cb.dispatch({ type: 'info', message: formatCesarRecoveryStatus('acting', actingCesar, `log: ${outDir}`) }); + // Acting-Cesar leads and may need tools, so dispatch in 'agent' mode when the + // substitute supports it. 'exec' triggers agy's OUTPUT-RULES gag (adapter-helpers: + // engine.id === 'agy' && mode !== 'agent') which forbids file edits / tool use and + // forces single-pass text — the reason agy refused to run tools as acting Cesar. + // Fall back to 'exec' for engines without an agent mode (no regression). + const actingMode = ((actingEngine as any)?.agent ? 'agent' : 'exec') as any; const actingResult = await cb.ctx.adapter.dispatch({ engine: actingEngine, prompt: actingPrompt, cwd: resolveWorkingDir(), - mode: 'exec' as any, + mode: actingMode, timeout: (cesarConfig as any).timeout ?? 120, outputDir: outDir, systemPrompt: buildCesarSystemPrompt(cb.ctx), @@ -988,7 +994,7 @@ export async function runCesarBrainFallback(input: string, cb: DispatchCallbacks /** * Unified Cesar brain routing. Returns true if a background job was dispatched. 
*/ -// @kern-source: cesar-router:929 +// @kern-source: cesar-router:935 export async function routeWithCesar(input: string, images: ImageAttachment[], cb: DispatchCallbacks): Promise { cb.setPendingImages(() => []); // Hoisted out of the try so the fallback ladder below can see whether the diff --git a/packages/cli/src/kern/blocks/rendering.kern b/packages/cli/src/kern/blocks/rendering.kern index c5cac083..fdc1d21d 100644 --- a/packages/cli/src/kern/blocks/rendering.kern +++ b/packages/cli/src/kern/blocks/rendering.kern @@ -210,11 +210,20 @@ screen name=CodeBlockView target=ink const bc = borderColor || '#585858'; const maxLineLen = capped.reduce((m: number, l: string) => Math.max(m, l.length), 0); const headerLen = (segment.language || 'code').length + (segment.index !== undefined ? ` [${segment.index}]`.length : 0); - const innerWidth = Math.max(maxLineLen, headerLen); - const boxWidth = innerWidth + 4; - const rule = '\u2500'.repeat(boxWidth); + // Single coherent inner-content width for every row. Clamp to the terminal + // budget so a long line truncates instead of forcing the box wider than the + // screen. Every row is built as: `\u2502 \u258c \u2502` + // left frame `\u2502 \u258c ` = 5 cols, right frame ` \u2502` = 3 cols \u2192 rowWidth = body + 8. + // border row `\u2502 ` + rule + ` \u2502` = rule.length + 4, so rule = body + 4. + // Previously the box was sized to body+2 while rows rendered at body+4..body+11, + // so every row overflowed and Ink wrapped it \u2014 dropping the trailing `\u2502` onto a + // blank row (the stray pipes + huge gaps between lines). 
+ const body = Math.min(Math.max(maxLineLen, headerLen), codeWidth); + const rowWidth = body + 8; + const rule = '\u2500'.repeat(body + 4); + const overflowLabel = `\u2026 ${overflow} more lines`; return ( - + {'\u2502 '}{rule}{' \u2502'} {'\u2502 '} @@ -222,7 +231,7 @@ screen name=CodeBlockView target=ink {segment.language || 'code'} {segment.index !== undefined && {` [${segment.index}]`}} - {' '.repeat(Math.max(0, boxWidth - headerLen - 1))} + {' '.repeat(Math.max(0, body - headerLen))} {' \u2502'} {capped.map((line: string, i: number) => ( @@ -230,8 +239,8 @@ screen name=CodeBlockView target=ink {'\u2502 '} {CODE_RAIL} - {isDiff ? : } - {' '.repeat(Math.max(0, codeWidth - line.length - 4))} + {isDiff ? : } + {' '.repeat(Math.max(0, body - line.length))} {' \u2502'} ))} @@ -240,7 +249,8 @@ screen name=CodeBlockView target=ink {'\u2502 '} {CODE_RAIL} - {'\u2026 '}{overflow}{' more lines'} + {overflowLabel} + {' '.repeat(Math.max(0, body - overflowLabel.length))} {' \u2502'} )} diff --git a/packages/cli/src/kern/cesar/brain.kern b/packages/cli/src/kern/cesar/brain.kern index 145dcc45..339128bf 100644 --- a/packages/cli/src/kern/cesar/brain.kern +++ b/packages/cli/src/kern/cesar/brain.kern @@ -289,8 +289,14 @@ fn name=handleCesarBrain async=true params="input:string, dispatch:Dispatch, ctx const outputDir = join(RUNS_DIR, `cesar-fallback-${Date.now()}`); mkdirSync(outputDir, { recursive: true }); const primedPrompt = buildHistoryPrimedPrompt(ctx.chatSession, input); + // Cesar is an agentic leading role, so dispatch in 'agent' mode when the + // engine supports it. 'exec' triggers agy's OUTPUT-RULES gag (adapter-helpers: + // engine.id === 'agy' && mode !== 'agent') which forbids file edits / tool use + // and forces a single-pass text answer — exactly why agy could not run tools + // as Cesar. Fall back to 'exec' for engines without an agent mode (no regression). + const fallbackMode = ((engine as any)?.agent ? 
'agent' : 'exec') as any; const freshResult = await ctx.adapter.dispatch({ - engine, prompt: primedPrompt, cwd: resolveWorkingDir(), mode: 'exec' as any, + engine, prompt: primedPrompt, cwd: resolveWorkingDir(), mode: fallbackMode, timeout: config.timeout ?? 120, outputDir, signal: abort.signal, systemPrompt: buildCesarSystemPrompt(ctx), }); dispatch({ type: 'spinner-stop' }); diff --git a/packages/cli/src/kern/signals/dispatch/cesar-router.kern b/packages/cli/src/kern/signals/dispatch/cesar-router.kern index ba1e1fe7..714d4d99 100644 --- a/packages/cli/src/kern/signals/dispatch/cesar-router.kern +++ b/packages/cli/src/kern/signals/dispatch/cesar-router.kern @@ -889,11 +889,17 @@ fn name=runCesarBrainFallback async=true params="input:string, cb:DispatchCallba const outDir = join(RUNS_DIR, `acting-cesar-${Date.now()}`); mkdirSync(outDir, { recursive: true }); if (!_silentMode) cb.dispatch({ type: 'info', message: formatCesarRecoveryStatus('acting', actingCesar, `log: ${outDir}`) }); + // Acting-Cesar leads and may need tools, so dispatch in 'agent' mode when the + // substitute supports it. 'exec' triggers agy's OUTPUT-RULES gag (adapter-helpers: + // engine.id === 'agy' && mode !== 'agent') which forbids file edits / tool use and + // forces single-pass text — the reason agy refused to run tools as acting Cesar. + // Fall back to 'exec' for engines without an agent mode (no regression). + const actingMode = ((actingEngine as any)?.agent ? 'agent' : 'exec') as any; const actingResult = await cb.ctx.adapter.dispatch({ engine: actingEngine, prompt: actingPrompt, cwd: resolveWorkingDir(), - mode: 'exec' as any, + mode: actingMode, timeout: (cesarConfig as any).timeout ?? 
120, outputDir: outDir, systemPrompt: buildCesarSystemPrompt(cb.ctx), From 6412119556bc364632c0b41bde4f391482b2134f Mon Sep 17 00:00:00 2001 From: nicolascukas Date: Mon, 8 Jun 2026 11:09:39 +0200 Subject: [PATCH 2/4] fix(review): fail loudly when branch:X targets the current branch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `agon review branch:NAME` runs `git diff NAME...HEAD`, which reviews HEAD's changes relative to NAME as the base — correct when NAME is the base branch (e.g. branch:main). But when NAME resolves to the same commit as HEAD (the common footgun of targeting the branch you're currently on), it's an empty self-diff that surfaced as a silent "No changes to review." A caller — or Cesar — could mistake that for a clean review and confabulate success. Now resolveReviewTarget compares the rev-parsed SHAs of NAME and HEAD and throws an actionable error pointing at the right targets (branch:main for the branch's commits, uncommitted for working-tree changes) before any engine is dispatched. review tests: 58/58 pass. Verified: `agon review branch:NAME`, with NAME set to the currently checked-out branch, now errors loudly pre-dispatch instead of returning empty. Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/cli/src/generated/handlers/review.ts | 49 +++++++++++++------ packages/cli/src/kern/handlers/review.kern | 17 +++++++ 2 files changed, 50 insertions(+), 16 deletions(-) diff --git a/packages/cli/src/generated/handlers/review.ts b/packages/cli/src/generated/handlers/review.ts index 77e17dee..209c4021 100644 --- a/packages/cli/src/generated/handlers/review.ts +++ b/packages/cli/src/generated/handlers/review.ts @@ -82,6 +82,23 @@ export function resolveReviewTarget(target: string|undefined, cwd: string): {dif } else if (t.startsWith('branch:')) { const branch = t.slice(7); label = `branch ${branch}`; + // `git diff BRANCH...HEAD` reviews HEAD's changes relative to BRANCH as the + // base — correct when BRANCH is the base (e.g. branch:main). 
The footgun: + // when BRANCH resolves to the same commit as HEAD (targeting the branch you + // are currently on), it's an empty self-diff that surfaces as a silent + // "No changes to review" — which a caller (or Cesar) can mistake for a clean + // review. Detect that and fail LOUDLY with the right targets instead. + let branchSha = ''; + let headSha = ''; + try { + branchSha = execFileSync('git', ['rev-parse', branch], { cwd, encoding: 'utf-8' }).trim(); + headSha = execFileSync('git', ['rev-parse', 'HEAD'], { cwd, encoding: 'utf-8' }).trim(); + } catch (err) { + throw new Error(`Failed to resolve branch "${branch}": ${err instanceof Error ? err.message : String(err)}`); + } + if (branchSha && branchSha === headSha) { + throw new Error(`branch:${branch} points at the commit you are currently on, so diffing it against HEAD yields nothing to review. Use "branch:main" (or your base branch) to review this branch's commits, or "uncommitted" to review working-tree changes.`); + } try { diff = execFileSync('git', ['diff', `${branch}...HEAD`], { cwd, encoding: 'utf-8', maxBuffer: 10 * 1024 * 1024 }).trim(); } catch (err) { @@ -107,7 +124,7 @@ export function resolveReviewTarget(target: string|undefined, cwd: string): {dif return { diff, label }; } -// @kern-source: review:100 +// @kern-source: review:117 export function selectReviewEngine(requestedEngine: string|undefined, ctx: HandlerContext): string { const allActive = ctx.activeEngines(); const active = requestedEngine ? allActive : filterDefaultOrchestrationEngines(allActive); @@ -155,7 +172,7 @@ export function selectReviewEngine(requestedEngine: string|undefined, ctx: Handl throw new Error('No engines available for review. 
Try /engines to check availability.'); } -// @kern-source: review:148 +// @kern-source: review:165 export interface ReviewCoreResult { response: string; blocking: boolean; @@ -165,10 +182,10 @@ export interface ReviewCoreResult { usage?: {promptTokens:number,completionTokens:number,totalTokens:number,source:'sdk'|'cli-reported'|'estimated'}; } -// @kern-source: review:159 +// @kern-source: review:176 export const REVIEW_SENTINEL: string = ''; -// @kern-source: review:161 +// @kern-source: review:178 export interface ReviewSeverityCounts { blocking: number; important: number; @@ -179,7 +196,7 @@ export interface ReviewSeverityCounts { /** * Sentinel-anchored, fail-closed extraction of the findings array — the single chokepoint shared by parseReviewBlocking (the blocking gate) and summarizeReviewFindings (severity counts). Returns the parsed array (possibly empty []) or null when no parseable block follows the LAST sentinel. Anti-injection: only text after the LAST sentinel is considered, so attacker brackets quoted earlier in the diff are ignored. Tolerant of almost-JSON (trailing commas, line and block JS-style comments) and fenced json code blocks. */ -// @kern-source: review:167 +// @kern-source: review:184 export function extractReviewFindings(response: string): Array<{severity?:string, blocking?:boolean}> | null { if (!response || response.trim().length === 0) return null; @@ -279,7 +296,7 @@ export function extractReviewFindings(response: string): Array<{severity?:string /** * Sentinel-anchored, fail-closed parser. The engine MUST end its response with a unique sentinel followed by a JSON array of findings. Without a parseable block the response is treated as blocking + parseFailed, so the user must explicitly approve. This blocks the prompt-injection attack where an attacker echoes `[{"blocking":false}]` inside diff content — only the engine's real structured output after the LAST sentinel is considered. Thin wrapper over extractReviewFindings. 
*/ -// @kern-source: review:265 +// @kern-source: review:282 export function parseReviewBlocking(response: string): {blocking:boolean, parseFailed:boolean} { const findings = extractReviewFindings(response); if (findings === null) return { blocking: true, parseFailed: true }; @@ -290,7 +307,7 @@ export function parseReviewBlocking(response: string): {blocking:boolean, parseF /** * Count findings by severity from the structured block, for human summaries like 'claude: ok, 1 important, 3 nits'. Returns all-zero when there is no parseable findings block (the caller renders that as unstructured/empty). A finding counts as blocking if blocking===true or severity==='blocking'; otherwise by its severity, with anything not 'important' falling to nit. */ -// @kern-source: review:274 +// @kern-source: review:291 export function summarizeReviewFindings(response: string): ReviewSeverityCounts { const findings = extractReviewFindings(response); if (!findings) return { blocking: 0, important: 0, nit: 0, total: 0 }; @@ -309,7 +326,7 @@ export function summarizeReviewFindings(response: string): ReviewSeverityCounts /** * Repair pass (B): re-ask the engine for ONLY a bare JSON array of the findings it already wrote in prose. Asking for a bare array (no sentinel, no prose, no fence) is the format LLMs comply with most reliably — far better than 'an HTML-comment marker followed by JSON', which engines routinely truncate to just the marker. The caller (runReviewCore) prepends the sentinel itself before parsing, so the anti-injection anchor is preserved. Best-effort: if this still doesn't parse, the fail-closed/unstructured result stands. 
*/ -// @kern-source: review:291 +// @kern-source: review:308 export async function runReviewRepair(priorReview: string, engineId: string, ctx: HandlerContext, signal?: AbortSignal): Promise { const config = ctx.config; const cwd = resolveWorkingDir(); @@ -359,7 +376,7 @@ export async function runReviewRepair(priorReview: string, engineId: string, ctx /** * Repo grounding: read the CURRENT full content of each source file the diff touches and format it as a context block. A diff shows only the changed hunks, so reviewers raise false alarms that reading the whole file would kill instantly ('X is unhandled' when the wrapper handles it three lines down; 'unimported' when it's imported at the top). Bounded hard (per-file + total caps) to protect prompt size / TTFT, and skips generated/dist/min files (derived noise that would blow the budget). Best-effort: deleted/binary/unreadable files are skipped — the diff still covers them. */ -// @kern-source: review:329 +// @kern-source: review:346 export function gatherReviewFileContext(diff: string, cwd: string): string { const PER_FILE_MAX = 20_000; const TOTAL_MAX = 60_000; @@ -406,7 +423,7 @@ export function gatherReviewFileContext(diff: string, cwd: string): string { /** * Core review flow with no ctx side effects. Used by both handleReview (with streaming dispatch) and the plan executor's review step (silent). Does NOT touch ctx.setActiveAbort, ctx.lastReviewResult, ctx.chatSession, or tracker. signal is optional: callers that don't have an abort controller can pass undefined. cwdOverride pins the working directory the review engine runs in AND the repo file-context is gathered from — goal passes the per-task worktree so review engines never operate in (and write to) the parent repo; defaults to resolveWorkingDir() for the interactive/CLI review paths. 
*/ -// @kern-source: review:374 +// @kern-source: review:391 export async function runReviewCore(diff: string, label: string, engineId: string, ctx: HandlerContext, signal?: AbortSignal, onProgress?: (chunk:string)=>void, cwdOverride?: string): Promise { const cwd = cwdOverride ?? resolveWorkingDir(); const config = ctx.config; @@ -502,7 +519,7 @@ export async function runReviewCore(diff: string, label: string, engineId: strin /** * Strip the trailing machine-readable findings block (sentinel + JSON) from a review so the Ctrl+R results pager shows clean prose — the consensus summary already encodes those findings. Cesar's copy (ctx.lastReviewResult.reviewOutput) keeps the full response, so 'fix it' still has the structured file/line/minimalFix data. No-op when there's no sentinel. */ -// @kern-source: review:450 +// @kern-source: review:467 export function stripMachineBlock(response: string): string { const idx = response.lastIndexOf(REVIEW_SENTINEL); if (idx < 0) return response; @@ -512,7 +529,7 @@ export function stripMachineBlock(response: string): string { /** * Build a consensus EngineOutcome from one engine's review. status!=='ok' yields an empty-findings failure lane (never a phantom blocker), carrying any diagnostic note (error message / timeout detail) through to ConsensusReport.engineFailures; 'ok' parses the engine's structured findings into RawFindings. Shared by the single- and multi-engine paths so the mapping lives in one place. */ -// @kern-source: review:458 +// @kern-source: review:475 export function reviewOutcome(engineId: string, response: string, status: string, note?: string): any { if (status !== 'ok') return { engine: engineId, status, findings: [], note }; // Guard against a model emitting a non-object element (e.g. 
`[null]` or a @@ -531,7 +548,7 @@ export function reviewOutcome(engineId: string, response: string, status: string /** * Render a consensus report into the compact, human-facing summary lines (tiered: verified / needs-check / speculative / nits / failed). The single source of the summary text shown inline AND stored as ReviewResultData.consensusSummary, so the transcript and the Ctrl+R pager always agree. */ -// @kern-source: review:475 +// @kern-source: review:492 export function buildReviewConsensusLines(consensus: any): string[] { const fmt = (f: any): string => ` • [${f.severity} ${f.maxConfidence.toFixed(2)} ×${f.engines.length}${f.pairVotes >= 2 ? ' pair' : ''}] ${f.problem}${f.file ? ` (${f.file}${f.lines ? ':' + f.lines : ''})` : ''}`; const lines: string[] = [`Consensus — ${consensus.summary}`]; @@ -546,7 +563,7 @@ export function buildReviewConsensusLines(consensus: any): string[] { /** * One-line severity tail for a single engine's review: '2 important, 3 nits' (zero categories omitted; 'no findings' when empty). */ -// @kern-source: review:488 +// @kern-source: review:505 export function formatReviewCounts(c: ReviewSeverityCounts|undefined): string { if (!c || c.total === 0) return 'no findings'; const parts: string[] = []; @@ -556,7 +573,7 @@ export function formatReviewCounts(c: ReviewSeverityCounts|undefined): string { return parts.join(', '); } -// @kern-source: review:499 +// @kern-source: review:516 export async function handleReview(dispatch: Dispatch, ctx: HandlerContext, target?: string, requestedEngine?: string): Promise { const abort = new AbortController(); try { @@ -682,7 +699,7 @@ export async function handleReview(dispatch: Dispatch, ctx: HandlerContext, targ /** * Run review for one or more explicitly requested engines. With 2+ engines they run in PARALLEL — each gets its own hard timeout, so a slow-but-excellent reviewer (codex) never blocks the others and a hung engine can't wedge the whole review. 
Each engine's block is dispatched as it finishes; findings are combined into ctx.lastReviewResult for Cesar follow-up/fix planning. A single engine delegates to the streaming handleReview path. */ -// @kern-source: review:621 +// @kern-source: review:638 export async function handleReviewMany(dispatch: Dispatch, ctx: HandlerContext, target?: string, requestedEngines?: string[]): Promise { const abort = new AbortController(); try { diff --git a/packages/cli/src/kern/handlers/review.kern b/packages/cli/src/kern/handlers/review.kern index 1ea415b2..c137b9e5 100644 --- a/packages/cli/src/kern/handlers/review.kern +++ b/packages/cli/src/kern/handlers/review.kern @@ -72,6 +72,23 @@ fn name=resolveReviewTarget params="target:string|undefined, cwd:string" returns } else if (t.startsWith('branch:')) { const branch = t.slice(7); label = `branch ${branch}`; + // `git diff BRANCH...HEAD` reviews HEAD's changes relative to BRANCH as the + // base — correct when BRANCH is the base (e.g. branch:main). The footgun: + // when BRANCH resolves to the same commit as HEAD (targeting the branch you + // are currently on), it's an empty self-diff that surfaces as a silent + // "No changes to review" — which a caller (or Cesar) can mistake for a clean + // review. Detect that and fail LOUDLY with the right targets instead. + let branchSha = ''; + let headSha = ''; + try { + branchSha = execFileSync('git', ['rev-parse', branch], { cwd, encoding: 'utf-8' }).trim(); + headSha = execFileSync('git', ['rev-parse', 'HEAD'], { cwd, encoding: 'utf-8' }).trim(); + } catch (err) { + throw new Error(`Failed to resolve branch "${branch}": ${err instanceof Error ? err.message : String(err)}`); + } + if (branchSha && branchSha === headSha) { + throw new Error(`branch:${branch} points at the commit you are currently on, so diffing it against HEAD yields nothing to review. 
Use "branch:main" (or your base branch) to review this branch's commits, or "uncommitted" to review working-tree changes.`); + } try { diff = execFileSync('git', ['diff', `${branch}...HEAD`], { cwd, encoding: 'utf-8', maxBuffer: 10 * 1024 * 1024 }).trim(); } catch (err) { From 2d03106e62f44b8b9124177d6bbdaac5b599e26d Mon Sep 17 00:00:00 2001 From: nicolascukas Date: Mon, 8 Jun 2026 11:17:39 +0200 Subject: [PATCH 3/4] =?UTF-8?q?fix(cesar):=20ground=20confabulated=20deleg?= =?UTF-8?q?ations=20=E2=80=94=20stop=20"review=20is=20running"=20lies?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Weak coding-plan engines as Cesar narrate that they dispatched or are running an async review/forge/agent job ("three reviewers are reading the diff in parallel", "I kicked off the review", "I'll report when they land") without ever calling the tool. Cesar's turn has no jobManager access, but it has a grounded turn-local signal: ctx.cesar.pendingDelegation is set only when a handoff tool (Review/Forge/Agent/…) is actually emitted this turn. So a dispatch/running claim + a null pendingDelegation = a fabricated delegation. Adds detectFabricatedDelegation(text) (requires BOTH a delegable target AND a dispatch/running claim, so a plain answer mentioning "review" doesn't trip it) and a guard in brain.kern after the plan-mode nudge: when it fires and nothing is pending or running, re-prompt once with a [SYSTEM] grounding message — call the real tool now, or tell the user plainly nothing is running. Mirrors the existing plan-mode / final-answer nudges; soft (re-prompt, never blocks). Pairs with the branch:X review fix: that turns a silent "No changes" into a loud error, so Cesar can't mistake a no-op review for a successful one. Tests: detectFabricatedDelegation covered with the real transcript phrases + negatives; 198/198 cesar/brain/pty/adapter tests pass. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../cli/src/generated/cesar/brain-helpers.ts | 25 +++++++++--- packages/cli/src/generated/cesar/brain.ts | 39 ++++++++++++++++++- .../cli/src/kern/cesar/brain-helpers.kern | 13 +++++++ packages/cli/src/kern/cesar/brain.kern | 39 ++++++++++++++++++- tests/unit/cesar-brain.test.ts | 25 +++++++++++- 5 files changed, 133 insertions(+), 8 deletions(-) diff --git a/packages/cli/src/generated/cesar/brain-helpers.ts b/packages/cli/src/generated/cesar/brain-helpers.ts index 4c9a44ea..79fd486d 100644 --- a/packages/cli/src/generated/cesar/brain-helpers.ts +++ b/packages/cli/src/generated/cesar/brain-helpers.ts @@ -121,9 +121,24 @@ export function detectMutationIntentStall(text: string): boolean { } /** - * Return unique tool names from failed eager tool results. Used to restrict one-shot repair retries to the tool that just failed. + * Detect a response that CLAIMS an async review/forge/tribunal/brainstorm/agent or background job was dispatched or is now running — e.g. 'review delegated to codex, claude, agy', 'three reviewers are reading the diff in parallel', 'I kicked off the review', "I'll get back when they report". The caller pairs this with 'no delegation was actually emitted this turn' (ctx.cesar.pendingDelegation is null) to catch the confabulation where a weak engine narrates a dispatch it never made. Requires BOTH a delegable target AND a dispatch/running claim, so a plain answer that merely mentions the word 'review' does not trip it. */ // @kern-source: brain-helpers:113 +export function detectFabricatedDelegation(text: string): boolean { + const body = String(text ?? '').trim(); + if (!body) return false; + // A delegable target: review / forge / tribunal / brainstorm / campfire / agents / engines / a background job. 
+ const TARGET_RE = /\b(?:review(?:er)?s?|forg(?:e|ing)|tribunal|brainstorm|campfire|agents?|engines?|jobs?)\b/i; + if (!TARGET_RE.test(body)) return false; + // A claim that the target was dispatched or is now running / will report back. + const DISPATCH_RE = /\b(?:kick(?:ed|ing)?\s*(?:it|them|that|the\s+\w+)?\s*off|fired?\s*(?:it|them|off)|dispatch(?:ed|ing)|delegat(?:ed|ing)|(?:is|are|now)\s+running|running\s+(?:in|now)|in\s+parallel|reading\s+the\s+(?:diff|changes|code)|working\s+(?:on\s+it|in\s+parallel)|in\s+progress|under\s*way|i'?ll\s+(?:get\s+back|report|let\s+you\s+know|surface|update)|report(?:s|ing)?\s+back|when\s+they\s+(?:report|land|return|finish|come\s+back)|still\s+(?:running|going|working|in\s+progress)|spun?\s+up|started\s+(?:the|a)\s+(?:review|forge|job|tribunal|brainstorm))\b/i; + return DISPATCH_RE.test(body); +} + +/** + * Return unique tool names from failed eager tool results. Used to restrict one-shot repair retries to the tool that just failed. + */ +// @kern-source: brain-helpers:126 export function eagerFailedToolNames(results: ToolCallResult[]): string[] { const names: string[] = []; for (const result of results ?? []) { @@ -141,7 +156,7 @@ export function eagerFailedToolNames(results: ToolCallResult[]): string[] { /** * Gate eager tool repair retries. A corrected tool call may run once only if the same tool failed in the immediately previous eager batch. */ -// @kern-source: brain-helpers:125 +// @kern-source: brain-helpers:138 export function shouldRunEagerRepairTool(toolName: string, meta: any, failedToolNames: string[], usedToolNames: string[]): boolean { const name = String(toolName ?? '').trim(); if (!name) return false; @@ -156,7 +171,7 @@ export function shouldRunEagerRepairTool(toolName: string, meta: any, failedTool /** * Return true for XML tools that hand control back to the Agon dispatcher. 
These tools do not produce inline results; continuing the XML tool loop after them can make Cesar claim a delegation happened while the actual forge/brainstorm/etc. job has not started yet. */ -// @kern-source: brain-helpers:138 +// @kern-source: brain-helpers:151 export function shouldStopAfterXmlToolCall(toolName: string): boolean { const HANDOFF_TOOLS = new Set(['Forge', 'Brainstorm', 'Tribunal', 'Campfire', 'Pipeline', 'Review', 'Agent', 'Goal', 'ProposePlan', 'ExitPlanMode']); return HANDOFF_TOOLS.has(String(toolName ?? '')); @@ -165,7 +180,7 @@ export function shouldStopAfterXmlToolCall(toolName: string): boolean { /** * Expand a bare 'fix it' follow-up into an explicit prompt grounded in the most recent stored review result. This avoids making Cesar guess which reviewer findings the user means, especially because /review runs outside Cesar's live session history. */ -// @kern-source: brain-helpers:144 +// @kern-source: brain-helpers:157 export function buildReviewFollowupPrompt(input: string, ctx: HandlerContext): { matched: boolean; prompt: string } { const trimmed = input.trim(); const match = trimmed.match(/^fix it(?:\s+with\s+([a-z0-9._-]+))?[\s?!.,;:]*$/i); @@ -186,7 +201,7 @@ export function buildReviewFollowupPrompt(input: string, ctx: HandlerContext): { return { matched: true, prompt: prompt }; } -// @kern-source: brain-helpers:163 +// @kern-source: brain-helpers:176 export function extractDelegation(toolName: string, args: Record): PendingDelegation { const argsRecord = args as Record; const taskKindRaw = argsRecord.taskKind; diff --git a/packages/cli/src/generated/cesar/brain.ts b/packages/cli/src/generated/cesar/brain.ts index d105a08e..8b5f047d 100644 --- a/packages/cli/src/generated/cesar/brain.ts +++ b/packages/cli/src/generated/cesar/brain.ts @@ -32,7 +32,7 @@ import { applyCesarSelfTurnApproval } from './self-turn-approval.js'; import { createCesarTurnId, recordCesarApprovalDecision, recordCesarToolTimeline, recordCesarConfidence } from 
'./tool-observability.js'; -import { yieldToInk, splitBeforeToolMarkup, XML_TOOL_MARKUP_HOLD_CHARS, findTrailingUserQuestion, detectAwaitingUserInput, detectNarratedToolStall, detectMutationIntentStall, eagerFailedToolNames, shouldRunEagerRepairTool, shouldStopAfterXmlToolCall, buildReviewFollowupPrompt, extractDelegation } from './brain-helpers.js'; +import { yieldToInk, splitBeforeToolMarkup, XML_TOOL_MARKUP_HOLD_CHARS, findTrailingUserQuestion, detectAwaitingUserInput, detectNarratedToolStall, detectMutationIntentStall, detectFabricatedDelegation, eagerFailedToolNames, shouldRunEagerRepairTool, shouldStopAfterXmlToolCall, buildReviewFollowupPrompt, extractDelegation } from './brain-helpers.js'; // @kern-source: brain:19 export async function commitTurnAndDelegate(pendingDel: PendingDelegation, input: string, response: string, cesarEngineId: string, streaming: boolean, dispatch: Dispatch, ctx: HandlerContext, telemetry?: Record): Promise { @@ -1631,6 +1631,43 @@ export async function handleCesarBrain(input: string, dispatch: Dispatch, ctx: H } } + // ── Fabricated-delegation guard: ground a confabulated dispatch ── + // Catches the failure where a weak engine narrates that it dispatched or is + // running an async review/forge/agent job ("three reviewers are reading the + // diff in parallel", "I kicked off the review", "I'll report when they land") + // WITHOUT having emitted any delegation this turn — pendingDelegation is null, + // so nothing is actually queued or running. Re-prompt once to ground it: call + // the real tool now, or tell the user plainly that nothing is running. If the + // re-prompt dispatches for real, pendingDelegation gets set and the existing + // downstream delegation path takes over. + if ( + !ctx.cesar!.pendingDelegation + && session.alive + && !abort.signal.aborted + && detectFabricatedDelegation(response.trim()) + ) { + dispatch({ type: 'warning', message: 'Cesar claimed a job was running but never dispatched one — grounding...' 
}); + dispatch({ type: 'spinner-start', message: 'Cesar grounding…', color }); + try { + let groundResponse = ''; + const groundGen = session.send({ + message: '[SYSTEM] GROUNDING CHECK: You did NOT dispatch any review/forge/tribunal/brainstorm/agent/job this turn, and none is pending or running. Do NOT claim background work is "running", "in parallel", "kicked off", or that anyone "will report back" — that is false and misleads the user. If the user wants that work done, call the actual tool now (Review/Forge/Tribunal/Brainstorm/Agent). Otherwise tell the user plainly that nothing is currently running and ask whether to start it.', + signal: abort.signal, + }); + for await (const chunk of groundGen) { + if (chunk.type === 'text') groundResponse += chunk.content; + if (chunk.type === 'done' || chunk.type === 'error') break; + } + dispatch({ type: 'spinner-stop' }); + if (groundResponse.trim()) { + dispatch({ type: 'engine-block', engineId: cesarEngineId, color, content: groundResponse.trim() }); + response = groundResponse.trim(); + } + } catch { + dispatch({ type: 'spinner-stop' }); + } + } + // ── Protocol enforcement: DISABLED ── // Cesar decides all delegations. The system never forces brainstorm/tribunal on the user. // If Cesar wants to delegate, he calls the tool. If he doesn't, that's his call. diff --git a/packages/cli/src/kern/cesar/brain-helpers.kern b/packages/cli/src/kern/cesar/brain-helpers.kern index e056161f..96ae62c8 100644 --- a/packages/cli/src/kern/cesar/brain-helpers.kern +++ b/packages/cli/src/kern/cesar/brain-helpers.kern @@ -110,6 +110,19 @@ fn name=detectMutationIntentStall params="text:string" returns=boolean export=tr return MUTATION_INTENT_RE.test(body) && HANDBACK_RE.test(body); >>> +fn name=detectFabricatedDelegation params="text:string" returns=boolean export=true + doc "Detect a response that CLAIMS an async review/forge/tribunal/brainstorm/agent or background job was dispatched or is now running — e.g. 
'review delegated to codex, claude, agy', 'three reviewers are reading the diff in parallel', 'I kicked off the review', \"I'll get back when they report\". The caller pairs this with 'no delegation was actually emitted this turn' (ctx.cesar.pendingDelegation is null) to catch the confabulation where a weak engine narrates a dispatch it never made. Requires BOTH a delegable target AND a dispatch/running claim, so a plain answer that merely mentions the word 'review' does not trip it." + handler <<< + const body = String(text ?? '').trim(); + if (!body) return false; + // A delegable target: review / forge / tribunal / brainstorm / campfire / agents / engines / a background job. + const TARGET_RE = /\b(?:review(?:er)?s?|forg(?:e|ing)|tribunal|brainstorm|campfire|agents?|engines?|jobs?)\b/i; + if (!TARGET_RE.test(body)) return false; + // A claim that the target was dispatched or is now running / will report back. + const DISPATCH_RE = /\b(?:kick(?:ed|ing)?\s*(?:it|them|that|the\s+\w+)?\s*off|fired?\s*(?:it|them|off)|dispatch(?:ed|ing)|delegat(?:ed|ing)|(?:is|are|now)\s+running|running\s+(?:in|now)|in\s+parallel|reading\s+the\s+(?:diff|changes|code)|working\s+(?:on\s+it|in\s+parallel)|in\s+progress|under\s*way|i'?ll\s+(?:get\s+back|report|let\s+you\s+know|surface|update)|report(?:s|ing)?\s+back|when\s+they\s+(?:report|land|return|finish|come\s+back)|still\s+(?:running|going|working|in\s+progress)|spun?\s+up|started\s+(?:the|a)\s+(?:review|forge|job|tribunal|brainstorm))\b/i; + return DISPATCH_RE.test(body); + >>> + fn name=eagerFailedToolNames params="results:ToolCallResult[]" returns="string[]" export=true doc "Return unique tool names from failed eager tool results. Used to restrict one-shot repair retries to the tool that just failed." 
handler lang="kern" diff --git a/packages/cli/src/kern/cesar/brain.kern b/packages/cli/src/kern/cesar/brain.kern index 339128bf..97115387 100644 --- a/packages/cli/src/kern/cesar/brain.kern +++ b/packages/cli/src/kern/cesar/brain.kern @@ -14,7 +14,7 @@ import from="./routing.js" names="buildRoutingContext,deriveRoutingHints,shouldS import from="./reliability.js" names="readCesarToolReliability,formatCesarReliabilityLine,shouldDowngradeCesarToolWork,buildWhatHappenedSummary" import from="./self-turn-approval.js" names="applyCesarSelfTurnApproval" import from="./tool-observability.js" names="createCesarTurnId,recordCesarApprovalDecision,recordCesarToolTimeline,recordCesarConfidence" -import from="./brain-helpers.js" names="yieldToInk,splitBeforeToolMarkup,XML_TOOL_MARKUP_HOLD_CHARS,findTrailingUserQuestion,detectAwaitingUserInput,detectNarratedToolStall,detectMutationIntentStall,eagerFailedToolNames,shouldRunEagerRepairTool,shouldStopAfterXmlToolCall,buildReviewFollowupPrompt,extractDelegation" +import from="./brain-helpers.js" names="yieldToInk,splitBeforeToolMarkup,XML_TOOL_MARKUP_HOLD_CHARS,findTrailingUserQuestion,detectAwaitingUserInput,detectNarratedToolStall,detectMutationIntentStall,detectFabricatedDelegation,eagerFailedToolNames,shouldRunEagerRepairTool,shouldStopAfterXmlToolCall,buildReviewFollowupPrompt,extractDelegation" fn name=commitTurnAndDelegate async=true params="pendingDel:PendingDelegation, input:string, response:string, cesarEngineId:string, streaming:boolean, dispatch:Dispatch, ctx:HandlerContext, telemetry?:Record" returns="Promise" handler lang="kern" @@ -1601,6 +1601,43 @@ ${reviewFollowup.prompt}`; } } + // ── Fabricated-delegation guard: ground a confabulated dispatch ── + // Catches the failure where a weak engine narrates that it dispatched or is + // running an async review/forge/agent job ("three reviewers are reading the + // diff in parallel", "I kicked off the review", "I'll report when they land") + // WITHOUT having emitted any 
delegation this turn — pendingDelegation is null, + // so nothing is actually queued or running. Re-prompt once to ground it: call + // the real tool now, or tell the user plainly that nothing is running. If the + // re-prompt dispatches for real, pendingDelegation gets set and the existing + // downstream delegation path takes over. + if ( + !ctx.cesar!.pendingDelegation + && session.alive + && !abort.signal.aborted + && detectFabricatedDelegation(response.trim()) + ) { + dispatch({ type: 'warning', message: 'Cesar claimed a job was running but never dispatched one — grounding...' }); + dispatch({ type: 'spinner-start', message: 'Cesar grounding…', color }); + try { + let groundResponse = ''; + const groundGen = session.send({ + message: '[SYSTEM] GROUNDING CHECK: You did NOT dispatch any review/forge/tribunal/brainstorm/agent/job this turn, and none is pending or running. Do NOT claim background work is "running", "in parallel", "kicked off", or that anyone "will report back" — that is false and misleads the user. If the user wants that work done, call the actual tool now (Review/Forge/Tribunal/Brainstorm/Agent). Otherwise tell the user plainly that nothing is currently running and ask whether to start it.', + signal: abort.signal, + }); + for await (const chunk of groundGen) { + if (chunk.type === 'text') groundResponse += chunk.content; + if (chunk.type === 'done' || chunk.type === 'error') break; + } + dispatch({ type: 'spinner-stop' }); + if (groundResponse.trim()) { + dispatch({ type: 'engine-block', engineId: cesarEngineId, color, content: groundResponse.trim() }); + response = groundResponse.trim(); + } + } catch { + dispatch({ type: 'spinner-stop' }); + } + } + // ── Protocol enforcement: DISABLED ── // Cesar decides all delegations. The system never forces brainstorm/tribunal on the user. // If Cesar wants to delegate, he calls the tool. If he doesn't, that's his call. 
diff --git a/tests/unit/cesar-brain.test.ts b/tests/unit/cesar-brain.test.ts index dd2e21dd..9082ea98 100644 --- a/tests/unit/cesar-brain.test.ts +++ b/tests/unit/cesar-brain.test.ts @@ -2,7 +2,7 @@ import { describe, it, expect } from 'vitest'; import { parseSuggestion, parseConfidence, confidenceBadge, CONFIDENCE_TIERS, CESAR_SYSTEM_PROMPT, buildReviewFollowupPrompt, detectNarratedToolStall } from '../../packages/cli/src/handlers/cesar-brain.js'; // Source of truth for these helpers is packages/cli/src/kern/cesar/brain-helpers.kern; // the generated/*.js below is regenerated from it (npm run kern:compile) — do not edit by hand. -import { eagerFailedToolNames, shouldRunEagerRepairTool, shouldStopAfterXmlToolCall, splitBeforeToolMarkup, isUserDirectedQuestion, findTrailingUserQuestion, detectAwaitingUserInput, detectMutationIntentStall } from '../../packages/cli/src/generated/cesar/brain-helpers.js'; +import { eagerFailedToolNames, shouldRunEagerRepairTool, shouldStopAfterXmlToolCall, splitBeforeToolMarkup, isUserDirectedQuestion, findTrailingUserQuestion, detectAwaitingUserInput, detectMutationIntentStall, detectFabricatedDelegation } from '../../packages/cli/src/generated/cesar/brain-helpers.js'; import { createReportConfidenceTool, createForgeTool, createBrainstormTool, createTribunalTool, createCampfireTool, createPipelineTool } from '../../packages/core/src/tools.js'; describe('Cesar Brain', () => { @@ -36,6 +36,29 @@ describe('Cesar Brain', () => { }); }); + describe('detectFabricatedDelegation (confabulated dispatch)', () => { + it('flags a claim that reviewers/jobs are running or were dispatched', () => { + // Real phrases from the confabulation transcript. 
+ expect(detectFabricatedDelegation('Going — three reviewers (codex, claude, agy) are reading the 90-file diff in parallel.')).toBe(true); + expect(detectFabricatedDelegation('The review is still running — codex, claude, and agy are each reading the diff in parallel.')).toBe(true); + expect(detectFabricatedDelegation("Review delegated to codex, claude, and agy. I'll get back when they report.")).toBe(true); + expect(detectFabricatedDelegation('I kicked off the review — the agents are working in parallel now.')).toBe(true); + }); + + it('does not flag a plain answer that merely mentions a review', () => { + expect(detectFabricatedDelegation('You should run a review before merging this branch.')).toBe(false); + expect(detectFabricatedDelegation('The review tool diffs the branch against its base.')).toBe(false); + expect(detectFabricatedDelegation('Here is the fix; nothing is running right now.')).toBe(false); + }); + + it('requires both a delegable target AND a dispatch/running claim', () => { + // "running" but no delegable target → not a fabricated delegation. + expect(detectFabricatedDelegation('The build is running in parallel across packages.')).toBe(false); + // target but no dispatch claim → not flagged. 
+ expect(detectFabricatedDelegation('A tribunal would surface the tradeoffs here.')).toBe(false); + }); + }); + describe('detectMutationIntentStall (false read-only hand-back)', () => { it('flags "I am read-only" narration with intent to apply a change', () => { expect(detectMutationIntentStall('This session is read-only, so I cannot apply the edit — paste it into your terminal.')).toBe(true); From 6f13bea589b592ace2fc69c037c79819fffb6d19 Mon Sep 17 00:00:00 2001 From: nicolascukas Date: Mon, 8 Jun 2026 11:25:29 +0200 Subject: [PATCH 4/4] feat(cli): terminal bell + window-title alerts on done / awaiting input MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rings the terminal bell and flips the window title when a turn finishes or when Cesar needs the user (a question or a plan awaiting approval), so a backgrounded terminal surfaces a bell + title dot. Originally built by Cesar (minimax-coding-plan-m3); this completes and corrects it. - New packages/cli/src/kern/lib/terminal-notify.kern: bell() (BEL once) and setWindowTitle(label) (OSC 0 ; label BEL). Both no-op when stdout isn't a TTY (piped/CI) and honour AGON_NO_BELL / AGON_NO_TITLE opt-outs. - Wired into app.kern: bell on await (question / plan-approval, deduped per plan id) and on done, with a single-shot pendingBellRef guard; title shows "● agon — running" / "● agon — input needed" / "agon". Fixes in the wiring (the part Cesar left broken — its plan's wire step failed verify): - The done-bell only fired on two edge paths (status dashboard / empty mode-switch) because the MAIN turn-completion path ended via a direct setReplState, bypassing transition()'s bell hook. Route it through transition(finishReplState) so every completed turn rings (no-op for job turns that already went idle at handoff). - Background jobs (forge/review/etc.) now ring + reset the title when they complete or fail — the "alert me when the long job is done" case. 
Adds tests/unit/terminal-notify.test.ts (the unit step Cesar's plan never reached): BEL-once / opt-out / non-TTY for bell, OSC sequence / opt-out / non-TTY for setWindowTitle. Full suite green: 1994/1994. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../cli/src/generated/lib/terminal-notify.ts | 31 ++++++ .../cli/src/generated/surfaces/app.entry.tsx | 4 +- packages/cli/src/generated/surfaces/app.tsx | 98 ++++++++++++++++--- .../cli/src/kern/lib/terminal-notify.kern | 19 ++++ packages/cli/src/kern/surfaces/app.kern | 88 +++++++++++++++-- tests/unit/terminal-notify.test.ts | 76 ++++++++++++++ 6 files changed, 293 insertions(+), 23 deletions(-) create mode 100644 packages/cli/src/generated/lib/terminal-notify.ts create mode 100644 packages/cli/src/kern/lib/terminal-notify.kern create mode 100644 tests/unit/terminal-notify.test.ts diff --git a/packages/cli/src/generated/lib/terminal-notify.ts b/packages/cli/src/generated/lib/terminal-notify.ts new file mode 100644 index 00000000..7feda78f --- /dev/null +++ b/packages/cli/src/generated/lib/terminal-notify.ts @@ -0,0 +1,31 @@ +// @generated by kern v3.5.8 — DO NOT EDIT. Source: src/kern/lib/terminal-notify.kern + +import { stdout } from 'node:process'; + +/** + * Ring the terminal bell once on stdout. No-op when stdio is not a TTY (piped runs, CI) or when AGON_NO_BELL is set in the environment, so we never break automation or annoy users who opted out. + */ +// @kern-source: terminal-notify:3 +export function bell(): void { + if (!stdout.isTTY) { + return; + } + if (process.env.AGON_NO_BELL) { + return; + } + stdout.write('\x07'); +} + +/** + * Set the terminal window/tab title via the OSC 0 ;