From b3a9540bf400625c8e08d1a5279eba1cdc6bffa9 Mon Sep 17 00:00:00 2001
From: unohee
Date: Thu, 11 Jun 2026 00:23:46 +0900
Subject: [PATCH 1/7] feat(adapters): OpenRouter agentic adapter + harness hardening from SWE-bench findings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an OpenRouter adapter that runs the native agentic loop (runAgenticLoop) with PKCE auth, ZDR (data_collection: deny), prompt caching, and optional reasoning-off for mechanical roles. Route worker/reviewer/planner models per cost-efficiency measurements (lightweight worker + frontier escalate).

Harden the agentic harness with fixes for 7 defects discovered by running real SWE-bench instances (none were visible on synthetic benchmarks):

1. Inject the working directory into the prompt — models guessed absolute paths and every file tool call was rejected.
2. Surface stdout/stderr + exit code on bash failures — grep "no match" (exit 1) looked like a fatal error and caused infinite retries.
3. Raise compaction thresholds (24k→60k tokens, keep 16 recent messages) — early compaction erased freshly-read files and caused endless re-reads.
4. Final-answer turn — when maxTurns is exhausted, make one last tool-less call so the model still produces a conclusion.
5. No-edit guard (nudgeMaxOnNoEdit) — push back when a model tries to finish an edit-required task with analysis only.
6. Protected files (protectedFiles) — reject edit/write on verification harness files; implementers were rewriting the test script when tests failed.
7. Configurable bash timeout (bashTimeoutMs) + explicit TIMEOUT message — the 30s default died silently on docker-based test runs and read as a broken environment.

Also: loop-level read cache (dedup repeated reads), edit_file returns the resulting region (no re-read needed), git-diff-based success promotion in the worker (structured output no longer required).
--- src/adapters/agenticLoop.test.ts | 113 ++++++++++ src/adapters/agenticLoop.ts | 230 +++++++++++++++----- src/adapters/codex.test.ts | 35 ++- src/adapters/codex.ts | 49 ++++- src/adapters/gpt.ts | 19 +- src/adapters/index.ts | 10 +- src/adapters/local.ts | 15 +- src/adapters/openrouter.test.ts | 192 +++++++++++++++++ src/adapters/openrouter.ts | 359 +++++++++++++++++++++++++++++++ src/adapters/tools.test.ts | 128 ++++++++++- src/adapters/tools.ts | 137 +++++++++++- src/adapters/types.ts | 21 +- src/agents/draftAnalyzer.ts | 8 +- src/agents/reviewer.ts | 2 +- src/agents/worker.ts | 33 ++- src/auth/index.ts | 15 +- src/auth/oauthPkce.ts | 38 +++- src/auth/oauthStore.ts | 19 +- src/auth/openrouterPkce.ts | 282 ++++++++++++++++++++++++ src/cli.ts | 11 +- src/cli/authHandler.ts | 161 ++++++++++---- src/core/config.ts | 50 +++-- 22 files changed, 1760 insertions(+), 167 deletions(-) create mode 100644 src/adapters/agenticLoop.test.ts create mode 100644 src/adapters/openrouter.test.ts create mode 100644 src/adapters/openrouter.ts create mode 100644 src/auth/openrouterPkce.ts diff --git a/src/adapters/agenticLoop.test.ts b/src/adapters/agenticLoop.test.ts new file mode 100644 index 0000000..2c5edb2 --- /dev/null +++ b/src/adapters/agenticLoop.test.ts @@ -0,0 +1,113 @@ +// ============================================ +// OpenSwarm - Agentic Loop history compaction tests +// Regression guard for the worker-failure bug: over-eager compaction used to +// strip everything but the last assistant block every turn, so the model lost +// the files it had just read and looped 3-4 times. These tests pin the VEGA-style +// behaviour: keep recent blocks intact, never leave orphan tool messages. +// ============================================ + +import { describe, it, expect } from 'vitest'; +import { compactPriorTurns, type ChatMessage } from './agenticLoop.js'; + +/** Build a representative tool-using history: system + user + N (assistant→tool) rounds. 
*/ +function buildHistory(rounds: number): ChatMessage[] { + const messages: ChatMessage[] = [ + { role: 'system', content: 'You are a worker.' }, + { role: 'user', content: 'Do the task.' }, + ]; + for (let i = 0; i < rounds; i++) { + messages.push({ + role: 'assistant', + content: `Step ${i}: reading file`, + tool_calls: [{ + id: `call_${i}`, + type: 'function', + function: { name: 'read_file', arguments: JSON.stringify({ path: `src/file${i}.ts` }) }, + }], + }); + messages.push({ + role: 'tool', + tool_call_id: `call_${i}`, + content: `contents of file${i}`, + }); + } + return messages; +} + +/** Every tool message must immediately follow an assistant carrying its tool_call_id. */ +function hasNoOrphanToolMessages(messages: ChatMessage[]): boolean { + for (let i = 0; i < messages.length; i++) { + const m = messages[i]; + if (m.role !== 'tool') continue; + const prev = messages[i - 1]; + if (!prev || prev.role !== 'assistant' || !prev.tool_calls) return false; + const ids = prev.tool_calls.map((tc) => tc.id); + if (!ids.includes(m.tool_call_id)) return false; + } + return true; +} + +describe('compactPriorTurns', () => { + it('keeps the most recent keepRecent messages verbatim', () => { + const messages = buildHistory(10); // 2 header + 20 round msgs = 22 + const before = messages.slice(-4).map((m) => JSON.stringify(m)); + + compactPriorTurns(messages, 4); + + const after = messages.slice(-4).map((m) => JSON.stringify(m)); + expect(after).toEqual(before); + }); + + it('preserves the system + user header', () => { + const messages = buildHistory(8); + compactPriorTurns(messages, 4); + + expect(messages[0]).toEqual({ role: 'system', content: 'You are a worker.' }); + expect(messages[1]).toEqual({ role: 'user', content: 'Do the task.' 
}); + }); + + it('never leaves an orphan tool message after compaction', () => { + const messages = buildHistory(10); + compactPriorTurns(messages, 5); + expect(hasNoOrphanToolMessages(messages)).toBe(true); + }); + + it('replaces old rounds with a single [Prior turns compacted] summary', () => { + const messages = buildHistory(10); + compactPriorTurns(messages, 4); + + const summaries = messages.filter( + (m) => m.role === 'assistant' && typeof m.content === 'string' && m.content.startsWith('[Prior turns compacted]'), + ); + expect(summaries).toHaveLength(1); + // The summary must sit right after the header, before the preserved tail. + expect(messages[2].role).toBe('assistant'); + expect((messages[2] as { content: string }).content).toContain('[Prior turns compacted]'); + }); + + it('shrinks total message count (actually compacts)', () => { + const messages = buildHistory(10); + const originalLen = messages.length; + compactPriorTurns(messages, 4); + expect(messages.length).toBeLessThan(originalLen); + }); + + it('is a no-op when nothing is old enough to compact', () => { + // keepRecent larger than the whole body → boundary collapses to header, no change. 
+ const messages = buildHistory(2); // 2 header + 4 body = 6 + const snapshot = messages.map((m) => JSON.stringify(m)); + compactPriorTurns(messages, 10); + expect(messages.map((m) => JSON.stringify(m))).toEqual(snapshot); + }); + + it('absorbs an existing summary instead of nesting summaries', () => { + const messages = buildHistory(12); + compactPriorTurns(messages, 4); // first pass creates a summary + compactPriorTurns(messages, 4); // second pass should fold it in, not nest + + const summaries = messages.filter( + (m) => m.role === 'assistant' && typeof m.content === 'string' && m.content.startsWith('[Prior turns compacted]'), + ); + expect(summaries.length).toBeLessThanOrEqual(1); + }); +}); diff --git a/src/adapters/agenticLoop.ts b/src/adapters/agenticLoop.ts index 15a48ca..c10cb71 100644 --- a/src/adapters/agenticLoop.ts +++ b/src/adapters/agenticLoop.ts @@ -1,14 +1,49 @@ // ============================================ // OpenSwarm - Agentic Tool Loop // Created: 2026-04-11 -// Purpose: GPT/Local 어댑터에 Claude CLI와 동등한 도구 사용 능력을 부여하는 -// 범용 에이전틱 루프 엔진. +// Purpose: Codex/OpenRouter/Local 어댑터용 범용 에이전틱 루프 엔진. // OpenAI function calling 포맷 기반. +// VEGA token_count.py 패턴 이식 — 토큰 기반 히스토리 압축. 
// ============================================ -import { TOOL_DEFINITIONS, executeToolCalls, type ToolCall, type ToolResult, type ToolDefinition } from './tools.js'; +import { TOOL_DEFINITIONS, executeToolCalls, createReadCache, type ToolCall, type ToolResult, type ToolDefinition } from './tools.js'; import type { CliRunResult } from './types.js'; +// ============ 토큰 카운팅 (VEGA token_count.py 이식) ============ + +// cl100k_base 근사: 한국어 0.78t/char, 영어 0.27t/char +function countTokensApprox(text: string): number { + if (!text) return 0; + const hangul = [...text].filter(c => c >= '가' && c <= '힣').length; + const korRatio = hangul / Math.max(1, text.length); + const rate = 0.78 * korRatio + 0.27 * (1 - korRatio); + return Math.ceil(text.length * rate); +} + +function countMessageTokens(messages: ChatMessage[]): number { + let total = 0; + for (const m of messages) { + const content = typeof m.content === 'string' ? m.content : ''; + total += countTokensApprox(content); + total += 4; // role overhead + if ('tool_calls' in m && m.tool_calls) { + for (const tc of m.tool_calls) { + total += countTokensApprox(tc.function.arguments) + countTokensApprox(tc.function.name) + 8; + } + } + } + return total; +} + +// 도구 결과 길이 제한: 너무 작게 자르면 모델이 파일 절반만 보고 잘못 수정한다. +// 코딩 작업에 맞춰 넉넉히 보존(2500자), 초과 시 앞 1500 + 뒤 700자 유지. 
+function truncateToolResult(content: string, maxLen = 2500): string { + if (content.length <= maxLen) return content; + const head = content.slice(0, 1500); + const tail = content.slice(-700); + return `${head}\n...[${content.length - 2200} chars truncated]...\n${tail}`; +} + // ============ 타입 ============ /** OpenAI Chat Completions API 메시지 포맷 */ @@ -55,7 +90,7 @@ export interface AgenticLoopOptions { model: string; /** API 호출 함수 (어댑터별로 주입) */ callApi: (messages: ChatMessage[], tools: ToolDefinition[]) => Promise; - /** 최대 도구 사용 턴 수 (기본: 15) */ + /** 최대 도구 사용 턴 수 (기본: 20) */ maxTurns?: number; /** 전체 타임아웃 (ms, 기본: 300000) */ timeoutMs?: number; @@ -63,6 +98,23 @@ export interface AgenticLoopOptions { onLog?: (line: string) => void; /** 도구 사용 허용 여부 (기본: true) */ enableTools?: boolean; + /** 토큰 기반 압축 트리거 임계값 (기본: 24000) */ + compactTokenThreshold?: number; + /** 이 메시지 수를 넘어야 압축 후보 (VEGA compact_threshold, 기본: 24) */ + compactAfterMessages?: number; + /** 압축 시 항상 원본 유지할 최근 메시지 수 (VEGA keep_recent, 기본: 8) */ + keepRecentMessages?: number; + /** + * 수정이 필수인 작업의 no-edit 종료 가드. 모델이 edit/write 도구를 한 번도 안 쓰고 + * 최종 텍스트로 끝내려 하면 "아직 수정 안 했다, 계속하라"고 N회까지 되민다. + * 경량 모델(gemini 등)이 탐색만 하고 일찍 결론 내는 패턴 차단 (SWE 하이브리드에서 발견). + * 기본 0 (비활성) — 수정 없는 작업(진단·분석)도 정상이므로 옵트인. + */ + nudgeMaxOnNoEdit?: number; + /** Verification-harness files for which edit/write are refused (see tools.ts ToolExecOptions) */ + protectedFiles?: string[]; + /** bash tool timeout — docker-based tests need minutes (default 30s) */ + bashTimeoutMs?: number; } /** 루프 실행 결과 */ @@ -95,10 +147,19 @@ export async function runAgenticLoop(options: AgenticLoopOptions): Promise= 2) { - compactPriorTurns(messages); + // 히스토리 압축 — VEGA compaction.py 패턴 이식. + // 트리거: 메시지 수가 compactAfterMessages를 넘고 + 토큰이 임계값 초과일 때만. + // 과거에는 turn>=2부터 매 턴 무조건 압축해 모델이 방금 읽은 파일·작업 맥락을 + // 즉시 잃고 헛돌았다(루프 재발). 이제 정말 길어질 때만 압축하고, 압축해도 + // 최근 keepRecentMessages 블록은 원본 유지한다. 
+ if (messages.length > compactAfterMessages) { + const msgTokens = countMessageTokens(messages); + if (msgTokens > compactTokenThreshold) { + onLog?.(`📦 Compacting history (${messages.length} msgs, ${msgTokens} tokens > ${compactTokenThreshold})`); + compactPriorTurns(messages, keepRecentMessages); + } } // API 호출 @@ -158,6 +238,21 @@ export async function runAgenticLoop(options: AgenticLoopOptions): Promise tc.function.name === 'edit_file' || tc.function.name === 'write_file', + ).length; - // 도구 결과를 메시지에 추가 + // 도구 결과를 메시지에 추가 (길이 초과 시 자동 truncate) for (const result of results) { + const content = truncateToolResult(result.content); messages.push({ role: 'tool', tool_call_id: result.tool_call_id, - content: result.content, + content, }); if (result.is_error) { - onLog?.(` ✖ ${result.content.slice(0, 100)}`); + onLog?.(` ✖ ${content.slice(0, 100)}`); + } + } + } + + // Final answer turn — maxTurns/타임아웃으로 끊겼는데 모델이 최종 텍스트를 못 낸 경우, + // 도구 없이 마지막 1회 호출로 결론을 강제한다. 이게 없으면 진단·분석형 작업이 + // 끝까지 도구만 호출하다 빈 결과("(no summary)")로 끝난다 — SWE 하이브리드 진단 + // 단계에서 발견된 결함. + if (!finalText && apiCallCount > 0) { + onLog?.('▸ Final answer turn (no tools) — loop ended without a final message'); + messages.push({ + role: 'user', + content: + 'Tool budget exhausted. Based on everything you have learned above, give your final ' + + 'answer NOW as plain text. Do not request any more tools.', + }); + try { + const response = await callApi(messages, []); + if (response.usage) { + totalTokens += response.usage.prompt_tokens + response.usage.completion_tokens; } + apiCallCount++; + finalText = response.choices?.[0]?.message?.content ?? ''; + } catch (err) { + onLog?.(`✖ Final answer turn failed: ${err instanceof Error ? 
err.message : String(err)}`); } } @@ -225,78 +348,71 @@ export function loopResultToCliResult(result: AgenticLoopResult): CliRunResult { }; } -// ============ 히스토리 압축 ============ +// ============ 히스토리 압축 (VEGA compaction.py 패턴 이식) ============ /** - * 최근 1턴(assistant + tool results)만 원본 유지. - * 그 이전 턴의 assistant+tool 쌍을 1줄 요약 assistant 메시지로 교체. + * 이전 턴(assistant+tool 쌍)을 요약 1줄로 교체. + * OpenAI API 제약: tool 메시지는 직전 assistant의 tool_call_id와 대응해야 하므로 + * 오래된 assistant+tool 쌍은 텍스트 요약으로 대체해 API 오류를 방지. * - * 구조 변환: - * [system, user, asst(tool_calls), tool, tool, asst(tool_calls), tool, ...] - * → [system, user, asst("Prior: read_file→ok, edit→ok"), asst(latest_tool_calls), tool, ...] - * - * OpenAI API 제약: tool 메시지는 직전 assistant의 tool_call_id와 대응해야 함. - * 따라서 오래된 tool 메시지는 삭제하고, 해당 assistant도 일반 텍스트로 교체. + * 보존 기준 (VEGA keep_recent): 최근 keepRecent개 메시지 블록은 항상 원본 유지. + * tool 메시지는 직전 assistant의 tool_call_id와 짝이 맞아야 하므로, 보존 경계는 + * keepRecent 지점 이후 첫 assistant로 정렬해 짝이 깨진 tool 메시지가 남지 않게 한다. + * 기존 [Prior turns compacted] 요약이 있으면 새 요약에 합산 후 교체. + * (테스트를 위해 export — 외부에서 직접 호출할 일은 없음) */ -function compactPriorTurns(messages: ChatMessage[]): void { +export function compactPriorTurns(messages: ChatMessage[], keepRecent = 8): void { const headerCount = messages[0]?.role === 'system' ? 
2 : 1; - // 마지막 assistant+tool 블록의 시작 인덱스 찾기 - let lastAssistantIdx = -1; - for (let i = messages.length - 1; i >= headerCount; i--) { - if (messages[i].role === 'assistant') { - lastAssistantIdx = i; - break; - } + // 최근 keepRecent개 메시지는 보존 — 압축 상한 인덱스 산출 + let boundary = Math.max(headerCount, messages.length - keepRecent); + // 보존 경계를 assistant 시작점으로 정렬 (orphan tool 메시지 방지) + while (boundary < messages.length && messages[boundary].role === 'tool') { + boundary++; } - if (lastAssistantIdx <= headerCount) return; // 압축할 이전 턴이 없음 + if (boundary <= headerCount) return; - // headerCount ~ lastAssistantIdx 사이의 모든 assistant+tool 쌍을 요약 const summaryParts: string[] = []; const toRemove: number[] = []; - for (let i = headerCount; i < lastAssistantIdx; i++) { + for (let i = headerCount; i < boundary; i++) { const msg = messages[i]; - if (msg.role === 'assistant' && msg.tool_calls && msg.tool_calls.length > 0) { - // tool call 요약: "read_file(src/foo.ts), edit_file(src/foo.ts)" - const callSummaries = msg.tool_calls.map(tc => { - try { - const args = JSON.parse(tc.function.arguments); - const key = args.path || args.pattern || args.command; - const short = typeof key === 'string' ? key.slice(0, 50) : ''; - return `${tc.function.name}(${short})`; - } catch { - return tc.function.name; - } - }); - summaryParts.push(callSummaries.join(', ')); + if (msg.role === 'assistant') { + if (msg.tool_calls && msg.tool_calls.length > 0) { + const calls = msg.tool_calls.map(tc => { + try { + const args = JSON.parse(tc.function.arguments); + const key = args.path || args.pattern || args.command; + const short = typeof key === 'string' ? key.slice(0, 40) : ''; + return `${tc.function.name}(${short})`; + } catch { + return tc.function.name; + } + }); + summaryParts.push(calls.join(', ')); + } else { + // 기존 compacted 요약이면 내용 그대로 흡수, 아니면 어시스턴트 설명 텍스트 보존 + const text = (msg.content ?? '').trim(); + if (text) summaryParts.push(text.startsWith('[Prior') ? 
text : `note: ${text.slice(0, 200)}`); + } toRemove.push(i); } else if (msg.role === 'tool') { - // tool result → 성공/실패만 기록 const ok = !msg.content.startsWith('BLOCKED') && !msg.content.startsWith('Tool error'); - const firstLine = msg.content.split('\n')[0].slice(0, 60); - summaryParts.push(ok ? `→ok` : `→err: ${firstLine}`); - toRemove.push(i); - } else if (msg.role === 'assistant' && !msg.tool_calls) { - // 일반 assistant 메시지 — 첫 줄만 유지 - const short = (msg.content ?? '').split('\n')[0].slice(0, 80); - if (short) summaryParts.push(`note: ${short}`); + const firstLine = msg.content.split('\n')[0].slice(0, 50); + summaryParts.push(ok ? '→ok' : `→err: ${firstLine}`); toRemove.push(i); } } if (toRemove.length === 0) return; - // 요약 메시지 생성 const summaryText = `[Prior turns compacted] ${summaryParts.join(' | ')}`; - // 역순으로 제거 (인덱스 안정성) for (let i = toRemove.length - 1; i >= 0; i--) { messages.splice(toRemove[i], 1); } - // header 직후에 요약 삽입 messages.splice(headerCount, 0, { role: 'assistant', content: summaryText, diff --git a/src/adapters/codex.test.ts b/src/adapters/codex.test.ts index 520f404..90dcfae 100644 --- a/src/adapters/codex.test.ts +++ b/src/adapters/codex.test.ts @@ -1,5 +1,5 @@ -import { describe, it, expect } from 'vitest'; -import { CodexCliAdapter } from './codex.js'; +import { describe, it, expect, vi } from 'vitest'; +import { CodexCliAdapter, coerceCodexModel } from './codex.js'; describe('CodexCliAdapter', () => { const adapter = new CodexCliAdapter(); @@ -18,6 +18,37 @@ describe('CodexCliAdapter', () => { expect(command).toContain("-m 'gpt-5-codex'"); }); + it('substitutes a claude model with the codex default and warns', () => { + const warn = vi.spyOn(console, 'warn').mockImplementation(() => {}); + try { + const { command } = adapter.buildCommand({ + prompt: '/tmp/prompt.txt', + cwd: '/tmp/project', + model: 'claude-sonnet-4-20250514', + }); + // Should not pass the claude model through to the codex CLI. 
+ expect(command).not.toContain('claude-sonnet'); + expect(command).toContain("-m 'gpt-5-codex'"); + // Warning emitted at least once for this model name. + const messages = warn.mock.calls.map((c) => String(c[0])); + expect(messages.some((m) => m.includes('claude-sonnet-4-20250514'))).toBe(true); + } finally { + warn.mockRestore(); + } + }); + + it('coerceCodexModel passes OpenAI model names through unchanged', () => { + expect(coerceCodexModel('gpt-5-codex')).toBe('gpt-5-codex'); + expect(coerceCodexModel('o3')).toBe('o3'); + expect(coerceCodexModel('gpt-4o')).toBe('gpt-4o'); + }); + + it('coerceCodexModel rewrites every claude-* variant', () => { + expect(coerceCodexModel('claude-opus-4-6')).toBe('gpt-5-codex'); + expect(coerceCodexModel('claude-haiku-4-5-20251001')).toBe('gpt-5-codex'); + expect(coerceCodexModel('Claude-Sonnet-4')).toBe('gpt-5-codex'); + }); + it('parses worker output from codex json events', () => { const raw = { exitCode: 0, diff --git a/src/adapters/codex.ts b/src/adapters/codex.ts index 18adbe3..72296b9 100644 --- a/src/adapters/codex.ts +++ b/src/adapters/codex.ts @@ -39,7 +39,8 @@ export class CodexCliAdapter implements CliAdapter { buildCommand(options: CliRunOptions): { command: string; args: string[] } { const promptFile = options.prompt; - const modelFlag = options.model ? ` -m ${shellEscape(options.model)}` : ''; + const resolvedModel = options.model ? coerceCodexModel(options.model) : undefined; + const modelFlag = resolvedModel ? ` -m ${shellEscape(resolvedModel)}` : ''; const cmd = `cat ${shellEscape(promptFile)} | codex exec --json --full-auto --skip-git-repo-check${modelFlag}`; return { command: cmd, args: [] }; } @@ -77,6 +78,36 @@ function shellEscape(value: string): string { return `'${value.replace(/'/g, `'\\''`)}'`; } +/** + * Codex (ChatGPT account mode) only accepts OpenAI-family models. 
When a + * pipeline role was configured with a Claude model and the operator later + * switched the global adapter to `codex`, the CLI returns: + * 400 invalid_request_error: The 'claude-...' model is not supported when + * using Codex with a ChatGPT account. + * + * Rather than letting the request fail, transparently substitute the Codex + * default and log a warning so the operator can see what happened and either + * fix their config or accept the substitution. + */ +const CODEX_DEFAULT_MODEL = 'gpt-5-codex'; +const warnedAboutModel = new Set(); + +export function coerceCodexModel(requested: string): string { + if (!isClaudeModel(requested)) return requested; + if (!warnedAboutModel.has(requested)) { + warnedAboutModel.add(requested); + console.warn( + `[CodexAdapter] '${requested}' is a Claude model and is not accepted by codex with a ChatGPT account. ` + + `Substituting '${CODEX_DEFAULT_MODEL}'. Set worker/reviewer model explicitly in config.yaml to silence this.`, + ); + } + return CODEX_DEFAULT_MODEL; +} + +function isClaudeModel(name: string): boolean { + return /^claude[-_]/i.test(name); +} + function extractCodexMessageText(output: string): string { let lastMessage = ''; @@ -270,8 +301,10 @@ function extractWorkerResultJson(text: string): WorkerResult | null { } function extractWorkerFromText(text: string): WorkerResult { - const hasError = /error|fail|exception|cannot/i.test(text); - const hasSuccess = /success|completed|done|finished/i.test(text); + // Only an explicit failure phrase marks the run as failed. Loose "error"/"fail" + // words appear in normal coding prose; git-diff promotion in worker.ts is the + // real success signal. 
+ const failed = isExplicitFailure(text); const filePatterns = [ /(?:changed?|modified?|created?|updated?):\s*(.+\.(?:ts|js|py|json|yaml|yml|md))/gi, @@ -299,15 +332,21 @@ function extractWorkerFromText(text: string): WorkerResult { } return { - success: !hasError || hasSuccess, + success: !failed, summary: extractSummary(text), filesChanged: filesChanged.slice(0, 10), commands: commands.slice(0, 10), output: text, - error: hasError ? extractErrorMessage(text) : undefined, + error: failed ? extractErrorMessage(text) : undefined, }; } +// Detect a real failure declaration, not incidental "error"/"fail" prose (see gpt.ts). +function isExplicitFailure(text: string): boolean { + if (/"success"\s*:\s*false/i.test(text)) return true; + return /\b(failed to|unable to|could not|couldn['’]t|cannot (?:complete|finish|proceed|continue)|giving up|abort(?:ed|ing))\b/i.test(text); +} + function extractSummary(text: string): string { const lines = text.split('\n').filter((l) => l.trim().length > 10); if (lines.length === 0) return t('common.fallback.noSummary'); diff --git a/src/adapters/gpt.ts b/src/adapters/gpt.ts index b3b2393..d7a8780 100644 --- a/src/adapters/gpt.ts +++ b/src/adapters/gpt.ts @@ -221,16 +221,19 @@ function extractWorkerResultJson(text: string): WorkerResult | null { } function extractWorkerFromText(text: string): WorkerResult { - const hasError = /error|fail|exception|cannot/i.test(text); - const hasSuccess = /success|completed|done|finished/i.test(text); + // Only an explicit failure phrase marks the run as failed. Loose words like + // "error" or "fail" appear in normal coding prose ("error handling", "the + // failing test") and used to cause false negatives. git-diff promotion in + // worker.ts is the real success signal; this is just the non-repo fallback. 
+ const failed = isExplicitFailure(text); return { - success: !hasError || hasSuccess, + success: !failed, summary: extractSummary(text), filesChanged: [], commands: [], output: text, - error: hasError ? extractErrorMessage(text) : undefined, + error: failed ? extractErrorMessage(text) : undefined, }; } @@ -297,6 +300,14 @@ function findJsonObject(text: string, marker: string): string | null { return null; } +// Detect a real failure declaration, not incidental "error"/"fail" prose. +// Matches explicit statements like "failed to", "unable to", "could not", +// "cannot complete", or an explicit JSON success:false. +function isExplicitFailure(text: string): boolean { + if (/"success"\s*:\s*false/i.test(text)) return true; + return /\b(failed to|unable to|could not|couldn['’]t|cannot (?:complete|finish|proceed|continue)|giving up|abort(?:ed|ing))\b/i.test(text); +} + function extractSummary(text: string): string { const lines = text.split('\n').filter((l) => l.trim().length > 10); if (lines.length === 0) return t('common.fallback.noSummary'); diff --git a/src/adapters/index.ts b/src/adapters/index.ts index 733310e..1292432 100644 --- a/src/adapters/index.ts +++ b/src/adapters/index.ts @@ -15,32 +15,32 @@ export type { } from './types.js'; export { spawnCli } from './base.js'; -export { ClaudeCliAdapter } from './claude.js'; export { CodexCliAdapter } from './codex.js'; export { GptCliAdapter } from './gpt.js'; export { LocalModelAdapter } from './local.js'; export { LmStudioAdapter } from './lmstudio.js'; +export { OpenRouterCliAdapter } from './openrouter.js'; export { registerProcess, getProcess, getAllProcesses, killProcess, startHealthChecker, stopHealthChecker } from './processRegistry.js'; -import { ClaudeCliAdapter } from './claude.js'; import { CodexCliAdapter } from './codex.js'; import { GptCliAdapter } from './gpt.js'; import { LocalModelAdapter } from './local.js'; import { LmStudioAdapter } from './lmstudio.js'; +import { OpenRouterCliAdapter } from 
'./openrouter.js'; import type { AdapterName, CliAdapter } from './types.js'; const adapters: Record = { - claude: new ClaudeCliAdapter(), codex: new CodexCliAdapter(), gpt: new GptCliAdapter(), local: new LocalModelAdapter(), lmstudio: new LmStudioAdapter(), + openrouter: new OpenRouterCliAdapter(), }; -let defaultAdapter: AdapterName = 'claude'; +let defaultAdapter: AdapterName = 'codex'; /** - * Get an adapter by name. Defaults to 'claude'. + * Get an adapter by name. Defaults to 'codex'. */ export function getAdapter(name: string = defaultAdapter): CliAdapter { const adapter = adapters[name]; diff --git a/src/adapters/local.ts b/src/adapters/local.ts index a757c65..ec6c25e 100644 --- a/src/adapters/local.ts +++ b/src/adapters/local.ts @@ -328,16 +328,17 @@ function extractWorkerResultJson(text: string): WorkerResult | null { } function extractWorkerFromText(text: string): WorkerResult { - const hasError = /error|fail|exception|cannot/i.test(text); - const hasSuccess = /success|completed|done|finished/i.test(text); + // Only an explicit failure phrase marks the run as failed (see gpt.ts). + // git-diff promotion in worker.ts is the real success signal. + const failed = isExplicitFailure(text); return { - success: !hasError || hasSuccess, + success: !failed, summary: extractSummary(text), filesChanged: [], commands: [], output: text, - error: hasError ? extractErrorMessage(text) : undefined, + error: failed ? extractErrorMessage(text) : undefined, }; } @@ -397,6 +398,12 @@ function findJsonObject(text: string, marker: string): string | null { return null; } +// Detect a real failure declaration, not incidental "error"/"fail" prose (see gpt.ts). 
+function isExplicitFailure(text: string): boolean { + if (/"success"\s*:\s*false/i.test(text)) return true; + return /\b(failed to|unable to|could not|couldn['’]t|cannot (?:complete|finish|proceed|continue)|giving up|abort(?:ed|ing))\b/i.test(text); +} + function extractSummary(text: string): string { const lines = text.split('\n').filter(l => l.trim().length > 10); if (lines.length === 0) return t('common.fallback.noSummary'); diff --git a/src/adapters/openrouter.test.ts b/src/adapters/openrouter.test.ts new file mode 100644 index 0000000..bedde1c --- /dev/null +++ b/src/adapters/openrouter.test.ts @@ -0,0 +1,192 @@ +// ============================================ +// OpenSwarm - OpenRouter Adapter Tests +// Created: 2026-05-27 +// Purpose: Verify OpenRouter adapter wiring + API call shape +// Dependencies: vitest +// Test Status: npm run test -- src/adapters/openrouter.test.ts +// ============================================ + +import { afterEach, describe, expect, it, vi } from 'vitest'; +import { OpenRouterCliAdapter, createApiCaller, applyPromptCaching } from './openrouter.js'; +import { getAdapter } from './index.js'; +import type { ChatMessage } from './agenticLoop.js'; + +describe('OpenRouterCliAdapter', () => { + afterEach(() => { + vi.restoreAllMocks(); + }); + + it('registers as a named adapter', () => { + const adapter = getAdapter('openrouter'); + expect(adapter.name).toBe('openrouter'); + expect(adapter.capabilities.supportsModelSelection).toBe(true); + expect(adapter.capabilities.supportsJsonOutput).toBe(true); + }); + + it('reports unavailable when no profile is stored', async () => { + const adapter = new OpenRouterCliAdapter(); + // Without a stored sk-or-* key, isAvailable should be false. + // We don't write a profile in this test, so the default ~/.openswarm store + // either lacks the key or returns null — either way the adapter is unavailable. 
+ const available = await adapter.isAvailable(); + expect(typeof available).toBe('boolean'); + }); + + it('calls /chat/completions with Bearer auth and attribution headers', async () => { + const fetchMock = vi.fn(async () => + new Response( + JSON.stringify({ + choices: [{ message: { role: 'assistant', content: 'hi' }, finish_reason: 'stop' }], + usage: { prompt_tokens: 1, completion_tokens: 1, total_tokens: 2 }, + }), + { status: 200 }, + ), + ); + vi.stubGlobal('fetch', fetchMock); + + const callApi = createApiCaller('sk-or-test-key', 'anthropic/claude-sonnet-4'); + const response = await callApi( + [{ role: 'user', content: 'ping' }], + [], + ); + + expect(response.choices[0].message.content).toBe('hi'); + expect(fetchMock).toHaveBeenCalledTimes(1); + + const [url, init] = fetchMock.mock.calls[0] as [string, RequestInit]; + expect(url).toBe('https://openrouter.ai/api/v1/chat/completions'); + const headers = init.headers as Record; + expect(headers.Authorization).toBe('Bearer sk-or-test-key'); + expect(headers['HTTP-Referer']).toContain('openswarm'); + expect(headers['X-Title']).toBe('OpenSwarm'); + + const body = JSON.parse(init.body as string); + expect(body.model).toBe('anthropic/claude-sonnet-4'); + expect(body.messages).toEqual([{ role: 'user', content: 'ping' }]); + expect(body.tools).toBeUndefined(); + }); + + it('includes tools when the agentic loop provides them', async () => { + const fetchMock = vi.fn(async () => + new Response( + JSON.stringify({ + choices: [{ message: { role: 'assistant', content: 'ok' }, finish_reason: 'stop' }], + }), + { status: 200 }, + ), + ); + vi.stubGlobal('fetch', fetchMock); + + const callApi = createApiCaller('sk-or-test-key', 'openai/gpt-4o'); + await callApi( + [{ role: 'user', content: 'use tools' }], + [ + { + type: 'function', + function: { name: 'read_file', description: 'read', parameters: { type: 'object' } }, + }, + ], + ); + + const [, init] = fetchMock.mock.calls[0] as [string, RequestInit]; + const body = 
JSON.parse(init.body as string); + expect(body.tools).toHaveLength(1); + expect(body.tools[0].function.name).toBe('read_file'); + }); + + it('sends ZDR (provider.data_collection: deny) for non-OpenAI models', async () => { + const fetchMock = vi.fn(async () => + new Response(JSON.stringify({ choices: [{ message: { role: 'assistant', content: 'x' }, finish_reason: 'stop' }] }), { status: 200 }), + ); + vi.stubGlobal('fetch', fetchMock); + const callApi = createApiCaller('sk-or-test', 'z-ai/glm-4.7-flash'); + await callApi([{ role: 'user', content: 'hi' }], []); + const body = JSON.parse((fetchMock.mock.calls[0][1] as RequestInit).body as string); + expect(body.provider).toEqual({ data_collection: 'deny' }); + expect(body.reasoning).toBeUndefined(); // not disabled unless requested + }); + + it('does NOT send ZDR for OpenAI models (they reject data_collection:deny)', async () => { + const fetchMock = vi.fn(async () => + new Response(JSON.stringify({ choices: [{ message: { role: 'assistant', content: 'x' }, finish_reason: 'stop' }] }), { status: 200 }), + ); + vi.stubGlobal('fetch', fetchMock); + const callApi = createApiCaller('sk-or-test', 'openai/gpt-5'); + await callApi([{ role: 'user', content: 'hi' }], []); + const body = JSON.parse((fetchMock.mock.calls[0][1] as RequestInit).body as string); + expect(body.provider).toBeUndefined(); + }); + + it('disables reasoning for non-OpenAI models when requested', async () => { + const fetchMock = vi.fn(async () => + new Response(JSON.stringify({ choices: [{ message: { role: 'assistant', content: 'x' }, finish_reason: 'stop' }] }), { status: 200 }), + ); + vi.stubGlobal('fetch', fetchMock); + const callApi = createApiCaller('sk-or-test', 'z-ai/glm-4.7-flash', { disableReasoning: true }); + await callApi([{ role: 'user', content: 'hi' }], []); + const body = JSON.parse((fetchMock.mock.calls[0][1] as RequestInit).body as string); + expect(body.reasoning).toEqual({ enabled: false }); + }); + + it('does NOT disable reasoning 
for OpenAI models (mandatory; would 400)', async () => { + const fetchMock = vi.fn(async () => + new Response(JSON.stringify({ choices: [{ message: { role: 'assistant', content: 'x' }, finish_reason: 'stop' }] }), { status: 200 }), + ); + vi.stubGlobal('fetch', fetchMock); + // gpt-5 is the worker escalate target — disableReasoning must be ignored for it. + const callApi = createApiCaller('sk-or-test', 'openai/gpt-5', { disableReasoning: true }); + await callApi([{ role: 'user', content: 'hi' }], []); + const body = JSON.parse((fetchMock.mock.calls[0][1] as RequestInit).body as string); + expect(body.reasoning).toBeUndefined(); + }); + + it('leaves OpenAI/Gemini messages untouched (auto-cached by OpenRouter)', () => { + const msgs: ChatMessage[] = [ + { role: 'system', content: 'sys' }, + { role: 'user', content: 'u1' }, + { role: 'assistant', content: 'a1' }, + { role: 'user', content: 'u2' }, + ]; + const out = applyPromptCaching(msgs, 'openai/gpt-5'); + expect(out).toBe(msgs); // same reference, no transform + }); + + it('inserts cache_control breakpoints for Anthropic models', () => { + const msgs: ChatMessage[] = [ + { role: 'system', content: 'sys' }, + { role: 'user', content: 'u1' }, + { role: 'assistant', content: 'a1' }, + { role: 'user', content: 'u2' }, // last msg — NOT cached (changes every turn) + ]; + const out = applyPromptCaching(msgs, 'anthropic/claude-sonnet-4') as Array>; + + // system (idx 0) and length-2 (idx 2, the assistant) get cache markers. 
+ const sysContent = out[0].content as Array>; + expect(Array.isArray(sysContent)).toBe(true); + expect(sysContent[0].cache_control).toEqual({ type: 'ephemeral' }); + + const cachedAssistant = out[2].content as Array>; + expect(cachedAssistant[0].cache_control).toEqual({ type: 'ephemeral' }); + + // last message stays a plain string (no breakpoint on the volatile tail) + expect(typeof out[3].content).toBe('string'); + }); + + it('does not transform a single-message history (no stable prefix to cache)', () => { + const msgs: ChatMessage[] = [{ role: 'user', content: 'only' }]; + const out = applyPromptCaching(msgs, 'anthropic/claude-sonnet-4'); + expect(out).toEqual(msgs); + }); + + it('throws on non-2xx responses with status code in the error message', async () => { + const fetchMock = vi.fn(async () => + new Response('rate limited', { status: 429 }), + ); + vi.stubGlobal('fetch', fetchMock); + + const callApi = createApiCaller('sk-or-test-key', 'openai/gpt-4o'); + await expect( + callApi([{ role: 'user', content: 'x' }], []), + ).rejects.toThrow(/OpenRouter API error \(429\)/); + }); +}); diff --git a/src/adapters/openrouter.ts b/src/adapters/openrouter.ts new file mode 100644 index 0000000..fb36860 --- /dev/null +++ b/src/adapters/openrouter.ts @@ -0,0 +1,359 @@ +// ============================================ +// OpenSwarm - OpenRouter CLI Adapter +// Calls the OpenRouter Chat Completions API (OpenAI-compatible schema) +// using a stored sk-or-* key from `openswarm auth login --provider openrouter`. 
+// ============================================ + +import type { + CliAdapter, + CliRunOptions, + CliRunResult, + AdapterCapabilities, + WorkerResult, + ReviewResult, +} from './types.js'; +import { AuthProfileStore, ensureValidToken } from '../auth/index.js'; +import { t } from '../locale/index.js'; +import { + runAgenticLoop, + loopResultToCliResult, + type ChatMessage, + type AgenticLoopOptions, +} from './agenticLoop.js'; +import type { ToolDefinition } from './tools.js'; + +const OPENROUTER_API_BASE = 'https://openrouter.ai/api/v1'; +const DEFAULT_MODEL = 'openai/gpt-5'; +const PROFILE_KEY = 'openrouter:default'; + +/** OPENROUTER_API env var → immediate API key (no PKCE needed). */ +function getEnvApiKey(): string | undefined { + return process.env.OPENROUTER_API?.trim() || undefined; +} + +// Attribution headers — OpenRouter surfaces these in its analytics UI so +// model providers can see traffic originating from OpenSwarm. +const ATTRIBUTION_HEADERS: Record = { + 'HTTP-Referer': 'https://github.com/unohee/openswarm', + 'X-Title': 'OpenSwarm', +}; + +export class OpenRouterCliAdapter implements CliAdapter { + readonly name = 'openrouter'; + + readonly capabilities: AdapterCapabilities = { + supportsStreaming: false, + supportsJsonOutput: true, + supportsModelSelection: true, + managedGit: false, + supportedSkills: [], + }; + + async isAvailable(): Promise { + if (getEnvApiKey()) return true; + try { + const store = new AuthProfileStore(); + return store.getProfile(PROFILE_KEY) !== null; + } catch { + return false; + } + } + + buildCommand(_options: CliRunOptions): { command: string; args: string[] } { + // 어댑터가 직접 fetch하므로 spawn 진입점은 미사용. + return { command: 'echo', args: ['"OpenRouter adapter uses run() — not shell spawn"'] }; + } + + async run(options: CliRunOptions): Promise { + const startTime = Date.now(); + + // Prefer OPENROUTER_API env var (e.g. 
sourced from VEGA .env) + let apiKey: string | undefined = getEnvApiKey(); + if (!apiKey) { + const store = new AuthProfileStore(); + try { + apiKey = await ensureValidToken(store, PROFILE_KEY); + } catch (err) { + return { + exitCode: 1, + stdout: '', + stderr: `Auth error: ${err instanceof Error ? err.message : String(err)}. Set OPENROUTER_API env var or run: openswarm auth login --provider openrouter`, + durationMs: Date.now() - startTime, + }; + } + } + + const model = options.model ?? DEFAULT_MODEL; + const callApi = createApiCaller(apiKey, model, { + disableReasoning: options.disableReasoning, + }); + + const loopOptions: AgenticLoopOptions = { + systemPrompt: options.systemPrompt, + prompt: options.prompt, + cwd: options.cwd ?? process.cwd(), + model, + callApi, + maxTurns: options.maxTurns ?? 20, + timeoutMs: options.timeoutMs || 300000, + onLog: options.onLog, + enableTools: true, + nudgeMaxOnNoEdit: options.nudgeMaxOnNoEdit, + protectedFiles: options.protectedFiles, + bashTimeoutMs: options.bashTimeoutMs, + }; + + try { + const result = await runAgenticLoop(loopOptions); + options.onLog?.( + `[OpenRouter] ${result.apiCallCount} API calls, ${result.toolCallCount} tool uses, ${result.totalTokens} tokens`, + ); + return loopResultToCliResult(result); + } catch (err) { + return { + exitCode: 1, + stdout: '', + stderr: `OpenRouter agentic loop failed: ${err instanceof Error ? err.message : String(err)}`, + durationMs: Date.now() - startTime, + }; + } + } + + parseWorkerOutput(raw: CliRunResult): WorkerResult { + return extractWorkerResultJson(raw.stdout) ?? extractWorkerFromText(raw.stdout); + } + + parseReviewerOutput(raw: CliRunResult): ReviewResult { + return extractReviewerResultJson(raw.stdout) ?? 
extractReviewerFromText(raw.stdout); + } +} + +// ----- API caller ----- + +interface OpenRouterChatResponse { + choices: Array<{ + message: { + content: string | null; + role: string; + tool_calls?: Array<{ + id: string; + type: 'function'; + function: { name: string; arguments: string }; + }>; + }; + finish_reason: string; + }>; + usage?: { + prompt_tokens: number; + completion_tokens: number; + total_tokens: number; + }; +} + +export interface ApiCallerOptions { + /** worker 등 기계적 역할: 추론 토큰 비활성화 (지원 모델 한정) */ + disableReasoning?: boolean; +} + +export function createApiCaller(apiKey: string, model: string, opts: ApiCallerOptions = {}) { + return async (messages: ChatMessage[], tools: ToolDefinition[]) => { + const body: Record = { + model, + messages: applyPromptCaching(messages, model), + temperature: 0.2, + max_tokens: 16384, + }; + // ZDR(Zero Data Retention) — 데이터를 보존하지 않는 provider로만 라우팅. + // 단, OpenAI provider는 data_collection:deny 플래그를 거부("Provider returned + // error")하므로 제외한다. OpenAI는 API 데이터를 학습에 쓰지 않아(정책상) ZDR + // 강제가 불필요하다. non-OpenAI 모델에만 적용한다. + if (!/^openai\//i.test(model)) { + body.provider = { data_collection: 'deny' }; + } + // 추론 불필요 역할은 reasoning 토큰을 끈다. glm-4.7-flash처럼 non-thinking + // 모델엔 무영향, 추론형 모델(glm-5 등)을 worker로 바꿔도 토큰 낭비를 막는다. + // 단, OpenAI 추론 모델(gpt-5 등)은 "Reasoning is mandatory"로 이 플래그를 + // 거부하므로 제외한다 — worker escalate 대상이 gpt-5라 이걸 안 빼면 escalation이 + // 항상 깨진다. OpenAI는 단순 작업엔 추론을 자동 최소화하므로 끌 필요도 없다. 
+ if (opts.disableReasoning && !/^openai\//i.test(model)) { + body.reasoning = { enabled: false }; + } + if (tools.length > 0) { + body.tools = tools; + } + + const res = await fetch(`${OPENROUTER_API_BASE}/chat/completions`, { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + ...ATTRIBUTION_HEADERS, + }, + body: JSON.stringify(body), + }); + + if (!res.ok) { + const errText = await res.text().catch(() => ''); + throw new Error(`OpenRouter API error (${res.status}): ${errText.slice(0, 500)}`); + } + + return (await res.json()) as OpenRouterChatResponse; + }; +} + +/** + * Prompt caching breakpoint 삽입. + * + * OpenAI/Gemini 모델은 OpenRouter가 자동 캐싱하므로 메시지를 건드리지 않는다. + * Anthropic 모델은 명시적 cache_control breakpoint가 필요하다 — 매 API 호출마다 + * 전체 히스토리가 재전송되는데, 시스템 프롬프트 + 직전 누적 히스토리는 턴마다 + * 거의 동일하므로 그 경계에 ephemeral 캐시 마커를 두면 입력 토큰이 ~90% 할인된다. + * + * breakpoint 2개: (1) 시스템 메시지 끝, (2) 마지막 user/tool 메시지 직전 경계. + * Anthropic은 최대 4개 breakpoint를 허용하므로 2개는 안전하다. + */ +export function applyPromptCaching(messages: ChatMessage[], model: string): unknown[] { + // OpenAI/Gemini 등은 자동 캐싱 — 변환 불필요 (cache_control을 넣으면 거부될 수 있음) + if (!/anthropic\/|claude/i.test(model)) { + return messages; + } + + // 캐시 마커를 달 인덱스: 시스템 메시지(있으면) + 마지막 직전 메시지. + // 마지막 메시지(가장 최근 tool 결과)는 매 턴 바뀌므로 캐시하지 않는다. 
+ const cacheable = new Set(); + if (messages[0]?.role === 'system') cacheable.add(0); + if (messages.length >= 2) cacheable.add(messages.length - 2); + + return messages.map((m, i) => { + if (!cacheable.has(i) || typeof m.content !== 'string' || !m.content) { + return m; + } + // string content → content-part 배열로 변환하며 마지막 파트에 cache_control 부착 + return { + ...m, + content: [ + { type: 'text', text: m.content, cache_control: { type: 'ephemeral' } }, + ], + }; + }); +} + +// ----- Worker/Reviewer output parsing (mirrors gpt.ts) ----- + +function extractWorkerResultJson(text: string): WorkerResult | null { + const jsonMatch = text.match(/```json\s*([\s\S]*?)\s*```/); + const jsonStr = jsonMatch?.[1] ?? findJsonObject(text, '"success"'); + if (!jsonStr) return null; + + try { + const parsed = JSON.parse(jsonStr); + return { + success: Boolean(parsed.success), + summary: parsed.summary || t('common.fallback.noSummary'), + filesChanged: Array.isArray(parsed.filesChanged) ? parsed.filesChanged : [], + commands: Array.isArray(parsed.commands) ? parsed.commands : [], + output: text, + error: parsed.error, + confidencePercent: + typeof parsed.confidencePercent === 'number' ? parsed.confidencePercent : undefined, + haltReason: parsed.haltReason || undefined, + }; + } catch { + return null; + } +} + +function extractWorkerFromText(text: string): WorkerResult { + // Only an explicit failure phrase marks the run as failed (see gpt.ts). + // git-diff promotion in worker.ts is the real success signal. + const failed = isExplicitFailure(text); + + return { + success: !failed, + summary: extractSummary(text), + filesChanged: [], + commands: [], + output: text, + error: failed ? extractErrorMessage(text) : undefined, + }; +} + +function extractReviewerResultJson(text: string): ReviewResult | null { + const jsonMatch = text.match(/```json\s*([\s\S]*?)\s*```/); + const jsonStr = jsonMatch?.[1] ?? 
findJsonObject(text, '"decision"'); + if (!jsonStr) return null; + + try { + const parsed = JSON.parse(jsonStr); + const decision = + parsed.decision === 'approve' || parsed.decision === 'reject' ? parsed.decision : 'revise'; + return { + decision, + feedback: + typeof parsed.feedback === 'string' ? parsed.feedback : t('common.fallback.noSummary'), + issues: Array.isArray(parsed.issues) + ? parsed.issues.filter((v: unknown): v is string => typeof v === 'string') + : [], + suggestions: Array.isArray(parsed.suggestions) + ? parsed.suggestions.filter((v: unknown): v is string => typeof v === 'string') + : [], + }; + } catch { + return null; + } +} + +function extractReviewerFromText(text: string): ReviewResult { + const lower = text.toLowerCase(); + const decision = lower.includes('approve') + ? 'approve' + : lower.includes('reject') + ? 'reject' + : 'revise'; + return { + decision, + feedback: extractSummary(text), + issues: [], + suggestions: [], + }; +} + +function findJsonObject(text: string, marker: string): string | null { + const idx = text.indexOf(marker); + if (idx < 0) return null; + + const start = text.lastIndexOf('{', idx); + if (start < 0) return null; + + let depth = 0; + for (let i = start; i < text.length; i++) { + if (text[i] === '{') depth++; + if (text[i] === '}') { + depth--; + if (depth === 0) { + return text.slice(start, i + 1); + } + } + } + return null; +} + +// Detect a real failure declaration, not incidental "error"/"fail" prose (see gpt.ts). +function isExplicitFailure(text: string): boolean { + if (/"success"\s*:\s*false/i.test(text)) return true; + return /\b(failed to|unable to|could not|couldn['’]t|cannot (?:complete|finish|proceed|continue)|giving up|abort(?:ed|ing))\b/i.test(text); +} + +function extractSummary(text: string): string { + const lines = text.split('\n').filter((l) => l.trim().length > 10); + if (lines.length === 0) return t('common.fallback.noSummary'); + const summary = lines[0].trim(); + return summary.length > 200 ? 
`${summary.slice(0, 200)}...` : summary; +} + +function extractErrorMessage(text: string): string { + const errorMatch = text.match(/(?:error|exception|failed?):\s*(.+)/i); + if (errorMatch) return errorMatch[1].slice(0, 200); + const lines = text.split('\n').filter((l) => /error|fail/i.test(l)); + return lines.length > 0 ? lines[0].slice(0, 200) : 'Unknown error'; +} diff --git a/src/adapters/tools.test.ts b/src/adapters/tools.test.ts index ebbf920..1ae7daf 100644 --- a/src/adapters/tools.test.ts +++ b/src/adapters/tools.test.ts @@ -2,7 +2,7 @@ import { describe, it, expect, beforeAll, afterAll } from 'vitest'; import fs from 'node:fs/promises'; import path from 'node:path'; import { execFileSync } from 'node:child_process'; -import { TOOL_DEFINITIONS, executeTool, ToolCall } from './tools.js'; +import { TOOL_DEFINITIONS, executeTool, createReadCache, ToolCall } from './tools.js'; // Check if rg binary is available (not just a shell function wrapper) let hasRg = false; @@ -264,7 +264,8 @@ describe('Path validation', () => { TMP_DIR, ); expect(result.is_error).toBe(true); - expect(result.content).toContain('Path outside project'); + // 거부 메시지는 모델 자가수정을 돕도록 안내형 — "outside the project root" 포함. + expect(result.content).toContain('outside the project root'); }); it('allows paths under /tmp', async () => { @@ -280,3 +281,126 @@ describe('Path validation', () => { expect(result.content).toContain('ok'); }); }); + +// ────────────────────────────────────────────── +// 3. 
ReadCache — token-saving read deduplication +// ────────────────────────────────────────────── + +describe('ReadCache', () => { + it('returns cached content marked unchanged on a repeated read', async () => { + const filePath = path.join(TMP_DIR, 'cache-a.txt'); + await fs.writeFile(filePath, 'hello\nworld\n'); + const cache = createReadCache(); + + const first = await executeTool(makeCall('read_file', { path: filePath }), TMP_DIR, cache); + expect(first.content).toContain('hello'); + expect(first.content).not.toContain('unchanged'); + + const second = await executeTool(makeCall('read_file', { path: filePath }), TMP_DIR, cache); + expect(second.content).toContain('unchanged since last read'); + expect(second.content).toContain('hello'); // still carries the content + }); + + it('invalidates the cache after edit_file so the next read is fresh', async () => { + const filePath = path.join(TMP_DIR, 'cache-b.txt'); + await fs.writeFile(filePath, 'foo = 1\n'); + const cache = createReadCache(); + + await executeTool(makeCall('read_file', { path: filePath }), TMP_DIR, cache); + await executeTool( + makeCall('edit_file', { path: filePath, old_string: 'foo = 1', new_string: 'foo = 2' }), + TMP_DIR, + cache, + ); + + const afterEdit = await executeTool(makeCall('read_file', { path: filePath }), TMP_DIR, cache); + expect(afterEdit.content).not.toContain('unchanged'); + expect(afterEdit.content).toContain('foo = 2'); + }); + + it('edit_file returns the resulting region so a re-read is unnecessary', async () => { + const filePath = path.join(TMP_DIR, 'cache-c.txt'); + await fs.writeFile(filePath, 'line1\ntarget\nline3\n'); + const cache = createReadCache(); + + const edit = await executeTool( + makeCall('edit_file', { path: filePath, old_string: 'target', new_string: 'fixed' }), + TMP_DIR, + cache, + ); + expect(edit.is_error).toBe(false); + expect(edit.content).toContain('Resulting region'); + expect(edit.content).toContain('fixed'); + }); + + it('caches by path+range so 
different offsets are not confused', async () => { + const filePath = path.join(TMP_DIR, 'cache-d.txt'); + await fs.writeFile(filePath, Array.from({ length: 20 }, (_, i) => `line${i + 1}`).join('\n') + '\n'); + const cache = createReadCache(); + + const head = await executeTool(makeCall('read_file', { path: filePath, offset: 0, limit: 5 }), TMP_DIR, cache); + const tail = await executeTool(makeCall('read_file', { path: filePath, offset: 10, limit: 5 }), TMP_DIR, cache); + // Different range → not served from cache + expect(tail.content).not.toContain('unchanged'); + expect(head.content).toContain('line1'); + expect(tail.content).toContain('line11'); + }); +}); + +// ────────────────────────────────────────────── +// ToolExecOptions — verification harness protection +// ────────────────────────────────────────────── + +describe('ToolExecOptions', () => { + it('edit_file refuses protected files with guidance back to source code', async () => { + const filePath = path.join(TMP_DIR, 'run_tests.sh'); + await fs.writeFile(filePath, '#!/bin/bash\necho ok\n'); + + const res = await executeTool( + makeCall('edit_file', { path: filePath, old_string: 'echo ok', new_string: 'echo hacked' }), + TMP_DIR, + undefined, + { protectedFiles: ['run_tests.sh'] }, + ); + expect(res.is_error).toBe(true); + expect(res.content).toContain('PROTECTED'); + expect(await fs.readFile(filePath, 'utf-8')).toContain('echo ok'); + }); + + it('write_file refuses protected files', async () => { + const filePath = path.join(TMP_DIR, 'run_tests.sh'); + const res = await executeTool( + makeCall('write_file', { path: filePath, content: 'overwritten' }), + TMP_DIR, + undefined, + { protectedFiles: ['run_tests.sh'] }, + ); + expect(res.is_error).toBe(true); + expect(res.content).toContain('PROTECTED'); + }); + + it('edit_file still works on non-protected files when protection is active', async () => { + const filePath = path.join(TMP_DIR, 'source.py'); + await fs.writeFile(filePath, 'x = 1\n'); + const res 
= await executeTool( + makeCall('edit_file', { path: filePath, old_string: 'x = 1', new_string: 'x = 2' }), + TMP_DIR, + undefined, + { protectedFiles: ['run_tests.sh'] }, + ); + expect(res.is_error).toBe(false); + expect(await fs.readFile(filePath, 'utf-8')).toContain('x = 2'); + }); + + it('bash reports TIMEOUT explicitly instead of a silent failure', async () => { + const res = await executeTool( + makeCall('bash', { command: 'sleep 5' }), + TMP_DIR, + undefined, + { bashTimeoutMs: 300 }, + ); + expect(res.is_error).toBe(true); + expect(res.content).toContain('TIMEOUT'); + expect(res.content).toContain('NOT evidence'); + }); +}); diff --git a/src/adapters/tools.ts b/src/adapters/tools.ts index cbf6c86..17d24ed 100644 --- a/src/adapters/tools.ts +++ b/src/adapters/tools.ts @@ -139,12 +139,60 @@ export interface ToolResult { is_error: boolean; } +/** + * 루프 단위 read 캐시. 같은 작업 루프 안에서 동일 파일을 반복 read하면 + * (모델이 edit 후 "고쳐졌나?" 확인하려 재read하는 패턴) 디스크를 다시 읽지 않고 + * 캐시된 내용 + "변경 없음" 힌트를 반환해 토큰·턴 낭비를 줄인다. + * edit_file/write_file 성공 시 해당 경로를 무효화해 stale read를 막는다. + */ +export interface ReadCache { + store: Map; +} + +export function createReadCache(): ReadCache { + return { store: new Map() }; +} + +/** 캐시에서 한 파일의 모든 범위 엔트리를 제거 (edit/write 후 stale 방지) */ +function invalidateCache(cache: ReadCache | undefined, filePath: string): void { + if (!cache) return; + for (const key of cache.store.keys()) { + if (key.startsWith(`${filePath}#`)) cache.store.delete(key); + } +} + +/** + * Tool execution options — verification-harness protection. + * Found in SWE hybrid runs: the implementer model misattributed test failures + * to the verification script (run_tests.sh) and edited the script itself five + * times, destroying verification integrity. Protected files reject edit/write. + * The bash timeout is also configurable — the 30s default dies silently on + * docker-based test runs (minutes), which made models conclude "the + * environment is broken". 
+ */ +export interface ToolExecOptions { + /** Filenames (matched by path suffix) for which edit_file/write_file are refused */ + protectedFiles?: string[]; + /** bash tool timeout (default 30000ms) */ + bashTimeoutMs?: number; +} + +function isProtected(resolved: string, protectedFiles?: string[]): boolean { + if (!protectedFiles?.length) return false; + return protectedFiles.some((p) => resolved === p || resolved.endsWith(`/${p}`)); +} + /** 프로젝트 경로 내로 접근을 제한하는 경로 검증 */ function validatePath(filePath: string, cwd: string): string { const resolved = path.resolve(cwd, filePath); // cwd 하위이거나, /tmp 하위만 허용 if (!resolved.startsWith(cwd) && !resolved.startsWith('/tmp')) { - throw new Error(`Path outside project: ${resolved} (cwd: ${cwd})`); + // 모델이 자가수정하도록 안내 — 그냥 거부만 하면 같은 실수를 반복한다. + throw new Error( + `Path "${filePath}" is outside the project root (${cwd}). ` + + `Use a path relative to the project root instead, e.g. "." for the whole project or "src/...". ` + + `Do not use "/" or absolute paths outside ${cwd}.`, + ); } return resolved; } @@ -155,6 +203,8 @@ function validatePath(filePath: string, cwd: string): string { export async function executeTool( toolCall: ToolCall, cwd: string, + cache?: ReadCache, + execOptions?: ToolExecOptions, ): Promise { const { name, arguments: argsJson } = toolCall.function; const callId = toolCall.id; @@ -165,28 +215,59 @@ export async function executeTool( switch (name) { case 'read_file': { const filePath = validatePath(args.path, cwd); - const content = await fs.readFile(filePath, 'utf-8'); - const lines = content.split('\n'); const offset = args.offset ?? 0; const limit = args.limit ?? 500; + const cacheKey = `${filePath}#${offset}:${limit}`; + + // 같은 루프에서 이미 같은 범위를 읽었으면 디스크 재접근 없이 캐시 반환. + // 모델에게 "변경 없음"을 알려 추가 확인 read를 유도하지 않는다. 
+ if (cache?.store.has(cacheKey)) { + return { + tool_call_id: callId, + content: `(unchanged since last read — cached)\n${cache.store.get(cacheKey)!}`, + is_error: false, + }; + } + + const content = await fs.readFile(filePath, 'utf-8'); + const lines = content.split('\n'); const slice = lines.slice(offset, offset + limit); const numbered = slice.map((line, i) => `${offset + i + 1}\t${line}`).join('\n'); const truncated = lines.length > offset + limit ? `\n... (${lines.length - offset - limit} more lines)` : ''; - return { tool_call_id: callId, content: numbered + truncated, is_error: false }; + const result = numbered + truncated; + cache?.store.set(cacheKey, result); + return { tool_call_id: callId, content: result, is_error: false }; } case 'write_file': { const filePath = validatePath(args.path, cwd); + if (isProtected(filePath, execOptions?.protectedFiles)) { + return { + tool_call_id: callId, + content: `PROTECTED: ${args.path} is part of the verification harness and must not be modified. ` + + `If tests fail, the cause is in the SOURCE code (or your fix) — debug from the test output instead.`, + is_error: true, + }; + } // 디렉토리 자동 생성 await fs.mkdir(path.dirname(filePath), { recursive: true }); await fs.writeFile(filePath, args.content, 'utf-8'); + invalidateCache(cache, filePath); return { tool_call_id: callId, content: `Written: ${filePath}`, is_error: false }; } case 'edit_file': { const filePath = validatePath(args.path, cwd); + if (isProtected(filePath, execOptions?.protectedFiles)) { + return { + tool_call_id: callId, + content: `PROTECTED: ${args.path} is part of the verification harness and must not be modified. 
` + + `If tests fail, the cause is in the SOURCE code (or your fix) — debug from the test output instead.`, + is_error: true, + }; + } const original = await fs.readFile(filePath, 'utf-8'); const occurrences = original.split(args.old_string).length - 1; if (occurrences === 0) { @@ -197,7 +278,18 @@ export async function executeTool( } const updated = original.replace(args.old_string, args.new_string); await fs.writeFile(filePath, updated, 'utf-8'); - return { tool_call_id: callId, content: `Edited: ${filePath}`, is_error: false }; + invalidateCache(cache, filePath); + // Return the edited region (with context) so the model can verify without a re-read; locate it via old_string's offset in the original, because new_string may be empty or may also occur earlier in the file. + const newLines = updated.split('\n'); + const editLine = original.slice(0, original.indexOf(args.old_string)).split('\n').length - 1; + const from = Math.max(0, editLine - 3); + const to = Math.min(newLines.length, editLine + args.new_string.split('\n').length + 3); + const snippet = newLines.slice(from, to).map((l, i) => `${from + i + 1}\t${l}`).join('\n'); + return { + tool_call_id: callId, + content: `Edited: ${filePath}\nResulting region:\n${snippet}`, + is_error: false, + }; } case 'search_files': { @@ -228,7 +320,7 @@ export async function executeTool( try { const { stdout, stderr } = await execFileAsync('bash', ['-c', command], { cwd, - timeout: 30000, + timeout: execOptions?.bashTimeoutMs ?? 30000, maxBuffer: 1024 * 512, env: process.env, }); @@ -236,12 +328,35 @@ export async function executeTool( // 출력이 너무 길면 잘라냄 return { tool_call_id: callId, - content: output.length > 8000 ? output.slice(0, 8000) + '\n... (truncated)' : output, + content: output.length > 8000 ? output.slice(0, 8000) + '\n... (truncated)' : output || '(no output, exit 0)', is_error: false, }; } catch (err) { - const msg = err instanceof Error ? err.message : String(err); - return { tool_call_id: callId, content: `Command failed: ${msg.slice(0, 2000)}`, is_error: true }; + // exit code != 0 → execFile이 throw. 하지만 grep/find 등은 "매치 없음"으로 + // exit 1을 내며 이건 정상이다. 
실제 stdout/stderr + exit code를 모델에게 줘서 + // "no match"인지 진짜 에러인지 스스로 판단하게 한다(이게 없으면 같은 명령 반복). + const e = err as { code?: number; stdout?: string; stderr?: string; message?: string; killed?: boolean; signal?: string }; + const out = (e.stdout ?? '') + (e.stderr ? `\n[stderr] ${e.stderr}` : ''); + const code = typeof e.code === 'number' ? e.code : '?'; + // Make timeout kills explicit — a silent no-output failure leads the + // model to conclude "the verification environment is broken" and start + // dismantling the harness (observed in SWE runs). + if (e.killed && e.signal) { + const limit = execOptions?.bashTimeoutMs ?? 30000; + return { + tool_call_id: callId, + content: `TIMEOUT: command exceeded ${Math.round(limit / 1000)}s and was killed (${e.signal}). ` + + `The command may simply be slow — this is NOT evidence that the environment or script is broken. ` + + `Partial output:\n${out.slice(0, 2000) || '(none)'}`, + is_error: true, + }; + } + const body = out.trim() + ? `exit ${code}:\n${out.slice(0, 4000)}` + : `exit ${code} (no output) — likely no matches or a non-fatal nonzero exit, not necessarily an error.`; + // exit 1 + 출력 없음은 보통 무해(grep no-match) → is_error를 false로 둬 모델이 안 헤매게. 
+ const benign = e.code === 1 && !out.trim(); + return { tool_call_id: callId, content: body, is_error: !benign }; } } @@ -260,6 +375,8 @@ export async function executeTool( export async function executeToolCalls( toolCalls: ToolCall[], cwd: string, + cache?: ReadCache, + execOptions?: ToolExecOptions, ): Promise { - return Promise.all(toolCalls.map(tc => executeTool(tc, cwd))); + return Promise.all(toolCalls.map(tc => executeTool(tc, cwd, cache, execOptions))); } diff --git a/src/adapters/types.ts b/src/adapters/types.ts index c52981c..f494907 100644 --- a/src/adapters/types.ts +++ b/src/adapters/types.ts @@ -8,7 +8,7 @@ import type { WorkerResult, ReviewResult } from '../agents/agentPair.js'; // Re-export for convenience export type { WorkerResult, ReviewResult }; -export type AdapterName = 'claude' | 'codex' | 'gpt' | 'local' | 'lmstudio'; +export type AdapterName = 'codex' | 'gpt' | 'local' | 'lmstudio' | 'openrouter'; /** * Raw result from a CLI process execution @@ -41,6 +41,25 @@ export interface CliRunOptions { processContext?: ProcessContext; /** 시스템 프롬프트 (GPT/Local 에이전틱 루프에서 사용) */ systemPrompt?: string; + /** + * 추론(reasoning) 토큰 비활성화 요청 (OpenRouter). 기계적 실행 역할(worker 등)은 + * 추론이 불필요하므로 토큰 낭비를 막는다. 모델이 추론 강제(thinking 전용)면 + * OpenRouter가 거부할 수 있으나, 그런 모델은 경량 역할에 쓰지 않는다. + */ + disableReasoning?: boolean; + /** + * 수정 필수 작업의 no-edit 종료 가드 횟수 (agenticLoop). 모델이 edit/write 없이 + * 끝내려 하면 N회까지 되민다. 기본 0(비활성). + */ + nudgeMaxOnNoEdit?: number; + /** + * Verification-harness file protection (agenticLoop → tools). Files in this + * list reject edit_file/write_file — prevents the model from suspecting and + * rewriting the verification script when tests fail. + */ + protectedFiles?: string[]; + /** bash tool timeout in ms (default 30s). Raise for docker-based tests that take minutes. 
*/ + bashTimeoutMs?: number; } /** diff --git a/src/agents/draftAnalyzer.ts b/src/agents/draftAnalyzer.ts index 6242ee3..993760e 100644 --- a/src/agents/draftAnalyzer.ts +++ b/src/agents/draftAnalyzer.ts @@ -50,7 +50,7 @@ export interface DraftAnalyzerOptions { taskDescription: string; projectPath: string; projectId?: string; - /** Haiku 모델명 (기본: claude-haiku-4-5-20251001) */ + /** Fast model for draft analysis (default: gpt-5-codex) */ model?: string; /** 타임아웃 (기본: 30초 — Haiku는 빠름) */ timeoutMs?: number; @@ -283,8 +283,8 @@ export async function runDraftAnalysis(options: DraftAnalyzerOptions): Promise = { @@ -295,7 +295,7 @@ export async function runDraftAnalysis(options: DraftAnalyzerOptions): Promise { onLog: options.onLog, processContext: options.processContext, systemPrompt: getPrompts().systemPrompt, + // Worker is a mechanical execution role — file edits, not deep reasoning. + // Disable reasoning tokens to cut cost/latency (no-op on non-thinking models). + disableReasoning: true, + nudgeMaxOnNoEdit: options.nudgeMaxOnNoEdit, + protectedFiles: options.protectedFiles, + bashTimeoutMs: options.bashTimeoutMs, }); // Parse result via adapter const parsedResult = adapter.parseWorkerOutput(raw); - // Extract actually changed files via Git diff (independent of LLM report) + // Git diff is the source of truth for "did real work happen" — independent of + // whether the model emitted a well-formed JSON success block. LLMs are weak at + // structured output, so we never let a missing/malformed JSON block alone mark + // a task as failed when the working tree actually changed (VEGA-style: real + // signal over self-report). 
if (isGitRepo && snapshotHash) { const gitChangedFiles = await gitTracker.getChangedFilesSinceSnapshot(cwd, snapshotHash); @@ -92,6 +108,17 @@ export async function runWorker(options: WorkerOptions): Promise { ...parsedResult.filesChanged, ]); parsedResult.filesChanged = Array.from(mergedFiles); + + // Real file changes + no explicit error signal → treat as success even if + // the model never produced a JSON block. Only an explicit error/halt in the + // output should keep success=false here. + if (!parsedResult.success && !parsedResult.error && !parsedResult.haltReason) { + console.log('[Worker] Promoting to success: git changes present, no error signal'); + parsedResult.success = true; + if (!parsedResult.summary || parsedResult.summary === t('common.fallback.noSummary')) { + parsedResult.summary = `Modified ${gitChangedFiles.length} file(s): ${gitChangedFiles.slice(0, 5).join(', ')}`; + } + } } else if (parsedResult.filesChanged.length === 0) { console.log('[Worker] No file changes detected by Git or LLM'); } @@ -103,7 +130,7 @@ export async function runWorker(options: WorkerOptions): Promise { console.error(`[Worker] Execution failed: ${errMsg}`); // Log stderr hint if available (CLI spawn errors often contain useful info) if (error instanceof Error && error.message.includes('code')) { - console.error(`[Worker] CLI exited with non-zero code — check claude CLI availability and permissions`); + console.error(`[Worker] CLI exited with non-zero code — check adapter auth and permissions`); } return { success: false, diff --git a/src/auth/index.ts b/src/auth/index.ts index 2ed4281..6516728 100644 --- a/src/auth/index.ts +++ b/src/auth/index.ts @@ -3,4 +3,17 @@ // ============================================ export { AuthProfileStore, ensureValidToken, type AuthProfile } from './oauthStore.js'; -export { runOAuthPkceFlow, loginAndSaveProfile, type OAuthFlowResult, type OAuthFlowOptions } from './oauthPkce.js'; +export { + runOAuthPkceFlow, + loginAndSaveProfile, + 
DEFAULT_OPENAI_CLIENT_ID, + type OAuthFlowResult, + type OAuthFlowOptions, +} from './oauthPkce.js'; +export { + runOpenRouterPkceFlow, + loginAndSaveOpenRouterProfile, + saveOpenRouterApiKey, + type OpenRouterFlowResult, + type OpenRouterFlowOptions, +} from './openrouterPkce.js'; diff --git a/src/auth/oauthPkce.ts b/src/auth/oauthPkce.ts index 551bf06..a5b48bd 100644 --- a/src/auth/oauthPkce.ts +++ b/src/auth/oauthPkce.ts @@ -13,10 +13,17 @@ import { AuthProfileStore, type AuthProfile } from './oauthStore.js'; const OPENAI_AUTH_ENDPOINT = 'https://auth.openai.com/oauth/authorize'; const OPENAI_TOKEN_ENDPOINT = 'https://auth.openai.com/oauth/token'; const DEFAULT_CALLBACK_PORT = 1455; -const DEFAULT_SCOPES = 'openid profile email offline_access model.request'; -const LOGIN_TIMEOUT_MS = 120_000; // 2분 +const DEFAULT_SCOPES = 'openid profile email offline_access'; +const LOGIN_TIMEOUT_MS = 120_000; // 2 minutes const PROFILE_KEY = 'openai-gpt:default'; +// Public OAuth client_id used by the official @openai/codex CLI. +// Reusing it lets `openswarm auth login --provider gpt` work out of the box for +// any ChatGPT Plus/Pro/Team user without provisioning a custom OAuth app. +// Override with `--client-id` or the OPENAI_CLIENT_ID env var if needed. +export const DEFAULT_OPENAI_CLIENT_ID = 'app_EMoamEEZ73f0CkXaXp7hrann'; +const OAUTH_ORIGINATOR = 'openswarm'; + // PKCE helpers function generateCodeVerifier(): string { @@ -59,7 +66,7 @@ export interface OAuthFlowResult { } export interface OAuthFlowOptions { - clientId: string; + clientId?: string; port?: number; scopes?: string; } @@ -68,16 +75,25 @@ export interface OAuthFlowOptions { * OAuth 2.1 PKCE 흐름 실행. * 로컬 HTTP 서버에서 callback을 받고, token을 교환하여 저장한다. 
*/ -export async function runOAuthPkceFlow(options: OAuthFlowOptions): Promise { - const { clientId, port = DEFAULT_CALLBACK_PORT, scopes = DEFAULT_SCOPES } = options; - const redirectUri = `http://127.0.0.1:${port}/auth/callback`; +export async function runOAuthPkceFlow(options: OAuthFlowOptions = {}): Promise { + const { + clientId = DEFAULT_OPENAI_CLIENT_ID, + port = DEFAULT_CALLBACK_PORT, + scopes = DEFAULT_SCOPES, + } = options; + // Must be exactly "http://localhost:1455/auth/callback" — this is the value + // registered on the public Codex OAuth client. Using 127.0.0.1 instead + // triggers Hydra's authorize_hydra_invalid_request error. + const redirectUri = `http://localhost:${port}/auth/callback`; // 1. PKCE 생성 const codeVerifier = generateCodeVerifier(); const codeChallenge = generateCodeChallenge(codeVerifier); const state = generateState(); - // 2. Authorization URL 구성 + // 2. Authorization URL 구성. + // The simplified_flow + id_token_add_organizations params mirror the official + // codex CLI so the ChatGPT side recognises this as a first-party desktop login. const authParams = new URLSearchParams({ response_type: 'code', client_id: clientId, @@ -86,6 +102,9 @@ export async function runOAuthPkceFlow(options: OAuthFlowOptions): Promise { +export async function loginAndSaveProfile( + clientId: string = DEFAULT_OPENAI_CLIENT_ID, + port?: number, +): Promise { const result = await runOAuthPkceFlow({ clientId, port }); const profile: AuthProfile = { diff --git a/src/auth/oauthStore.ts b/src/auth/oauthStore.ts index ba7f801..500c1e0 100644 --- a/src/auth/oauthStore.ts +++ b/src/auth/oauthStore.ts @@ -10,11 +10,21 @@ import { homedir } from 'node:os'; // Types export interface AuthProfile { - type: 'oauth'; + /** + * oauth: short-lived access_token + refresh_token (e.g. ChatGPT Codex) + * apiKey: long-lived bearer token, no refresh flow (e.g. 
OpenRouter sk-or-*) + */ + type: 'oauth' | 'apiKey'; provider: string; access: string; + /** Empty string when `type === 'apiKey'` (no refresh available). */ refresh: string; - expires: number; // ms timestamp + /** + * ms timestamp at which `access` expires. + * For `type === 'apiKey'` this is set to Number.MAX_SAFE_INTEGER (never expires). + */ + expires: number; + /** OAuth client_id for the issuer. Empty string for plain API keys. */ clientId: string; accountId?: string; } @@ -93,6 +103,11 @@ export async function ensureValidToken(store: AuthProfileStore, profileKey: stri throw new Error(`Auth profile "${profileKey}" not found. Run: openswarm auth login --provider gpt`); } + // API keys never expire and have no refresh flow. + if (profile.type === 'apiKey') { + return profile.access; + } + const now = Date.now(); if (now < profile.expires - REFRESH_BUFFER_MS) { return profile.access; diff --git a/src/auth/openrouterPkce.ts b/src/auth/openrouterPkce.ts new file mode 100644 index 0000000..cbe26c9 --- /dev/null +++ b/src/auth/openrouterPkce.ts @@ -0,0 +1,282 @@ +// ============================================ +// OpenSwarm - OpenRouter PKCE Flow +// Browser-based login that exchanges an authorization code +// for a user-controlled `sk-or-*` API key. 
+// ============================================ + +import { createServer, type IncomingMessage, type ServerResponse } from 'node:http'; +import { randomBytes, createHash } from 'node:crypto'; +import { exec } from 'node:child_process'; +import { AuthProfileStore, type AuthProfile } from './oauthStore.js'; + +// ----- Constants ----- + +const OPENROUTER_AUTH_ENDPOINT = 'https://openrouter.ai/auth'; +const OPENROUTER_KEYS_ENDPOINT = 'https://openrouter.ai/api/v1/auth/keys'; +const DEFAULT_CALLBACK_PORT = 1456; // distinct from the OpenAI flow (1455) +const LOGIN_TIMEOUT_MS = 120_000; +const OPENROUTER_PROFILE_KEY = 'openrouter:default'; + +export const PROFILE_KEY = OPENROUTER_PROFILE_KEY; + +// ----- PKCE helpers (same shape as oauthPkce.ts) ----- + +function generateCodeVerifier(): string { + return randomBytes(96).toString('base64url'); +} + +function generateCodeChallenge(verifier: string): string { + return createHash('sha256').update(verifier).digest('base64url'); +} + +function generateState(): string { + return randomBytes(32).toString('hex'); +} + +// ----- Browser open (cross-platform) ----- + +function openBrowser(url: string): void { + const platform = process.platform; + const cmd = + platform === 'darwin' ? 'open' : + platform === 'win32' ? 'start ""' : // empty title arg: cmd's `start` treats the first quoted token as the window title, not the target + 'xdg-open'; + + exec(`${cmd} "${url}"`, (err) => { + if (err) { + console.error(`[Auth] 브라우저를 자동으로 열 수 없습니다. 직접 열어주세요:`); + console.error(url); + } + }); +} + +// ----- Types ----- + +export interface OpenRouterFlowResult { + apiKey: string; + userId?: string; +} + +export interface OpenRouterFlowOptions { + port?: number; +} + +/** + * OpenRouter PKCE 흐름 실행. + * + * 1. PKCE verifier/challenge 생성 + * 2. https://openrouter.ai/auth?callback_url=...&code_challenge=...&code_challenge_method=S256 로 브라우저 오픈 + * 3. 로컬 콜백 서버에서 ?code=... 수신 + * 4. POST /api/v1/auth/keys 로 교환 → 영구 sk-or-* API key + * + * OpenAI 흐름과 달리 refresh token 개념이 없다 — 받은 키를 그대로 저장한다. 
 + */ +export async function runOpenRouterPkceFlow( + options: OpenRouterFlowOptions = {}, +): Promise<OpenRouterFlowResult> { + const { port = DEFAULT_CALLBACK_PORT } = options; + const callbackUrl = `http://127.0.0.1:${port}/auth/callback`; + + const codeVerifier = generateCodeVerifier(); + const codeChallenge = generateCodeChallenge(codeVerifier); + // OpenRouter does not echo back `state`, so the callback handler cannot verify it; + // the value is only sent as part of the auth URL (the URL pattern requires us to send it). + const state = generateState(); + + const authParams = new URLSearchParams({ + callback_url: callbackUrl, + code_challenge: codeChallenge, + code_challenge_method: 'S256', + state, + }); + const authUrl = `${OPENROUTER_AUTH_ENDPOINT}?${authParams.toString()}`; + + return new Promise((resolve, reject) => { + let settled = false; + + const timeout = setTimeout(() => { + if (!settled) { + settled = true; + server.close(); + reject(new Error('OpenRouter login timed out (120s). 다시 시도하세요.')); + } + }, LOGIN_TIMEOUT_MS); + + const server = createServer(async (req: IncomingMessage, res: ServerResponse) => { + if (settled) { + res.writeHead(400); + res.end(); + return; + } + + const url = new URL(req.url ?? 
'/', `http://127.0.0.1:${port}`); + + if (url.pathname !== '/auth/callback') { + res.writeHead(404); + res.end('Not found'); + return; + } + + const code = url.searchParams.get('code'); + const error = url.searchParams.get('error'); + + if (error) { + settled = true; + clearTimeout(timeout); + res.writeHead(200, { 'Content-Type': 'text/html; charset=utf-8' }); + res.end(errorHtml(error)); + server.close(); + reject(new Error(`OpenRouter OAuth error: ${error}`)); + return; + } + + if (!code) { + settled = true; + clearTimeout(timeout); + res.writeHead(400, { 'Content-Type': 'text/html; charset=utf-8' }); + res.end(errorHtml('Missing authorization code')); + server.close(); + reject(new Error('Invalid OpenRouter callback: missing code')); + return; + } + + try { + const exchangeRes = await fetch(OPENROUTER_KEYS_ENDPOINT, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + code, + code_verifier: codeVerifier, + code_challenge_method: 'S256', + }), + }); + + if (!exchangeRes.ok) { + const errText = await exchangeRes.text().catch(() => ''); + throw new Error( + `Key exchange failed (${exchangeRes.status}): ${errText.slice(0, 300)}`, + ); + } + + const payload = (await exchangeRes.json()) as { + key?: string; + user_id?: string | null; + }; + + if (!payload.key) { + throw new Error('OpenRouter key exchange response missing "key" field'); + } + + const result: OpenRouterFlowResult = { + apiKey: payload.key, + userId: payload.user_id ?? 
undefined, + }; + + settled = true; + clearTimeout(timeout); + res.writeHead(200, { 'Content-Type': 'text/html; charset=utf-8' }); + res.end(successHtml()); + server.close(); + resolve(result); + } catch (err) { + settled = true; + clearTimeout(timeout); + res.writeHead(500, { 'Content-Type': 'text/html; charset=utf-8' }); + res.end(errorHtml(String(err))); + server.close(); + reject(err); + } + }); + + server.listen(port, '127.0.0.1', () => { + console.log(`[Auth] Callback server listening on http://127.0.0.1:${port}`); + console.log(`[Auth] 브라우저에서 OpenRouter 로그인 페이지를 엽니다...`); + openBrowser(authUrl); + }); + + server.on('error', (err) => { + if (!settled) { + settled = true; + clearTimeout(timeout); + reject(new Error(`Callback server error: ${err.message}`)); + } + }); + }); +} + +/** + * OpenRouter API key를 직접 받아 저장 (PKCE fallback). + * `sk-or-` 접두사만 가볍게 검증한다. + */ +export function saveOpenRouterApiKey(apiKey: string): void { + const trimmed = apiKey.trim(); + if (!trimmed) { + throw new Error('Empty API key'); + } + if (!trimmed.startsWith('sk-or-')) { + throw new Error( + 'OpenRouter API keys start with "sk-or-". Get one from https://openrouter.ai/keys', + ); + } + + const profile: AuthProfile = { + type: 'apiKey', + provider: 'openrouter', + access: trimmed, + refresh: '', + expires: Number.MAX_SAFE_INTEGER, + clientId: '', + }; + + const store = new AuthProfileStore(); + store.setProfile(OPENROUTER_PROFILE_KEY, profile); +} + +/** + * 전체 PKCE 흐름 + 저장 + */ +export async function loginAndSaveOpenRouterProfile(port?: number): Promise { + const result = await runOpenRouterPkceFlow({ port }); + + const profile: AuthProfile = { + type: 'apiKey', + provider: 'openrouter', + access: result.apiKey, + refresh: '', + expires: Number.MAX_SAFE_INTEGER, + clientId: '', + accountId: result.userId, + }; + + const store = new AuthProfileStore(); + store.setProfile(OPENROUTER_PROFILE_KEY, profile); + + console.log(`[Auth] OpenRouter 인증 완료. 
프로필 저장됨: ${OPENROUTER_PROFILE_KEY}`); + if (result.userId) { + console.log(`[Auth] User ID: ${result.userId}`); + } +} + +// ----- HTML templates ----- + +function successHtml(): string { + return ` +OpenSwarm Auth + +

✓ 인증 완료

OpenSwarm에 OpenRouter 인증이 완료되었습니다.
이 창을 닫아도 됩니다.

`; +} + +function errorHtml(error: string): string { + return ` +OpenSwarm Auth Error + +

✗ 인증 실패

${escapeHtml(error)}

터미널에서 다시 시도하세요.

`; +} + +function escapeHtml(s: string): string { + return s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;').replace(/"/g, '&quot;'); +} diff --git a/src/cli.ts b/src/cli.ts index 35c5301..6fa3695 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -335,11 +335,12 @@ const authCmd = program authCmd .command('login') - .description('Login via OAuth (GPT)') - .option('--provider ', 'Provider to authenticate', 'gpt') - .option('--client-id ', 'OAuth Client ID (or set OPENAI_CLIENT_ID env)') + .description('Login via OAuth/PKCE (gpt, openrouter)') + .option('--provider ', 'Provider to authenticate (gpt | openrouter)', 'gpt') + .option('--client-id ', 'GPT only: override OAuth Client ID (defaults to the public Codex client)') + .option('--api-key ', 'OpenRouter only: skip browser flow and store this sk-or-* key directly') .option('--port ', 'Callback server port', parseInt) - .action(async (opts: { provider: string; clientId?: string; port?: number }) => { + .action(async (opts: { provider: string; clientId?: string; apiKey?: string; port?: number }) => { const { handleAuthLogin } = await import('./cli/authHandler.js'); await handleAuthLogin(opts.provider, opts); }); @@ -355,7 +356,7 @@ authCmd authCmd .command('logout') .description('Remove stored auth tokens') - .option('--provider ', 'Provider to remove', 'gpt') + .option('--provider ', 'Provider to remove (gpt | openrouter)', 'gpt') .action(async (opts: { provider: string }) => { const { handleAuthLogout } = await import('./cli/authHandler.js'); handleAuthLogout(opts.provider); diff --git a/src/cli/authHandler.ts b/src/cli/authHandler.ts index 3e58da6..6d214b5 100644 --- a/src/cli/authHandler.ts +++ b/src/cli/authHandler.ts @@ -3,45 +3,132 @@ // `openswarm auth login/status/logout` // ============================================ +import { createInterface } from 'node:readline'; import { AuthProfileStore } from '../auth/index.js'; -import { loginAndSaveProfile } from '../auth/oauthPkce.js'; +import { + loginAndSaveProfile, + 
DEFAULT_OPENAI_CLIENT_ID, +} from '../auth/oauthPkce.js'; +import { + loginAndSaveOpenRouterProfile, + saveOpenRouterApiKey, +} from '../auth/openrouterPkce.js'; -const DEFAULT_CLIENT_ID = process.env.OPENAI_CLIENT_ID ?? ''; +type Provider = 'gpt' | 'openrouter'; + +const PROFILE_KEYS: Record = { + gpt: 'openai-gpt:default', + openrouter: 'openrouter:default', +}; + +const VALID_PROVIDERS: Provider[] = ['gpt', 'openrouter']; + +function assertProvider(provider: string): asserts provider is Provider { + if (!VALID_PROVIDERS.includes(provider as Provider)) { + console.error( + `지원하지 않는 provider: "${provider}". 지원: ${VALID_PROVIDERS.join(', ')}`, + ); + process.exit(1); + } +} + +export interface AuthLoginOpts { + clientId?: string; + port?: number; + /** OpenRouter: PKCE 없이 직접 입력받은 API key */ + apiKey?: string; +} /** - * GPT OAuth 로그인 흐름 실행 + * 로그인 흐름 (provider별 분기) */ -export async function handleAuthLogin(provider: string, opts: { clientId?: string; port?: number }): Promise { - if (provider !== 'gpt') { - console.error(`지원하지 않는 provider: "${provider}". 현재 "gpt"만 지원합니다.`); +export async function handleAuthLogin( + provider: string, + opts: AuthLoginOpts, +): Promise { + assertProvider(provider); + + try { + if (provider === 'gpt') { + const clientId = + opts.clientId ?? process.env.OPENAI_CLIENT_ID ?? DEFAULT_OPENAI_CLIENT_ID; + await loginAndSaveProfile(clientId, opts.port); + printGptPostLoginHint(); + } else { + await loginOpenRouter(opts); + printOpenRouterPostLoginHint(); + } + } catch (err) { + console.error(`로그인 실패: ${err instanceof Error ? err.message : String(err)}`); process.exit(1); } +} - const clientId = opts.clientId ?? 
DEFAULT_CLIENT_ID; - if (!clientId) { - console.error('OpenAI Client ID가 필요합니다.'); - console.error('환경변수 OPENAI_CLIENT_ID를 설정하거나 --client-id 옵션을 사용하세요.'); - console.error(''); - console.error(' export OPENAI_CLIENT_ID="your-client-id"'); - console.error(' openswarm auth login --provider gpt'); - console.error(''); - console.error('또는:'); - console.error(' openswarm auth login --provider gpt --client-id "your-client-id"'); - process.exit(1); +async function loginOpenRouter(opts: AuthLoginOpts): Promise { + // 1) Explicit --api-key wins. + if (opts.apiKey) { + saveOpenRouterApiKey(opts.apiKey); + console.log(`[Auth] OpenRouter API key 저장 완료: ${PROFILE_KEYS.openrouter}`); + return; + } + + // 2) OPENROUTER_API_KEY env (headless / CI). + const envKey = process.env.OPENROUTER_API_KEY?.trim(); + if (envKey) { + saveOpenRouterApiKey(envKey); + console.log( + `[Auth] OPENROUTER_API_KEY 환경 변수에서 키를 저장했습니다: ${PROFILE_KEYS.openrouter}`, + ); + return; } + // 3) PKCE browser flow (primary path). try { - await loginAndSaveProfile(clientId, opts.port); - console.log(''); - console.log('GPT 어댑터를 사용하려면 config.yaml에서 adapter를 변경하세요:'); - console.log(' adapter: gpt'); - console.log(''); - console.log('또는 CLI에서 직접 실행:'); - console.log(' openswarm run "your task" --model gpt-4o'); + await loginAndSaveOpenRouterProfile(opts.port); + return; } catch (err) { - console.error(`OAuth 로그인 실패: ${err instanceof Error ? err.message : String(err)}`); - process.exit(1); + const message = err instanceof Error ? err.message : String(err); + console.error(`[Auth] PKCE 흐름 실패: ${message}`); + console.error('[Auth] API 키 직접 입력으로 전환합니다.'); } + + // 4) Interactive API-key fallback when PKCE could not complete. 
+ const manualKey = await promptForApiKey(); + saveOpenRouterApiKey(manualKey); + console.log(`[Auth] OpenRouter API key 저장 완료: ${PROFILE_KEYS.openrouter}`); +} + +function promptForApiKey(): Promise { + return new Promise((resolve, reject) => { + const rl = createInterface({ input: process.stdin, output: process.stdout }); + rl.question('OpenRouter API key (sk-or-...): ', (answer) => { + rl.close(); + const trimmed = answer.trim(); + if (!trimmed) { + reject(new Error('빈 키가 입력되었습니다.')); + return; + } + resolve(trimmed); + }); + }); +} + +function printGptPostLoginHint(): void { + console.log(''); + console.log('GPT 어댑터를 사용하려면 config.yaml에서 adapter를 변경하세요:'); + console.log(' adapter: gpt'); + console.log(''); + console.log('또는 CLI에서 직접 실행:'); + console.log(' openswarm run "your task" --model gpt-4o'); +} + +function printOpenRouterPostLoginHint(): void { + console.log(''); + console.log('OpenRouter 어댑터를 사용하려면 config.yaml에서 adapter를 변경하세요:'); + console.log(' adapter: openrouter'); + console.log(''); + console.log('또는 CLI에서 직접 실행 (모델은 provider/model 형식):'); + console.log(' openswarm run "your task" --model anthropic/claude-sonnet-4'); } /** @@ -54,7 +141,9 @@ export function handleAuthStatus(): void { if (keys.length === 0) { console.log('저장된 인증 프로필이 없습니다.'); - console.log('로그인: openswarm auth login --provider gpt'); + console.log('로그인:'); + console.log(' openswarm auth login --provider gpt'); + console.log(' openswarm auth login --provider openrouter'); return; } @@ -63,14 +152,17 @@ export function handleAuthStatus(): void { for (const key of keys) { const p = profiles[key]; - const expired = Date.now() > p.expires; - const expiresAt = new Date(p.expires).toLocaleString('ko-KR', { timeZone: 'Asia/Seoul' }); - const status = expired ? '만료됨' : '유효'; + const isApiKey = p.type === 'apiKey'; + const expired = !isApiKey && Date.now() > p.expires; + const expiresAt = isApiKey + ? 
'∞ (API key)' + : `${new Date(p.expires).toLocaleString('ko-KR', { timeZone: 'Asia/Seoul' })} (${expired ? '만료됨' : '유효'})`; console.log(` ${key}`); console.log(` Provider: ${p.provider}`); + console.log(` Type: ${p.type}`); console.log(` Token: ${maskToken(p.access)}`); - console.log(` Expires: ${expiresAt} (${status})`); + console.log(` Expires: ${expiresAt}`); if (p.accountId) { console.log(` Account: ${p.accountId}`); } @@ -82,12 +174,9 @@ export function handleAuthStatus(): void { * 인증 프로필 삭제 */ export function handleAuthLogout(provider: string): void { - if (provider !== 'gpt') { - console.error(`지원하지 않는 provider: "${provider}". 현재 "gpt"만 지원합니다.`); - process.exit(1); - } + assertProvider(provider); - const profileKey = 'openai-gpt:default'; + const profileKey = PROFILE_KEYS[provider]; const store = new AuthProfileStore(); if (store.deleteProfile(profileKey)) { diff --git a/src/core/config.ts b/src/core/config.ts index 6edd86f..2fb03af 100644 --- a/src/core/config.ts +++ b/src/core/config.ts @@ -40,7 +40,7 @@ function getConfigSearchPaths(): string[] { const DEFAULT_HEARTBEAT_INTERVAL = 30 * 60 * 1000; // 30 minutes const DEFAULT_GITHUB_CHECK_INTERVAL = 5 * 60 * 1000; // 5 minutes -const AdapterNameSchema = z.enum(['claude', 'codex', 'gpt', 'local', 'lmstudio']); +const AdapterNameSchema = z.enum(['codex', 'gpt', 'local', 'lmstudio', 'openrouter']); // Zod Schemas @@ -98,10 +98,10 @@ const PairModeConfigSchema = z.object({ }).optional(); const ModelConfigSchema = z.object({ - /** Worker agent model */ - worker: z.string().default('claude-sonnet-4-5-20250929'), - /** Reviewer agent model */ - reviewer: z.string().default('claude-sonnet-4-5-20250929'), + /** Worker agent model — lightweight tier (see DefaultRolesConfigSchema). */ + worker: z.string().default('z-ai/glm-4.7-flash'), + /** Reviewer agent model — frontier quality gate. 
*/ + reviewer: z.string().default('openai/gpt-5'), }).optional(); /** Per-role configuration schema */ @@ -122,16 +122,27 @@ const RoleConfigSchema = z.object({ /** Default roles configuration schema */ const DefaultRolesConfigSchema = z.object({ + // Worker = lightweight tier. Benchmark (benchmarks/modelSelect.ts, L0–L3 coding + // tasks) ranked z-ai/glm-4.7-flash #1: 100% pass, $0.0021/pass (cheapest), and + // 2759 tok/s under ZDR via DeepInfra — ~5× faster than the next candidate. It is + // a non-thinking model, so it wastes no reasoning tokens on mechanical edits. + // On repeated failure it escalates to the frontier (gpt-5). worker: RoleConfigSchema.default({ enabled: true, - model: 'claude-haiku-4-5-20251001', + model: 'z-ai/glm-4.7-flash', timeoutMs: 0, - escalateModel: 'claude-sonnet-4-5-20250929', - escalateAfterIteration: 3, + escalateModel: 'openai/gpt-5', + // Escalate on the 2nd attempt, not the 3rd. With maxIterations=3, a threshold + // of 3 only kicks in on the final pass — too late to help. Retrying the exact + // same model after a failure rarely changes the outcome; switch models sooner. + escalateAfterIteration: 2, }), + // Reviewer = frontier tier, never cheaped out. A weak reviewer that wrongly + // approves (bug slips through) or wrongly rejects (worker loops) costs MORE than + // the model price difference. The quality gate stays on gpt-5. reviewer: RoleConfigSchema.default({ enabled: true, - model: 'claude-haiku-4-5-20251001', + model: 'openai/gpt-5', timeoutMs: 0, }), tester: RoleConfigSchema.optional(), @@ -174,8 +185,11 @@ const DecompositionConfigSchema = z.object({ dailyLimit: z.number().min(1).max(100).default(20).optional(), /** Auto-move to backlog if too complex or failing (default: true) */ autoBacklog: z.boolean().default(true).optional(), - /** Planner model */ - plannerModel: z.string().default('claude-sonnet-4-5-20250929'), + /** + * Planner model — frontier tier. 
Decomposition is high-leverage: a bad split + * pollutes every downstream worker, so we never cheap out here. + */ + plannerModel: z.string().default('openai/gpt-5'), /** Planner timeout (ms) - default 600000 (10min) */ plannerTimeoutMs: z.number().min(60000).default(600000), }).optional(); @@ -271,7 +285,7 @@ const CIWorkerConfigSchema = z.object({ }).optional(); const RawConfigSchema = z.object({ - adapter: AdapterNameSchema.default('claude'), + adapter: AdapterNameSchema.default('codex'), language: z.enum(['en', 'ko']).default('en'), discord: DiscordConfigSchema, linear: LinearConfigSchema, @@ -591,11 +605,13 @@ export function generateSampleConfig(): string { # Environment variables use \${VAR_NAME} or \${VAR_NAME:-default} format # Default CLI adapter for worker/reviewer stages -# Options: claude, codex, gpt, local, lmstudio -# For GPT: run \`openswarm auth login --provider gpt\` first -# For LM Studio: start Local Server and set LMSTUDIO_BASE_URL/LMSTUDIO_MODEL if needed -# If LMSTUDIO_MODEL is unset, the lmstudio adapter auto-selects the first loaded model. 
-adapter: claude +# Options: codex, openrouter, lmstudio, local, gpt +# - codex: OpenAI Codex via PKCE login (openswarm auth login --provider gpt) +# - openrouter: OpenRouter API key (OPENROUTER_API_KEY env var or openswarm auth login --provider openrouter) +# - lmstudio: LM Studio local server (set LMSTUDIO_BASE_URL / LMSTUDIO_MODEL) +# - local: Ollama local models (ollama pull <model>) +# - gpt: OpenAI Chat API via OAuth (openswarm auth login --provider gpt) +adapter: codex discord: token: \${DISCORD_TOKEN} From 889f6585106c2df19516d1db27ec1acb922e4bf4 Mon Sep 17 00:00:00 2001 From: unohee Date: Thu, 11 Jun 2026 00:24:03 +0900 Subject: [PATCH 2/7] =?UTF-8?q?feat(memory):=20repo-scoped=20knowledge=20l?= =?UTF-8?q?oop=20=E2=80=94=20workers=20learn=20the=20repo=20across=20tasks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The write infrastructure (memoryCore LanceDB with a repo field) existed but nothing ever read it back: task outcomes were stored as one generic line and never reached a worker prompt. Close the loop: - src/memory/repoKnowledge.ts: recordTaskOutcome() stores successes as system_pattern (files changed + approach + iterations) and review rejections as constraint (pitfalls), scoped to the project path. recallRepoKnowledge() retrieves the top task-relevant memories. skipDistillation keeps the intended types (distillation was downgrading them to belief, which type-filtered recall then missed). - pairPipeline.collectWorkerContext() recalls repo knowledge into WorkerContext.repoMemories; worker prompts render it as a "Repository Knowledge" section with pattern/pitfall tags (en + ko). - autonomousRunner completed/rejected handlers record outcomes instead of the old one-line strategy memo. All memory paths are non-blocking — the pipeline runs even if the memory DB is unavailable. 
--- src/agents/pairPipeline.ts | 122 ++++++++++++++++++++++++++++- src/automation/autonomousRunner.ts | 120 +++++++++++++--------------- src/automation/runnerExecution.ts | 2 +- src/automation/runnerTypes.ts | 2 +- src/locale/prompts/en.ts | 47 ++++++----- src/locale/prompts/ko.ts | 46 ++++++----- src/locale/prompts/prompts.test.ts | 40 +++++++++- src/locale/types.ts | 6 ++ src/memory/repoKnowledge.ts | 119 ++++++++++++++++++++++++++++ 9 files changed, 391 insertions(+), 113 deletions(-) create mode 100644 src/memory/repoKnowledge.ts diff --git a/src/agents/pairPipeline.ts b/src/agents/pairPipeline.ts index 4cccc0e..406048f 100644 --- a/src/agents/pairPipeline.ts +++ b/src/agents/pairPipeline.ts @@ -19,6 +19,7 @@ import * as agentPair from './agentPair.js'; import { runGuards, type GuardsRunResult } from './pipelineGuards.js'; import { hasRepoSnapshot, scanAndCache, analyzeIssue } from '../knowledge/index.js'; import { getRegistryStore } from '../registry/sqliteStore.js'; +import { recallRepoKnowledge } from '../memory/repoKnowledge.js'; import type { WorkerContext } from '../locale/types.js'; import * as workerAgent from './worker.js'; import * as reviewerAgent from './reviewer.js'; @@ -412,7 +413,19 @@ export class PairPipeline extends EventEmitter { } } - if (!wc.impactAnalysis && !wc.registryBriefs && !wc.draftAnalysis) return undefined; + // Recall repo knowledge accumulated from past tasks — the core loop that + // makes the worker understand this repo better over time (non-blocking on failure) + const memories = await recallRepoKnowledge( + context.projectPath, + context.task.title, + context.task.description || '', + ); + if (memories.length > 0) { + wc.repoMemories = memories; + console.log(`[Pipeline] Recalled ${memories.length} repo memories for context`); + } + + if (!wc.impactAnalysis && !wc.registryBriefs && !wc.draftAnalysis && !wc.repoMemories) return undefined; return wc; } catch (err) { console.warn('[Pipeline] Worker context collection failed 
(non-blocking):', err); @@ -672,6 +685,8 @@ export class PairPipeline extends EventEmitter { inputTokens: costInfo?.inputTokens, outputTokens: costInfo?.outputTokens, costUsd: costInfo?.costUsd, + durationMs: stageResult.duration, + ...summarizeStageResult(stage, result), } }); return stageResult; @@ -691,7 +706,11 @@ export class PairPipeline extends EventEmitter { console.log(`[${prefix}] ${stage} failed (${(stageResult.duration / 1000).toFixed(1)}s)`); this.emit('stage:fail', { stage, result: stageResult, context, error }); - broadcastEvent({ type: 'pipeline:stage', data: { taskId: context.task.id, stage, status: 'fail' } }); + broadcastEvent({ type: 'pipeline:stage', data: { + taskId: context.task.id, stage, status: 'fail', + durationMs: stageResult.duration, + error: error instanceof Error ? error.message : String(error), + } }); return stageResult; } } @@ -781,7 +800,7 @@ export class PairPipeline extends EventEmitter { // ========== WORKER (with escalation) ========== const workerCfg = this.config.roles?.worker; - const escalateThreshold = workerCfg?.escalateAfterIteration ?? 3; + const escalateThreshold = workerCfg?.escalateAfterIteration ?? 2; const escalateModel = workerCfg?.escalateModel; const shouldEscalate = context.currentIteration >= escalateThreshold && !!escalateModel; const baseWorkerModel = this.getModelForRole('worker', context.task); @@ -1144,5 +1163,102 @@ export function createPipelineFromConfig( // Helpers +/** + * Extract a worker-readable summary of what the agent did during a stage so + * the dashboard can display "wrote 4 files / approved / reviewed N issues" + * instead of just "stage=worker status=complete". + * + * Returns a plain object suitable for inclusion in the SSE `pipeline:stage` + * broadcast payload. Fields are optional — missing ones are simply omitted. 
+ */ +function summarizeStageResult( + stage: PipelineStage, + result: WorkerResult | ReviewResult | TesterResult | DocumenterResult | AuditorResult | SkillDocumenterResult, +): Record { + // Cap arrays/strings before broadcasting so a chatty agent cannot blow up + // the SSE channel with a 10MB stage event. + const MAX_FILES = 12; + const MAX_COMMANDS = 8; + const SUMMARY_CAP = 240; + const FEEDBACK_CAP = 480; + const cap = (s: string | undefined, n: number): string | undefined => + s == null ? undefined : (s.length > n ? `${s.slice(0, n - 1)}…` : s); + + switch (stage) { + case 'worker': { + const r = result as WorkerResult; + return { + summary: cap(r.summary, SUMMARY_CAP), + filesChanged: Array.isArray(r.filesChanged) ? r.filesChanged.slice(0, MAX_FILES) : undefined, + filesChangedCount: r.filesChanged?.length ?? 0, + commands: Array.isArray(r.commands) ? r.commands.slice(0, MAX_COMMANDS) : undefined, + commandsCount: r.commands?.length ?? 0, + confidencePercent: r.confidencePercent, + haltReason: r.haltReason, + error: r.error ? cap(r.error, FEEDBACK_CAP) : undefined, + }; + } + + case 'reviewer': { + const r = result as ReviewResult; + return { + decision: r.decision, + feedback: cap(r.feedback, FEEDBACK_CAP), + issuesCount: r.issues?.length ?? 0, + issues: Array.isArray(r.issues) ? r.issues.slice(0, MAX_COMMANDS) : undefined, + suggestionsCount: r.suggestions?.length ?? 0, + }; + } + + case 'tester': { + const r = result as TesterResult; + return { + passed: r.testsPassed, + failed: r.testsFailed, + coverage: r.coverage, + failedTests: Array.isArray(r.failedTests) ? r.failedTests.slice(0, MAX_FILES) : undefined, + error: r.error ? cap(r.error, FEEDBACK_CAP) : undefined, + }; + } + + case 'documenter': { + const r = result as DocumenterResult; + return { + summary: cap(r.summary, SUMMARY_CAP), + filesChanged: Array.isArray(r.updatedFiles) ? r.updatedFiles.slice(0, MAX_FILES) : undefined, + filesChangedCount: r.updatedFiles?.length ?? 
0, + changelogEntry: cap(r.changelogEntry, SUMMARY_CAP), + error: r.error ? cap(r.error, FEEDBACK_CAP) : undefined, + }; + } + + case 'auditor': { + const r = result as AuditorResult; + return { + summary: cap(r.summary, SUMMARY_CAP), + bsScore: r.bsScore, + criticalCount: r.criticalCount, + warningCount: r.warningCount, + issues: Array.isArray(r.issues) ? r.issues.slice(0, MAX_COMMANDS) : undefined, + issuesCount: r.issues?.length ?? 0, + error: r.error ? cap(r.error, FEEDBACK_CAP) : undefined, + }; + } + + case 'skill-documenter': { + const r = result as SkillDocumenterResult; + return { + summary: cap(r.summary, SUMMARY_CAP), + filesChanged: Array.isArray(r.updatedFiles) ? r.updatedFiles.slice(0, MAX_FILES) : undefined, + filesChangedCount: r.updatedFiles?.length ?? 0, + error: r.error ? cap(r.error, FEEDBACK_CAP) : undefined, + }; + } + + default: + return {}; + } +} + // Re-export formatting functions (extracted to pipelineFormat.ts) export { formatPipelineResult, formatPipelineResultEmbed } from './pipelineFormat.js'; diff --git a/src/automation/autonomousRunner.ts b/src/automation/autonomousRunner.ts index 722419f..9d0b403 100644 --- a/src/automation/autonomousRunner.ts +++ b/src/automation/autonomousRunner.ts @@ -14,7 +14,6 @@ import { setRetryTime, clearRetryTime, formatRetryTime, - getDailyCompletedCount, getDailyPaceInfo, recordProjectCompletion, canProjectAcceptTask, @@ -30,7 +29,7 @@ import { } from '../orchestration/decisionEngine.js'; // ExecutorResult used via execution.reportExecutionResult import { checkWorkAllowed } from '../support/timeWindow.js'; -import { saveCognitiveMemory } from '../memory/index.js'; +import { recordTaskOutcome } from '../memory/repoKnowledge.js'; import * as linear from '../linear/index.js'; import { updateProjectAfterTask } from '../linear/projectUpdater.js'; import { TaskScheduler, initScheduler } from '../orchestration/taskScheduler.js'; @@ -224,13 +223,15 @@ export class AutonomousRunner { console.error(`[Scheduler] 
Failed to update issue state:`, err); } - try { - await saveCognitiveMemory('strategy', - `Pipeline execution succeeded: "${task.title}"`, - { confidence: 0.9, derivedFrom: task.issueId } - ); - } catch (memErr) { - console.warn(`[Scheduler] Memory save failed (non-critical):`, memErr); + // Accumulate repo-scoped knowledge — recalled and injected into the worker prompt of the next similar task + const projectPath = result.taskContext?.projectPath; + if (projectPath) { + await recordTaskOutcome(projectPath, { + taskTitle: task.title, + derivedFrom: task.issueIdentifier ?? task.issueId, + workerResult: result.workerResult, + iterations: result.iterations, + }); } } @@ -261,6 +262,15 @@ export class AutonomousRunner { const feedback = result.reviewResult?.feedback || 'No feedback provided'; const rejectionCount = incrementRejection(task.issueId, feedback); + // Store the rejection reason as a repo pitfall (constraint) — blocks repeating the same mistake + if (result.taskContext?.projectPath) { + await recordTaskOutcome(result.taskContext.projectPath, { + taskTitle: task.title, + derivedFrom: task.issueIdentifier ?? task.issueId, + rejectionFeedback: feedback, + }); + } + console.log(`[Scheduler] Task rejected (${rejectionCount}/3): ${taskCtx} ${task.title}`); console.log(`[Scheduler] Rejection reason: ${feedback}`); @@ -403,41 +413,27 @@ export class AutonomousRunner { return filtered; } - /** Schedule next heartbeat with pace-aware cooldown */ + /** + * Trigger the next heartbeat as soon as possible. + * + * Historically this used a "pace-aware cooldown" that grew quadratically + * with daily completion count (ratio² × 3 multiplier) on top of a 30-minute + * baseline — the swarm would slow itself down dramatically after a few + * tasks. 
That was removed by user request: the cron schedule + * (`config.heartbeatSchedule`) is now the only knob, and between cron + * ticks we re-fire immediately when a task wraps up so the next backlog + * item starts without artificial dead time. + */ private _nextHeartbeatTimer: ReturnType | null = null; private scheduleNextHeartbeat(): void { - if (this._nextHeartbeatTimer) return; // already scheduled - - const isTurbo = this.getTurboMode(); - - // Turbo: 5min flat, no progressive slowdown - if (isTurbo) { - const turboCooldown = 5 * 60_000; // 5min - console.log(`[AutonomousRunner] TURBO: next heartbeat in 5min`); - this._nextHeartbeatTimer = setTimeout(() => { - this._nextHeartbeatTimer = null; - void this.heartbeat(); - }, turboCooldown); - return; - } - - // Normal: progressive slowdown based on 5h window usage - const perProjectCap = this.config.dailyTaskCap ?? 6; - const globalCap = Math.max(this.enabledProjects.size, 3) * perProjectCap; - const baseCooldown = this.config.interTaskCooldownMs ?? 1_800_000; // 30min default - const totalInWindow = getDailyCompletedCount(); - - // Progressive slowdown: ratio² × 3 multiplier - const ratio = totalInWindow / globalCap; - const multiplier = 1 + (ratio * ratio * 3); - const adjustedCooldown = Math.round(baseCooldown * multiplier); - - const cooldownMin = Math.round(adjustedCooldown / 60_000); - console.log(`[AutonomousRunner] Scheduling next heartbeat in ${cooldownMin}min (5h window: ${totalInWindow}/${globalCap}, multiplier: ${multiplier.toFixed(2)}x)`); + if (this._nextHeartbeatTimer) return; // already queued + // Fire on the next event-loop tick so the current scheduler callback + // returns first (avoids re-entrant heartbeat() while still in `completed` + // handlers). 
this._nextHeartbeatTimer = setTimeout(() => { this._nextHeartbeatTimer = null; void this.heartbeat(); - }, adjustedCooldown); + }, 0); } private async runAvailableTasks(): Promise { @@ -461,7 +457,7 @@ export class AutonomousRunner { return { worker: { enabled: true, - model: this.config.workerModel || 'claude-sonnet-4-5-20250929', + model: this.config.workerModel || 'claude-sonnet-4-6', timeoutMs: this.config.workerTimeoutMs ?? 0, }, reviewer: { @@ -474,7 +470,7 @@ export class AutonomousRunner { // Apply per-project overrides const base = this.config.defaultRoles || { - worker: { enabled: true, model: 'claude-sonnet-4-5-20250929', timeoutMs: 0 }, + worker: { enabled: true, model: 'claude-sonnet-4-6', timeoutMs: 0 }, reviewer: { enabled: true, model: 'claude-haiku-4-5-20251001', timeoutMs: 0 }, }; @@ -620,20 +616,12 @@ export class AutonomousRunner { console.log(`[AutonomousRunner] Quota warning: ${quotaCheck.utilization.toFixed(0)}% utilization`); } - // 1.6 Pace gate — per-project 5h rolling window - const isTurbo = this.getTurboMode(); - const perProjectCap = isTurbo ? 20 : (this.config.dailyTaskCap ?? 6); - const totalInWindow = getDailyCompletedCount(); - // 전역 상한: 프로젝트 수 × per-project cap (안전장치) - const globalCap = Math.max(this.enabledProjects.size, 3) * perProjectCap; - if (totalInWindow >= globalCap) { - console.log(`[AutonomousRunner] Global pace limit: ${totalInWindow}/${globalCap} tasks in 5h window — skipping`); - this.syslog(`⏸ Global pace: ${totalInWindow}/${globalCap} (5h window)`); - broadcastEvent({ type: 'log', data: { taskId: 'system', stage: 'pace', line: `⏸ Global pace: ${totalInWindow}/${globalCap}` } }); - return; - } - const modeLabel = isTurbo ? 'TURBO' : 'Normal'; - this.syslog(`✓ Pace: ${totalInWindow}/${globalCap} global, ${perProjectCap}/project [${modeLabel}]`); + // 1.6 Pace gate (removed) + // The 5h rolling window cap (globalCap = projects × dailyTaskCap) and + // turbo-mode multiplier used to gate heartbeat here. 
Both were removed + // by user request: speed is now governed only by the cron schedule and + // the Linear API rate limiter, not by an internal completion cap. + this.syslog(`✓ Pace: unrestricted (cron only)`); // 2. Fetch tasks from Linear this.syslog('⟳ Fetching tasks from Linear...'); @@ -946,13 +934,13 @@ export class AutonomousRunner { await execution.reconcileCompletionState(task); console.log(`[AutonomousRunner] Issue ${task.issueId} marked as Done`); - try { - await saveCognitiveMemory('strategy', - `Pair execution succeeded: "${task.title}"`, - { confidence: 0.9, derivedFrom: task.issueId } - ); - } catch (memErr) { - console.warn(`[AutonomousRunner] Memory save failed (non-critical):`, memErr); + if (result.taskContext?.projectPath) { + await recordTaskOutcome(result.taskContext.projectPath, { + taskTitle: task.title, + derivedFrom: task.issueIdentifier ?? task.issueId, + workerResult: result.workerResult, + iterations: result.iterations, + }); } } else if (result.finalStatus === 'rejected') { // Change to Blocked on review rejection @@ -1090,19 +1078,19 @@ export class AutonomousRunner { } getAdapterSummary() { - const defaultAdapter = this.config.defaultAdapter ?? 'claude'; + const defaultAdapter = this.config.defaultAdapter ?? 'codex'; const defaultRoles = this.config.defaultRoles; return { defaultAdapter, worker: { adapter: defaultRoles?.worker?.adapter ?? defaultAdapter, - model: defaultRoles?.worker?.model ?? this.config.workerModel ?? 'claude-sonnet-4-5-20250929', + model: defaultRoles?.worker?.model ?? this.config.workerModel ?? 'gpt-5-codex', enabled: defaultRoles?.worker?.enabled !== false, }, reviewer: { adapter: defaultRoles?.reviewer?.adapter ?? defaultAdapter, - model: defaultRoles?.reviewer?.model ?? this.config.reviewerModel ?? 'claude-haiku-4-5-20251001', + model: defaultRoles?.reviewer?.model ?? this.config.reviewerModel ?? 'gpt-5-codex', enabled: defaultRoles?.reviewer?.enabled !== false, }, tester: defaultRoles?.tester ? 
{ @@ -1133,7 +1121,7 @@ export class AutonomousRunner { } if (isClaudeModel) return current; - if (role === 'reviewer') return 'claude-sonnet-4-20250514'; + if (role === 'reviewer') return 'claude-sonnet-4-6'; return 'claude-haiku-4-5-20251001'; }; diff --git a/src/automation/runnerExecution.ts b/src/automation/runnerExecution.ts index 5a75616..7537dbe 100644 --- a/src/automation/runnerExecution.ts +++ b/src/automation/runnerExecution.ts @@ -329,7 +329,7 @@ export async function decomposeTask( projectPath, projectName: task.linearProject?.name, targetMinutes, - model: ctx.plannerModel ?? 'claude-sonnet-4-5-20250929', + model: ctx.plannerModel ?? 'claude-opus-4-7', timeoutMs: ctx.plannerTimeoutMs ?? 600000, onLog: (line: string) => broadcastEvent({ type: 'log', data: { taskId, stage: 'decompose', line } }), impactAnalysis: impactAnalysis ?? undefined, diff --git a/src/automation/runnerTypes.ts b/src/automation/runnerTypes.ts index a38af0f..9826279 100644 --- a/src/automation/runnerTypes.ts +++ b/src/automation/runnerTypes.ts @@ -7,7 +7,7 @@ import type { ExecutorResult } from '../orchestration/workflow.js'; import type { DefaultRolesConfig, ProjectAgentConfig, JobProfile } from '../core/types.js'; export interface AutonomousConfig { - defaultAdapter?: 'claude' | 'codex' | 'gpt' | 'local' | 'lmstudio'; + defaultAdapter?: 'codex' | 'gpt' | 'local' | 'lmstudio' | 'openrouter'; linearTeamId: string; allowedProjects: string[]; heartbeatSchedule: string; diff --git a/src/locale/prompts/en.ts b/src/locale/prompts/en.ts index 901b324..78b357f 100644 --- a/src/locale/prompts/en.ts +++ b/src/locale/prompts/en.ts @@ -27,11 +27,21 @@ Apply the above feedback and make corrections. 
` : ''; - // 코드 컨텍스트 섹션 (draftAnalysis + impactAnalysis + registryBriefs) + // Code context section (draftAnalysis + impactAnalysis + registryBriefs + repoMemories) let contextSection = ''; - if (context?.draftAnalysis || context?.impactAnalysis || context?.registryBriefs?.length) { + if (context?.draftAnalysis || context?.impactAnalysis || context?.registryBriefs?.length || context?.repoMemories?.length) { const parts: string[] = ['## Code Context (auto-generated)']; + if (context.repoMemories && context.repoMemories.length > 0) { + parts.push(''); + parts.push('### Repository Knowledge (learned from past tasks in this repo)'); + for (const m of context.repoMemories) { + const tag = m.type === 'constraint' ? '⚠️ PITFALL' : '✓ pattern'; + parts.push(`- [${tag}] **${m.title}**: ${m.content}`); + } + parts.push('Use this knowledge to skip re-discovery and avoid repeating past mistakes.'); + } + if (context.draftAnalysis) { const da = context.draftAnalysis; parts.push(''); @@ -100,25 +110,24 @@ ${feedbackSection}${contextSection} - Before completing: verify all changed files exist, no syntax errors, confidence reflects reality. ## Tools available -- \`cxt\` (code exploration toolkit, bundled with OpenSwarm): - - \`cxt check \` — entity brief for a file (faster than Read for structural lookups). - - \`cxt check --search \` — FTS5 search across the registry. - - \`cxt check --untested\` / \`--high-risk\` — surface risky spots before changing them. - - \`cxt bs\` — static bad-smell scan. - - Run \`cxt scan\` first if the registry seems stale; it's cheap. - - The \`File Map\` section above (when present) already comes from \`cxt\` — don't re-scan unless you need fresh data. - -## Output (JSON, at the end) +Use search_files (ripgrep) + read_file as your primary navigation. They're always available and cheapest. 
+ +Optional: \`cxt\` (code registry, only if this repo already has one — do NOT run \`cxt scan\` to create one): + - \`cxt check \` / \`cxt check --search \` — entity briefs / FTS5 search, faster than Read for structure. + - If a \`File Map\` section appears above, it already came from \`cxt\` — don't re-scan. + - If \`cxt\` errors with "no registry" or similar, just use search_files/read_file instead — don't retry cxt. + +## Done? Just do the work. +Use the tools to actually edit files and run commands. File changes are detected +from git directly — you do NOT need to prove success with a JSON block. When the +task is complete, stop calling tools and write a short plain-text summary of what +you did and any caveats. + +If (and only if) you want to flag low confidence or a blocker, end with this JSON: \`\`\`json -{ - "success": true, - "summary": "What YOU did (1-2 sentences, not reviewer feedback)", - "filesChanged": ["full paths of files edited/written"], - "commands": ["bash commands executed"], - "confidencePercent": 85 -} +{ "success": false, "confidencePercent": 40, "haltReason": "why you're stuck" } \`\`\` -Set confidencePercent below 60 if uncertain. filesChanged must include all edited files (full paths). +Otherwise no JSON is needed — finishing without an error IS the success signal. 
`; }, diff --git a/src/locale/prompts/ko.ts b/src/locale/prompts/ko.ts index 8e1118d..7074d71 100644 --- a/src/locale/prompts/ko.ts +++ b/src/locale/prompts/ko.ts @@ -28,11 +28,21 @@ ${previousFeedback} ` : ''; - // 코드 컨텍스트 섹션 (draftAnalysis + impactAnalysis + registryBriefs) + // Code context section (draftAnalysis + impactAnalysis + registryBriefs + repoMemories) let contextSection = ''; - if (context?.draftAnalysis || context?.impactAnalysis || context?.registryBriefs?.length) { + if (context?.draftAnalysis || context?.impactAnalysis || context?.registryBriefs?.length || context?.repoMemories?.length) { const parts: string[] = ['## 코드 컨텍스트 (자동 생성)']; + if (context.repoMemories && context.repoMemories.length > 0) { + parts.push(''); + parts.push('### 저장소 지식 (이 repo의 과거 작업에서 학습)'); + for (const m of context.repoMemories) { + const tag = m.type === 'constraint' ? '⚠️ 함정' : '✓ 패턴'; + parts.push(`- [${tag}] **${m.title}**: ${m.content}`); + } + parts.push('이 지식을 활용해 재탐색을 건너뛰고 과거 실수를 반복하지 마라.'); + } + if (context.draftAnalysis) { const da = context.draftAnalysis; parts.push(''); @@ -101,25 +111,23 @@ ${feedbackSection}${contextSection} - 완료 전: 모든 변경 파일 존재 확인, 구문 오류 없음 확인, confidence 정확히 설정. ## 사용 가능한 도구 -- \`cxt\` (OpenSwarm 내장 Code eXploration Toolkit): - - \`cxt check \` — 파일 엔티티 브리프 (구조 파악용, Read보다 빠름). - - \`cxt check --search \` — FTS5 기반 전역 검색. - - \`cxt check --untested\` / \`--high-risk\` — 수정 전에 위험 포인트 먼저 확인. - - \`cxt bs\` — 정적 bad smell 스캔. - - 레지스트리가 오래됐으면 \`cxt scan\` 먼저 (저렴함). - - 위 \`파일 맵\` 섹션이 있으면 이미 \`cxt\` 결과 — 새로 스캔할 필요 없음. - -## Output (JSON, 마지막에 출력) +주 탐색은 search_files(ripgrep) + read_file. 항상 쓸 수 있고 가장 저렴하다. + +선택: \`cxt\` (코드 레지스트리, 이미 있는 repo에서만 — \`cxt scan\`으로 새로 만들지 말 것): + - \`cxt check \` / \`cxt check --search \` — 엔티티 브리프 / FTS5 검색, 구조 파악은 Read보다 빠름. + - 위 \`파일 맵\` 섹션이 있으면 이미 \`cxt\` 결과 — 새로 스캔 금지. + - \`cxt\`가 "no registry" 류 에러를 내면 그냥 search_files/read_file 사용 — cxt 재시도 금지. + +## 완료? 그냥 작업하면 된다. +도구로 실제 파일을 수정하고 명령을 실행하라. 
파일 변경은 git에서 직접 감지하므로 +JSON 블록으로 성공을 증명할 필요가 없다. 작업이 끝나면 도구 호출을 멈추고, 무엇을 +했는지와 주의사항을 짧은 평문으로 요약하라. + +낮은 확신이나 블로커를 알릴 때만(그럴 때만) 마지막에 이 JSON을 붙여라: \`\`\`json -{ - "success": true, - "summary": "내가 수행한 작업 (1-2문장, 리뷰어 피드백 복사 금지)", - "filesChanged": ["Edit/Write한 파일 전체 경로"], - "commands": ["실행한 bash 명령어"], - "confidencePercent": 85 -} +{ "success": false, "confidencePercent": 40, "haltReason": "막힌 이유" } \`\`\` -불확실하면 confidencePercent 60 미만. filesChanged에 변경한 모든 파일 포함 (전체 경로). +그 외에는 JSON 불필요 — 에러 없이 끝내는 것 자체가 성공 신호다. `; }, diff --git a/src/locale/prompts/prompts.test.ts b/src/locale/prompts/prompts.test.ts index 990d3b8..0ab7807 100644 --- a/src/locale/prompts/prompts.test.ts +++ b/src/locale/prompts/prompts.test.ts @@ -43,11 +43,13 @@ describe('buildWorkerPrompt', () => { expect(result).toContain('Session expires too fast'); }); - it('contains output format JSON block', () => { + it('does not force a JSON success block (git diff is the success signal)', () => { const result = enPrompts.buildWorkerPrompt(base); - expect(result).toContain('"success"'); - expect(result).toContain('"filesChanged"'); - expect(result).toContain('"confidencePercent"'); + // JSON is now optional, only for flagging a halt/low-confidence. + expect(result).toContain('no JSON is needed'); + expect(result).toContain('haltReason'); + // The success path is "stop calling tools and summarize", not a JSON block. + expect(result).toContain('success signal'); }); it('contains rules section', () => { @@ -148,6 +150,36 @@ describe('buildWorkerPrompt', () => { }); expect(result).toContain('no need to Read these files'); }); + + it('with repoMemories: renders repository knowledge with pitfall/pattern tags', () => { + const result = enPrompts.buildWorkerPrompt({ + ...base, + context: { + repoMemories: [ + { type: 'system_pattern', title: 'Solved: auth refactor', content: 'Changed src/auth.ts using token rotation.' 
}, + { type: 'constraint', title: 'Review rejection: session fix', content: 'Do not bypass session validation in middleware.' }, + ], + }, + }); + expect(result).toContain('Repository Knowledge'); + expect(result).toContain('✓ pattern'); + expect(result).toContain('⚠️ PITFALL'); + expect(result).toContain('token rotation'); + expect(result).toContain('avoid repeating past mistakes'); + }); + + it('repoMemories alone is enough to render the Code Context section', () => { + const result = enPrompts.buildWorkerPrompt({ + ...base, + context: { + repoMemories: [ + { type: 'fact', title: 'Build', content: 'Use pnpm, not npm.' }, + ], + }, + }); + expect(result).toContain('Code Context'); + expect(result).toContain('pnpm'); + }); }); // ── 3. buildReviewerPrompt ───────────────────────────────────── diff --git a/src/locale/types.ts b/src/locale/types.ts index f2889c0..acd65f7 100644 --- a/src/locale/types.ts +++ b/src/locale/types.ts @@ -419,6 +419,12 @@ export interface WorkerContext { suggestedApproach: string; projectStats?: string; }; + /** Repo knowledge accumulated from past tasks (memory/repoKnowledge.recallRepoKnowledge) */ + repoMemories?: Array<{ + type: string; // system_pattern (success pattern) | constraint (pitfall) | fact ... + title: string; + content: string; + }>; } export interface PromptTemplates { diff --git a/src/memory/repoKnowledge.ts b/src/memory/repoKnowledge.ts new file mode 100644 index 0000000..9e5ea70 --- /dev/null +++ b/src/memory/repoKnowledge.ts @@ -0,0 +1,119 @@ +// ============================================ +// OpenSwarm - Repository Knowledge (repo-scoped memory) +// Created: 2026-06-10 +// Purpose: Make repository understanding accumulate across tasks — extract repo +// knowledge from task outcomes (write) and inject it into the next +// task's worker prompt (read). Storage reuses the existing memoryCore +// (LanceDB, repo field) — no new storage layer. 
+// vega-agent pattern: relevance-based dynamic injection rather than a +// fixed persona block. +// ============================================ + +import { saveMemory, searchMemorySafe } from './memoryCore.js'; +import type { WorkerResult } from '../agents/agentPair.js'; + +/** Repo knowledge item injected into the worker prompt (mirrors locale WorkerContext) */ +export interface RepoMemoryBrief { + type: string; // system_pattern | constraint | fact ... + title: string; + content: string; +} + +/** Per-memory content cap at injection time — keeps long retros from eating the prompt */ +const MAX_CONTENT_CHARS = 400; +const RECALL_LIMIT = 5; + +/** + * Recall repo knowledge relevant to the current task. + * Always non-blocking — the pipeline runs even if memory is empty or the DB is down. + */ +export async function recallRepoKnowledge( + projectPath: string, + taskTitle: string, + taskDescription: string, +): Promise { + try { + const query = `${taskTitle}\n${taskDescription}`.slice(0, 500); + const result = await searchMemorySafe(query, { + repo: projectPath, + // Include 'belief' — memories from other write paths may have been + // distilled down to belief and would otherwise be filtered out. + types: ['system_pattern', 'constraint', 'fact', 'strategy', 'belief'], + limit: RECALL_LIMIT, + minSimilarity: 0.35, + }); + if (!result.success) return []; + return result.memories.map((m) => ({ + type: m.type, + title: m.title, + content: m.content.length > MAX_CONTENT_CHARS + ? m.content.slice(0, MAX_CONTENT_CHARS) + '…' + : m.content, + })); + } catch { + return []; + } +} + +export interface TaskOutcomeInput { + taskTitle: string; + /** Provenance tracker, e.g. 
Linear issue ID */ + derivedFrom?: string; + workerResult?: Pick | null; + /** Reviewer rejection reason — stored as a constraint (pitfall) when present */ + rejectionFeedback?: string; + /** Pipeline iteration count (1 = passed on the first attempt) */ + iterations?: number; +} + +/** + * Extract and store repo knowledge from a task outcome. + * - Success: which files changed and how it passed → system_pattern (a shortcut + * for the next similar task) + * - Rejection: the pitfall the reviewer flagged → constraint (blocks repeating + * the same mistake) + * skipDistillation: this is already structured knowledge — distillation would + * downgrade the type to 'belief' and drop it from type-filtered recall. + */ +export async function recordTaskOutcome( + projectPath: string, + outcome: TaskOutcomeInput, +): Promise { + try { + if (outcome.rejectionFeedback) { + await saveMemory( + 'constraint', + projectPath, + `Review rejection: ${outcome.taskTitle.slice(0, 80)}`, + `Task "${outcome.taskTitle}" was rejected by the reviewer.\n` + + `Reviewer feedback (avoid repeating this): ${outcome.rejectionFeedback.slice(0, 600)}`, + { derivedFrom: outcome.derivedFrom, isVerified: true, skipDistillation: true }, + ); + return; + } + + const files = outcome.workerResult?.filesChanged ?? []; + if (files.length === 0) return; // nothing to learn from a task that changed no files + + const parts = [ + `Task "${outcome.taskTitle}" completed successfully.`, + `Files changed: ${files.slice(0, 10).join(', ')}${files.length > 10 ? 
` (+${files.length - 10} more)` : ''}.`, + ]; + if (outcome.workerResult?.summary) { + parts.push(`Approach: ${outcome.workerResult.summary.slice(0, 400)}`); + } + if (outcome.iterations && outcome.iterations > 1) { + parts.push(`Took ${outcome.iterations} iterations before passing review.`); + } + await saveMemory( + 'system_pattern', + projectPath, + `Solved: ${outcome.taskTitle.slice(0, 80)}`, + parts.join('\n'), + { derivedFrom: outcome.derivedFrom, isVerified: true, skipDistillation: true }, + ); + } catch (err) { + // Memory write failures must never stop the pipeline + console.warn('[RepoKnowledge] recordTaskOutcome failed (non-critical):', err); + } +} From 116bb97dc079b40e04f2ebd5d7d7b45ed0789cde Mon Sep 17 00:00:00 2001 From: unohee Date: Thu, 11 Jun 2026 00:24:30 +0900 Subject: [PATCH 3/7] feat(benchmarks): L0-L6 difficulty rubric, model routing benchmark, SWE-bench harness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - benchmarks/tasks/codingTasks.ts + modelSelect.ts: 12 synthetic tasks (L0-L5) with deterministic grading (regex / test run / tsc); produces a model x level pass-rate table and a cost-efficiency Pareto ranking used to set the default worker/reviewer/planner routing. - benchmarks/sweBench.ts: L6 = real GitHub issues (SWE-bench Lite). Solve with the OpenSwarm harness (openrouter adapter), grade with the official swebench harness in Docker. Supports hybrid mode: SWE_DIAG_MODEL runs a frontier read-only diagnosis stage, SWE_MODEL implements with the verification loop; SWE_DIAG_FILE reuses a saved diagnosis. - benchmarks/RUBRIC.md: level definitions, recommended models per level, L6 grading procedure and pitfalls, and measured results. 
Headline result: hybrid (gpt-5 diagnosis + lightweight implementer) resolved 3/3 attempted SWE-bench Lite instances (pylint 7080/5859/7993) where every single-model lightweight run had failed — including a re-diagnosis escalate loop on 7993 where the frontier found the bug in its own first fix plan from the failing patch + test output. Evidence under benchmarks/results/. --- benchmarks/RUBRIC.md | 151 ++++++ benchmarks/modelSelect.ts | 333 ++++++++++++ benchmarks/results/latest.json | 56 ++ benchmarks/results/swe_5859_diagnosis.txt | 42 ++ benchmarks/results/swe_7993_diagnosis.txt | 55 ++ benchmarks/results/swe_7993_rediagnosis.txt | 41 ++ benchmarks/results/swe_preds.json | 7 + .../results/swe_pylint_3models_preds.json | 1 + ...we_pylint_7993_hybrid_RESOLVED_report.json | 23 + .../swe_pylint_7993_hybrid_retry_preds.json | 7 + .../swe_pylint_7993_hybrid_v3_preds.json | 7 + .../swe_pylint_7993_hybrid_v4_preds.json | 7 + .../swe_pylint_7993_hybrid_v5_preds.json | 7 + .../swe_pylint_7993_hybrid_v6_preds.json | 7 + .../swe_pylint_7993_hybrid_v7_preds.json | 7 + .../results/swe_pylint_RESOLVED_report.json | 23 + .../results/swe_pylint_gemini_preds.json | 7 + benchmarks/results/swe_pylint_glm_preds.json | 7 + benchmarks/results/swe_pylint_gpt5_preds.json | 7 + .../swe_pylint_hybrid_RESOLVED_preds.json | 7 + .../swe_pylint_hybrid_RESOLVED_report.json | 23 + .../swe_pylint_hybrid_deepseek_preds.json | 7 + .../swe_pylint_hybrid_deepseek_report.json | 23 + .../results/swe_pylint_hybrid_diagnosis.txt | 54 ++ .../results/swe_pylint_hybrid_glm_preds.json | 7 + .../results/swe_pylint_new2_hybrid_preds.json | 12 + .../swe_pylint_new2_hybrid_report.json | 27 + benchmarks/sweBench.ts | 214 ++++++++ benchmarks/tasks/codingTasks.ts | 491 ++++++++++++++++++ benchmarks/throughputProbe.ts | 76 +++ 30 files changed, 1736 insertions(+) create mode 100644 benchmarks/RUBRIC.md create mode 100644 benchmarks/modelSelect.ts create mode 100644 benchmarks/results/latest.json create mode 100644 
benchmarks/results/swe_5859_diagnosis.txt create mode 100644 benchmarks/results/swe_7993_diagnosis.txt create mode 100644 benchmarks/results/swe_7993_rediagnosis.txt create mode 100644 benchmarks/results/swe_preds.json create mode 100644 benchmarks/results/swe_pylint_3models_preds.json create mode 100644 benchmarks/results/swe_pylint_7993_hybrid_RESOLVED_report.json create mode 100644 benchmarks/results/swe_pylint_7993_hybrid_retry_preds.json create mode 100644 benchmarks/results/swe_pylint_7993_hybrid_v3_preds.json create mode 100644 benchmarks/results/swe_pylint_7993_hybrid_v4_preds.json create mode 100644 benchmarks/results/swe_pylint_7993_hybrid_v5_preds.json create mode 100644 benchmarks/results/swe_pylint_7993_hybrid_v6_preds.json create mode 100644 benchmarks/results/swe_pylint_7993_hybrid_v7_preds.json create mode 100644 benchmarks/results/swe_pylint_RESOLVED_report.json create mode 100644 benchmarks/results/swe_pylint_gemini_preds.json create mode 100644 benchmarks/results/swe_pylint_glm_preds.json create mode 100644 benchmarks/results/swe_pylint_gpt5_preds.json create mode 100644 benchmarks/results/swe_pylint_hybrid_RESOLVED_preds.json create mode 100644 benchmarks/results/swe_pylint_hybrid_RESOLVED_report.json create mode 100644 benchmarks/results/swe_pylint_hybrid_deepseek_preds.json create mode 100644 benchmarks/results/swe_pylint_hybrid_deepseek_report.json create mode 100644 benchmarks/results/swe_pylint_hybrid_diagnosis.txt create mode 100644 benchmarks/results/swe_pylint_hybrid_glm_preds.json create mode 100644 benchmarks/results/swe_pylint_new2_hybrid_preds.json create mode 100644 benchmarks/results/swe_pylint_new2_hybrid_report.json create mode 100644 benchmarks/sweBench.ts create mode 100644 benchmarks/tasks/codingTasks.ts create mode 100644 benchmarks/throughputProbe.ts diff --git a/benchmarks/RUBRIC.md b/benchmarks/RUBRIC.md new file mode 100644 index 0000000..0d73caf --- /dev/null +++ b/benchmarks/RUBRIC.md @@ -0,0 +1,151 @@ +# OpenSwarm 코딩 
벤치마크 루브릭 (L0–L6) + +OpenSwarm 하네스(worker = `runAgenticLoop`, openrouter 어댑터)의 코딩 능력을 난이도별로 +측정하고, 각 난이도에 **비용효율적인 모델**을 데이터로 라우팅하기 위한 루브릭. + +> 측정 대상은 **하네스 + 모델**의 결합이다. codex 어댑터(Codex CLI 위임)는 OpenSwarm +> 하네스를 우회하므로 측정에서 제외 — 반드시 openrouter 어댑터로 돈다. + +## 난이도 사다리 + +| Lv | 이름 | 검증하는 능력 | 채점 방식 | 인프라 | +|----|------|--------------|-----------|--------| +| **L0** | 단일 수정 | 한 줄~한 함수 버그픽스 | 정규식 (`check()`) | 즉시 | +| **L1** | 탐색+수정 | 가드 추가, 단순 기능 | 정규식 | 즉시 | +| **L2** | 다중 파일 | 리네임/시그니처 연쇄 (3~4 파일) | 정규식 | 즉시 | +| **L3** | 테스트 통과 | 스텁 구현해 기존 테스트 green | **테스트 실행** (`tsx`) | 즉시 | +| **L4** | 고난도 | 깊은 의존성 연쇄, edge case 완전성, 숨은 버그 추적, 타입 변경 | 테스트 실행 + **tsc** | 즉시 | +| **L5** | 난해 | 알고리즘 정확성(merge-intervals/LRU), 상태기계(tokenizer), 제네릭 타입 | 테스트 실행 | 즉시 | +| **L6** | **실전** | **실제 GitHub 이슈** (SWE-bench Lite) — 대형 repo 탐색 + 근본원인 + 정확한 patch | **공식 swebench 하니스** (Docker) | OrbStack, 분 단위 | + +- **L0–L5**: `benchmarks/tasks/codingTasks.ts` (합성, self-contained). 빠른 회귀 테스트. + `npx tsx benchmarks/modelSelect.ts --repeat N`. 채점은 LLM judge 없는 결정적 방식. +- **L6**: `benchmarks/sweBench.ts`. SWE-bench Lite instance를 OpenSwarm worker가 풀고, + 공식 `swebench.harness.run_evaluation`이 FAIL_TO_PASS+PASS_TO_PASS로 채점. + +## 레벨별 추천 모델 (측정 기반) + +벤치 데이터(`benchmarks/results/`)로 도출. 점수 = pass_rate → $/pass → tool calls. + +| Lv | 추천 worker 모델 | 근거 | +|----|------------------|------| +| L0–L3 | **z-ai/glm-4.7-flash** 또는 deepseek-v4-flash | 100% pass, $0.002~0.004/pass. glm은 2759 tok/s(DeepInfra)로 최속. 경량으로 충분 | +| L4 | 경량 + escalate | 경량 모델도 대부분 통과(100%), 실패 시 frontier escalate | +| L5 | 경량 (일부 실패 감수) | glm/qwen이 L5-lru 등 1~2개 실패(87~95%). 
escalate가 흡수 | +| **L6** | **frontier (openai/gpt-5)** | **경량은 정답 정확도 부족** — 아래 L6 측정 참조 | + +### L6 실측 (pylint-dev__pylint-7080, 2026-06) + +| 모델 | patch | 결과 | 비고 | +|------|-------|------|------| +| **openai/gpt-5** | ✅ | **RESOLVED** | `expand_modules.py` 정답 위치 (`os.path.relpath`) | +| gemini-2.5-flash | ✅ | unresolved | `pylinter.py`만 — 정답 위치 빗나감 | +| glm-4.7-flash | ✅ | unresolved | 정답 파일 건드렸으나 부정확 | +| qwen3-coder-30b | ✅ | unresolved | 부정확 | +| deepseek-v4-flash | ❌ | (빈 patch) | 수정 미도달 | +| gpt-5-mini | ❌ | (빈 patch) | 수정 미도달 | + +→ **이 instance는 frontier(gpt-5)만 풀었다 (1/6).** 경량 모델은 압축 임계 수정(24k→60k) 후 +patch는 생성하지만 **정답 정확도가 frontier에 못 미친다.** SWE-bench Lite는 frontier도 +30~50%대 난이도이므로 L6은 frontier 라우팅 + 충분한 maxTurns(80) 필요. + +### "검증 강제로 경량을 통과시킬 수 있나?" 실험 (v2, 천장 검증) + +검증 루프를 MANDATORY로 강화("edit 후 반드시 run_tests.sh, 실패면 반복")하고 재측정: + +| 모델 | v1 (검증 선택) | v2 (검증 강제 + 모든 하네스 수정) | +|------|----------------|----------------------------------| +| gemini-2.5-flash | edit 1, 검증 0 → 틀린 patch | **edit 9 + test 13회 반복** → 그래도 unresolved (FAIL_TO_PASS 0/1, PASS_TO_PASS 120/120 무사) | +| deepseek-v4-flash | edit 0 (압축 수정 전) | **여전히 edit 0** — 80턴 탐색만, 수정 결정 못 내림 | + +**결론: 이 난이도의 진단형 버그에서는 사실상 모델 천장.** 검증 강제는 행동을 크게 +바꿨지만(blind 제출 → 반복 루프), 정답에 필요한 통찰("재귀 탐색의 절대/상대경로 표현 +불일치")은 피드백 13회로도 못 얻었다. 하네스가 줄 수 있는 건 기회(루프·컨텍스트)지 +진단 깊이가 아니다. + +### 하이브리드 실험: frontier 진단 + 경량 구현 — ✅ 시도한 3 instance 전부 RESOLVED (3/3) + +planner/worker 분리 가설을 실측: **gpt-5가 read-only 진단**(root cause + 구체적 fix plan) +→ **경량 모델이 구현 + 검증 루프** → 공식 swebench 채점. **통과 누적 4회** — instance를 +바꿔도(5859, 7993), 구현자를 바꿔도(deepseek) 재현된다. 
+ +| 구성 | instance | 결과 | +|------|----------|------| +| gemini 단독 (검증 강제, 9 edit + 13 test) | 7080 | unresolved — 진단 실패 | +| **gpt-5 진단(52턴 read-only) + gemini 구현(3 edit + 2 test)** | 7080 | **RESOLVED** ✅ | +| **gpt-5 진단 + deepseek-v4-flash 구현** (단독은 0 edit이던 모델) | 7080 | **RESOLVED** ✅ | +| **gpt-5 진단 + gemini 구현** (새 instance 풀 파이프라인) | 5859 | **RESOLVED** ✅ | +| gpt-5 진단 + glm-4.7-flash | 7080 | 빈 patch — **구현자 부적합** (no-edit 가드 무시, 0 edit) | +| gpt-5 진단 + gemini (v1~v5) | 7993 | unresolved — 1차 진단서의 pseudocode 버그를 구현자가 충실 복제 | +| **gpt-5 재진단(실패 patch+테스트 출력 피드백) + deepseek 구현** | 7993 | **RESOLVED** ✅ | + +→ **경량 모델의 L6 천장은 "진단 깊이"이고, 그 부분만 frontier가 메우면 통과한다.** +단 구현자 적합성은 모델별로 갈린다: deepseek ✅✅(기계적 마무리 안정) / gemini ✅(import 누락 +등 마무리 변동) / glm ✗. + +**재진단 escalate 루프 (7993이 입증한 완성형)**: 1차 진단의 fix plan에 버그가 있으면 경량 +구현자는 그것을 충실히 복제한다("trust this analysis" — 신뢰 경계 지침으로도 못 뚫음, 4회 +연속). 해법은 구현자 설득이 아니라 **(실패 patch + 테스트 출력)을 들고 frontier 재진단** — +gpt-5는 피드백을 받자 자기 pseudocode의 버그(Formatter.parse literal re-escape 누락)를 정확히 +짚었고, deepseek이 그 재진단서로 완주했다. OpenSwarm worker escalate 루프와 동일 구조. +SWE_DIAG_MODEL=openai/gpt-5 + SWE_MODEL=경량으로 실행. 진단은 재사용 가능(SWE_DIAG_FILE) — +같은 instance의 stage 2 재시도 시 frontier 비용 0. + +운영 함의: L6급 작업도 "frontier planner가 분석 → 경량 worker가 구현" 분업으로 frontier +full-solve(82턴) 대비 frontier 사용을 진단(52턴 read-only)으로 줄일 수 있다. 경량 +구현자는 변동성이 있다(같은 진단으로도 조기 포기 가능) — no-edit 가드(`nudgeMaxOnNoEdit`)와 +풍부한 진단(구체적 pseudocode 포함)이 성공 요인. best-of-N은 기대 낮음(진단 없는 gemini +9회 시도 전부 오답). + +하이브리드의 추가 결함 모드 (7993에서 발견): +- **진단서 오류 전파**: 진단서 pseudocode 자체에 버그가 있으면(Formatter.parse literal + re-escape 누락) 구현자가 "trust this analysis" 지시 탓에 그 버그를 충실히 복제한다(3회 + 연속 동일 patch). → stage 2 지침에 "THE TEST RESULT OUTRANKS THE PLAN" 신뢰 경계 추가. +- **검증 하네스 자가 해체** (결함 6호): 테스트 실패 원인을 검증 스크립트로 오판해 + run_tests.sh를 5회 수정. → `protectedFiles` 옵션 (edit/write 거부). +- **bash 침묵 타임아웃** (결함 7호): 30초 고정 타임아웃이 docker 경유 테스트에서 출력 없이 + 죽어 "환경 고장"으로 오판 유도. → `bashTimeoutMs` 옵션 + 명시적 TIMEOUT 메시지. 
+ +## 라우팅 원칙 (티어링) + +- **판단 무거운 역할** (Planner/분해, Reviewer): frontier 고정 (gpt-5). 잘못된 판단이 + 하류 전체를 오염시키므로 경량화 안 함. +- **실행 역할** (Worker/Tester/Documenter/Auditor): 경량 기본 + 실패 2회 시 frontier escalate. + - 단 **L6급 실전 작업은 worker도 frontier 권장** — 경량으로는 정답률이 낮다. + +## L6 채점 절차 + +```bash +# 1. OrbStack 사용 (Apple Silicon에서 amd64 에뮬레이션 안정). Docker Desktop은 손상됨. +export DOCKER_HOST="unix:///Users/$USER/.orbstack/run/docker.sock" + +# 2. OpenSwarm worker가 instance를 풀어 prediction 생성 +OPENROUTER_API=... SWE_MODEL=openai/gpt-5 \ + npx tsx benchmarks/sweBench.ts + +# 3. 공식 swebench 하니스로 채점 (per-model, max_workers 1 — 동시 다중은 VM 부하) +/path/swebench-env/bin/python -m swebench.harness.run_evaluation \ + --dataset_name SWE-bench/SWE-bench_Lite --predictions_path <preds.json> \ + --run_id <run_id> --instance_ids <instance_id> --cache_level instance --max_workers 1 +``` + +### L6 함정 (전부 측정으로 확인) +- **OrbStack 필수.** Docker Desktop은 amd64 SWE-bench 워크로드에서 매번 "unable to start" + 503 손상 → 재부팅 필요. OrbStack은 안정적으로 완주. +- 옛 instance는 당대 Python(3.6~3.9) 필수 → 공식 Docker 이미지의 conda env "testbed" 사용. + naive venv 불가 (`cgi`/`collections.Mapping` 제거). +- requests 옛 instance는 외부 httpbin 의존(503) → 부적합. **순수 로직 repo**(pylint/sympy/ + sphinx) 권장. +- 같은 instance_id를 여러 모델로 한 prediction 파일에 넣으면 마지막 1개만 채점됨 → **모델별 + 분리 채점**. +- 이미지 태그: `swebench/sweb.eval.x86_64.<instance_id>`. + +## 하네스 결함 — L6에서 발견·수정 (합성 L0–L5에선 안 드러남) + +대형 repo에서만 발현하는 결함 3건을 L6이 잡아냈다: +1. **cwd 미인지**: agenticLoop이 모델에게 작업 디렉터리를 안 알려줘 절대경로 추측 → 탐색 차단. + → user 프롬프트에 `Working directory: <cwd>` 주입. +2. **bash exit-1 오판**: grep "매치 없음"(exit 1)을 치명 에러로 처리, stdout 미반환 → 무한 반복. + → 에러 시에도 stdout/stderr+exit code 반환, exit1+무출력은 benign. +3. **압축 무한 루프** (핵심): 긴 작업(60+턴)에서 압축이 읽은 파일을 깎아 무한 재read → + edit 도달 못 함. → 임계 24k→60k, compactAfterMessages 24→60, keepRecent 8→16. 
diff --git a/benchmarks/modelSelect.ts b/benchmarks/modelSelect.ts new file mode 100644 index 0000000..ba3768a --- /dev/null +++ b/benchmarks/modelSelect.ts @@ -0,0 +1,333 @@ +#!/usr/bin/env tsx +// ============================================ +// OpenSwarm - Model Selection Benchmark +// Created: 2026-06-09 +// Purpose: measure per-model worker quality and cost on coding tasks to find the Pareto frontier. +// Port of VEGA benchmarks/model_select.py: score = pass_rate → actual cost → number of turns. +// No automatic switching — only the ranking is printed; a human applies it to config (keeps control). +// +// Usage: +// source ~/dev/VEGA/.env (OPENROUTER_API required) +// npx tsx benchmarks/modelSelect.ts --repeat 3 +// npx tsx benchmarks/modelSelect.ts --model openai/gpt-5 --model qwen/qwen3-coder +// npx tsx benchmarks/modelSelect.ts --task L0-fix-multiply --repeat 5 +// ============================================ + +import { mkdtemp, writeFile, rm, mkdir } from 'node:fs/promises'; +import { existsSync, readFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join, dirname } from 'node:path'; +import { execFile } from 'node:child_process'; +import { promisify } from 'node:util'; +import { runWorker } from '../src/agents/worker.js'; +import { setDefaultAdapter } from '../src/adapters/index.js'; +import { initLocale } from '../src/locale/index.js'; +import { CODING_TASKS, type BenchTask } from './tasks/codingTasks.js'; + +const exec = promisify(execFile); + +// ---- Candidate model pool ---- +// Focused on non-frontier open models (the gpt family is compared in a separate measurement). +// baseline = gemini-2.5-flash (winner of the previous round with 100% pass). 
+const DEFAULT_CANDIDATES = [ + 'google/gemini-2.5-flash', // baseline (previous winner) + 'deepseek/deepseek-v4-pro', // deepseek flagship + 'deepseek/deepseek-v4-flash', // deepseek lightweight + 'minimax/minimax-m3', // latest MiniMax + 'z-ai/glm-5', // latest GLM + 'z-ai/glm-4.7-flash', // ultra-low-cost GLM + 'moonshotai/kimi-k2-thinking', // Kimi, strong at coding + 'qwen/qwen3-coder-30b-a3b-instruct',// qwen lightweight coder +]; + +interface RunResult { + model: string; + taskId: string; + rep: number; + passed: boolean; + reason: string; + toolCalls: number; + apiCalls: number; + promptTokens: number; + completionTokens: number; + costUsd: number; + durationMs: number; +} + +interface ModelAgg { + model: string; + runs: number; + passes: number; + passRate: number; + avgCostUsd: number; + costPerPass: number; // cost per successful run (the key efficiency metric) + avgToolCalls: number; + avgDurationMs: number; +} + +// ---- OpenRouter price catalog: model id → per-token prompt (in) / completion (out) price ---- +async function fetchPrices(apiKey: string): Promise<Map<string, { in: number; out: number }>> { + const res = await fetch('https://openrouter.ai/api/v1/models', { + headers: { Authorization: `Bearer ${apiKey}` }, + }); + const data = (await res.json()) as { data: Array<{ id: string; pricing?: { prompt?: string; completion?: string } }> }; + const map = new Map<string, { in: number; out: number }>(); + for (const m of data.data) { + map.set(m.id, { + in: parseFloat(m.pricing?.prompt ?? '0'), + out: parseFloat(m.pricing?.completion ?? 
'0'), + }); + } + return map; +} + +async function setupRepo(task: BenchTask): Promise<string> { + const dir = await mkdtemp(join(tmpdir(), 'osw-bench-')); + for (const [rel, content] of Object.entries(task.files)) { + const full = join(dir, rel); + await mkdir(dirname(full), { recursive: true }); + await writeFile(full, content); + } + await exec('git', ['init', '-q'], { cwd: dir }); + await exec('git', ['config', 'user.email', 'bench@local'], { cwd: dir }); + await exec('git', ['config', 'user.name', 'bench'], { cwd: dir }); + await exec('git', ['add', '-A'], { cwd: dir }); + await exec('git', ['commit', '-qm', 'init'], { cwd: dir }); + return dir; +} + +async function runOne( + task: BenchTask, + model: string, + rep: number, + prices: Map<string, { in: number; out: number }>, +): Promise<RunResult> { + const dir = await setupRepo(task); + const logs: string[] = []; + const t0 = Date.now(); + + try { + await runWorker({ + taskTitle: task.title, + taskDescription: task.description, + projectPath: dir, + adapterName: 'openrouter', + model, + timeoutMs: 240_000, + maxTurns: 20, + onLog: (l) => logs.push(l), + }); + } catch (err) { + logs.push(`FATAL: ${err instanceof Error ? err.message : String(err)}`); + } + + const durationMs = Date.now() - t0; + + // Grading: hand the repo state to the task checker through a read function + const read = (rel: string): string | null => { + try { + return readFileSync(join(dir, rel), 'utf-8'); + } catch { + return null; + } + }; + const verdict = task.check(read, dir); + + // Extract metrics from the collected log lines + const toolCalls = logs.filter((l) => l.includes('🔧')).length; + const apiCalls = logs.filter((l) => l.includes('API call #')).length; + const tokenLine = logs.find((l) => /\d+ tokens/.test(l)) ?? ''; + const totalTokens = Number(tokenLine.match(/(\d+) tokens/)?.[1] ?? 0); + // The adapter only reports a combined total, so the prompt/completion split is approximated: 80% input / 20% output. 
+ // (For precise measurement, extend the adapter to return usage split by kind.) + const promptTokens = Math.round(totalTokens * 0.8); + const completionTokens = totalTokens - promptTokens; + + const price = prices.get(model) ?? 
{ in: 0, out: 0 }; + const costUsd = promptTokens * price.in + completionTokens * price.out; + + await rm(dir, { recursive: true, force: true }); + + return { + model, taskId: task.id, rep, + passed: verdict.passed, reason: verdict.reason, + toolCalls, apiCalls, promptTokens, completionTokens, costUsd, durationMs, + }; +} + +function aggregate(results: RunResult[]): ModelAgg[] { + const byModel = new Map<string, RunResult[]>(); + for (const r of results) { + if (!byModel.has(r.model)) byModel.set(r.model, []); + byModel.get(r.model)!.push(r); + } + + const aggs: ModelAgg[] = []; + for (const [model, rs] of byModel) { + const passes = rs.filter((r) => r.passed).length; + const passRate = passes / rs.length; + const avgCostUsd = rs.reduce((s, r) => s + r.costUsd, 0) / rs.length; + const totalCost = rs.reduce((s, r) => s + r.costUsd, 0); + aggs.push({ + model, + runs: rs.length, + passes, + passRate, + avgCostUsd, + costPerPass: passes > 0 ? totalCost / passes : Infinity, + avgToolCalls: rs.reduce((s, r) => s + r.toolCalls, 0) / rs.length, + avgDurationMs: rs.reduce((s, r) => s + r.durationMs, 0) / rs.length, + }); + } + + // VEGA ranking: pass_rate descending → costPerPass ascending → toolCalls ascending + aggs.sort((a, b) => + b.passRate - a.passRate || + a.costPerPass - b.costPerPass || + a.avgToolCalls - b.avgToolCalls); + return aggs; +} + +function fmtUsd(n: number): string { + if (!isFinite(n)) return '∞'; + if (n < 0.001) return `$${(n * 1000).toFixed(3)}m`; // milli-dollar + return `$${n.toFixed(4)}`; +} + +/** Maps a taskId to its level (the difficulty ladder in RUBRIC.md) */ +function levelOf(taskId: string): string { + const m = taskId.match(/^(L\d)/); + return m ? m[1] : '?'; +} + +/** + * Model × level pass-rate table. The core of RUBRIC.md — "at which difficulty do the models diverge". + * Lightweight models usually pass L0~L4 and then break down at L5~L6 (the discrimination point). 
+ */ +function levelTable(results: RunResult[]): string { + const levels = ['L0', 'L1', 'L2', 'L3', 'L4', 'L5']; + const models = [...new Set(results.map((r) => r.model))]; + const cell = new Map<string, { p: number; n: number }>(); + for (const r of results) { + const k = `${r.model}|${levelOf(r.taskId)}`; + const c = cell.get(k) ?? { p: 0, n: 0 }; + c.n++; if (r.passed) c.p++; + cell.set(k, c); + } + const lines: string[] = []; + lines.push('\n========== LEVEL × MODEL pass-rate (RUBRIC discrimination) =========='); + lines.push(`${'model'.padEnd(34)} ${levels.map((l) => l.padStart(5)).join(' ')}`); + for (const model of models) { + const cells = levels.map((l) => { + const c = cell.get(`${model}|${l}`); + return c && c.n ? `${Math.round((c.p / c.n) * 100)}%`.padStart(5) : ' - '; + }); + lines.push(`${model.padEnd(34)} ${cells.join(' ')}`); + } + lines.push('(L6 = 실전 SWE-bench, 별도 채점 — benchmarks/sweBench.ts + RUBRIC.md)'); + return lines.join('\n'); +} + +function report(aggs: ModelAgg[], baseline: string, results: RunResult[]): string { + const lines: string[] = []; + lines.push(levelTable(results)); + lines.push('\n========== MODEL RANKING (pass_rate → $/pass → tool calls) =========='); + lines.push('rank model pass $/pass avg$ tools dur'); + aggs.forEach((a, i) => { + const tag = a.model === baseline ? 
' (baseline)' : ''; + lines.push( + `${String(i + 1).padEnd(4)} ${a.model.padEnd(26)} ` + + `${(a.passRate * 100).toFixed(0).padStart(3)}% ` + + `${fmtUsd(a.costPerPass).padStart(8)} ` + + `${fmtUsd(a.avgCostUsd).padStart(8)} ` + + `${a.avgToolCalls.toFixed(1).padStart(5)} ` + + `${(a.avgDurationMs / 1000).toFixed(0)}s${tag}`, + ); + }); + + // Pareto check: models with at least the baseline's pass_rate and a lower $/pass; the top-ranked one is reported + const base = aggs.find((a) => a.model === baseline); + if (base) { + const asGoodCheaper = aggs.filter( + (a) => a.model !== baseline && a.passRate >= base.passRate && a.costPerPass < base.costPerPass, + ); + lines.push('\n========== PARETO FINDING =========='); + lines.push(`baseline: ${baseline} — pass ${(base.passRate * 100).toFixed(0)}%, $/pass ${fmtUsd(base.costPerPass)}`); + if (asGoodCheaper.length > 0) { + const best = asGoodCheaper[0]; + const savings = ((1 - best.costPerPass / base.costPerPass) * 100).toFixed(0); + lines.push(`✅ ${best.model} matches quality (${(best.passRate * 100).toFixed(0)}%) at ${savings}% lower $/pass`); + lines.push(` → safe to route here; keep ${baseline} as escalate target.`); + } else { + lines.push(`⚠️ No cheaper model matched ${baseline}'s pass rate. Keep frontier or add more repeats.`); + } + } + return lines.join('\n'); +} + +// ---- CLI ---- +function parseArgs(argv: string[]) { + const models: string[] = []; + const taskIds: string[] = []; + let repeat = 3; + let baseline = 'google/gemini-2.5-flash'; + for (let i = 0; i < argv.length; i++) { + if (argv[i] === '--model') models.push(argv[++i]); + else if (argv[i] === '--task') taskIds.push(argv[++i]); + else if (argv[i] === '--repeat') repeat = Number(argv[++i]); + else if (argv[i] === '--baseline') baseline = argv[++i]; + } + return { + models: models.length ? models : DEFAULT_CANDIDATES, + tasks: taskIds.length ? 
CODING_TASKS.filter((t) => taskIds.includes(t.id)) : CODING_TASKS, + repeat, baseline, + }; +} + +async function main() { + initLocale('en'); + setDefaultAdapter('openrouter'); + + const apiKey = process.env.OPENROUTER_API; + if (!apiKey) { + console.error('OPENROUTER_API not set. Run: source ~/dev/VEGA/.env'); + process.exit(1); + } + + const { models, tasks, repeat, baseline } = parseArgs(process.argv.slice(2)); + console.log(`[bench] models=${models.length} tasks=${tasks.length} repeat=${repeat}`); + console.log(`[bench] total runs = ${models.length * tasks.length * repeat}`); + + const prices = await fetchPrices(apiKey); + + // Models run in parallel — every run gets its own temporary repo, so they cannot collide. + // Within one model, task × repeat runs sequentially to stay under that model's rate limit. + // Slow models such as gpt-5 no longer block fast ones, which cuts total wall-clock time considerably. + const perModel = await Promise.all( + models.map(async (model) => { + const out: RunResult[] = []; + for (const task of tasks) { + for (let rep = 1; rep <= repeat; rep++) { + const r = await runOne(task, model, rep, prices); + out.push(r); + const mark = r.passed ? 
'✅' : '❌'; + console.log(` ${mark} ${model.padEnd(24)} ${task.id.padEnd(24)} rep${rep} ${fmtUsd(r.costUsd)} ${r.toolCalls}tc ${(r.durationMs / 1000).toFixed(0)}s ${r.reason}`); + } + } + return out; + }), + ); + const results: RunResult[] = perModel.flat(); + + const aggs = aggregate(results); + console.log(report(aggs, baseline, results)); + + // Save results (no timestamp is taken as an argument, so the output file name is fixed — it is overwritten) + const outDir = join(dirname(new URL(import.meta.url).pathname), 'results'); + if (!existsSync(outDir)) await mkdir(outDir, { recursive: true }); + const outPath = join(outDir, 'latest.json'); + await writeFile(outPath, JSON.stringify({ results, aggs, baseline }, null, 2)); + console.log(`\n[bench] raw results → ${outPath}`); +} + +main().catch((e) => { console.error('FATAL:', e); process.exit(1); }); diff --git a/benchmarks/results/latest.json b/benchmarks/results/latest.json new file mode 100644 index 0000000..83696c4 --- /dev/null +++ b/benchmarks/results/latest.json @@ -0,0 +1,56 @@ +{ + "results": [ + { + "model": "z-ai/glm-4.7-flash", + "taskId": "L0-fix-multiply", + "rep": 1, + "passed": true, + "reason": "multiply=a*b, add intact", + "toolCalls": 4, + "apiCalls": 5, + "promptTokens": 6383, + "completionTokens": 1596, + "costUsd": 0.00102138, + "durationMs": 6091 + }, + { + "model": "z-ai/glm-4.7-flash", + "taskId": "L2-rename-across-files", + "rep": 1, + "passed": true, + "reason": "def + import + call all renamed", + "toolCalls": 7, + "apiCalls": 6, + "promptTokens": 9778, + "completionTokens": 2444, + "costUsd": 0.00156428, + "durationMs": 8043 + }, + { + "model": "z-ai/glm-4.7-flash", + "taskId": "L5-merge-intervals", + "rep": 1, + "passed": true, + "reason": "test passed (executed)", + "toolCalls": 4, + "apiCalls": 4, + "promptTokens": 6058, + "completionTokens": 1515, + "costUsd": 0.00096948, + "durationMs": 5993 + } + ], + "aggs": [ + { + "model": "z-ai/glm-4.7-flash", + "runs": 3, + "passes": 3, + 
"passRate": 1, + "avgCostUsd": 0.0011850466666666667, + 
"costPerPass": 0.0011850466666666667, + "avgToolCalls": 5, + "avgDurationMs": 6709 + } + ], + "baseline": "z-ai/glm-4.7-flash" +} \ No newline at end of file diff --git a/benchmarks/results/swe_5859_diagnosis.txt b/benchmarks/results/swe_5859_diagnosis.txt new file mode 100644 index 0000000..001a32f --- /dev/null +++ b/benchmarks/results/swe_5859_diagnosis.txt @@ -0,0 +1,42 @@ +ROOT CAUSE +- Location: pylint/checkers/misc.py, class EncodingChecker, method open() +- Mechanism: The regex built for fixme detection appends a trailing word boundary (\b): + - If notes_rgx is set: r"#\s*({notes}|{self.config.notes_rgx})\b" + - Else: r"#\s*({notes})\b" +- Why it fails: \b matches only between a word char [A-Za-z0-9_] and a non-word char. For tags that are entirely punctuation (e.g., "???"), the character before and after the tag are non-word characters, so \b does not match. As a result, comments like "# ???: no" never satisfy the pattern, and no W0511 is emitted. +- Evidence: + - misc.py lines 119–129 show regex construction with \b. + - process_tokens uses self._fixme_pattern.search("#" + comment_text.lower()), anchoring the tag at the start of the comment. The only delimiter check is the trailing \b, which blocks punctuation-only tags. + - Tests explicitly assert not matching mid-line tags and not matching "Todoist" (which relies on the boundary after the tag). Replacing \b with a non-word-or-EOL lookahead preserves these behaviors. + +FIX PLAN +- File: pylint/checkers/misc.py +- Function: EncodingChecker.open() +- Change: Replace the trailing word boundary \b with a lookahead that ensures the next character is a non-word character or end-of-line. This keeps current behavior for alphanumeric tags and enables punctuation-only tags. 
+ +Concrete edit: +- Current: + - if self.config.notes_rgx: + regex_string = rf"#\s*({notes}|{self.config.notes_rgx})\b" + else: + regex_string = rf"#\s*({notes})\b" +- Replace with: + - if self.config.notes_rgx: + regex_string = rf"#\s*({notes}|{self.config.notes_rgx})(?=\W|$)" + else: + regex_string = rf"#\s*({notes})(?=\W|$)" +- Rationale: + - (?=\W|$) accepts punctuation-only tags (e.g., "???", "C++") because the following char is typically punctuation or EOL. + - It still prevents false positives like "Todoist" because the next char after "TODO" is a word char, failing the lookahead. + - Behavior for default tags (FIXME/XXX/TODO) remains unchanged relative to \b. + +Reports +- Files modified: none +- Commands run: + - search_files: pattern="fixme" in ".", glob="**/*" + - search_files: pattern="class .*Fixme" in ".", glob="**/*.py" + - search_files: pattern="fixme" in "pylint/checkers", glob="**/*.py" + - read_file: "pylint/checkers/misc.py" + - read_file: "tests/checkers/unittest_misc.py" + - search_files: pattern="non_alphanumeric_codetag" in "tests", glob="**/*" + - search_files: pattern="non_alphanumeric|punctuation|codetag|notes-rgx" in "tests", glob="**/*" \ No newline at end of file diff --git a/benchmarks/results/swe_7993_diagnosis.txt b/benchmarks/results/swe_7993_diagnosis.txt new file mode 100644 index 0000000..c8e0c8d --- /dev/null +++ b/benchmarks/results/swe_7993_diagnosis.txt @@ -0,0 +1,55 @@ +ROOT CAUSE +- Location: pylint/reporters/text.py, TextReporter.on_set_current_module +- Code: + - Extracts fields from the user-supplied --msg-template using regex: r"\{(.+?)(:.*)?\}" + - For each match, warns if not in MESSAGE_FIELDS and removes it via re.sub. +- Failure mechanism: + - The regex does not account for escaped braces used to output literal braces in str.format (i.e., “{{ … }}”). 
+ - Given template: '{{ "Category": "{category}" }}' + - The regex incorrectly matches the entire '{ "Category": "{category}" }' as a “field” and treats '"Category": "{category}"' as the field name. + - This triggers the warning “Don't recognize the argument '{ "Category"'” and then strips chunks of the template, leaving only stray quotes and braces, producing the observed broken output: + " } + ... +- Why it regressed: + - Older Pylint (1.7) didn’t pre-parse the template to validate fields; Python’s str.format handled double braces correctly. + - Since 2.x, template validation via the naive regex misinterprets escaped braces. + +FIX PLAN +- File: pylint/reporters/text.py +- Function: TextReporter.on_set_current_module +- Changes: + 1) Import string at top of file: import string + 2) Replace the regex-based extraction and removal logic with Python’s formatter parser to properly handle escaped braces and real fields. + +Concrete patch description (pseudocode-level exactness): +- Remove: + - arguments = re.findall(r"\{(.+?)(:.*)?\}", template) + - for argument in arguments: + if argument[0] not in MESSAGE_FIELDS: + warnings.warn(...) + template = re.sub(r"\{" + argument[0] + r"(:.*?)?\}", "", template) + - self._fixed_template = template +- Add: + - formatter = string.Formatter() + - parts = [] + - for literal, field_name, format_spec, conversion in formatter.parse(template): + - parts.append(literal) + - if field_name is None: + - continue # no field here (also covers escaped braces which are returned as literals) + - if field_name not in MESSAGE_FIELDS: + - warnings.warn(f"Don't recognize the argument '{field_name}' in the --msg-template. 
Are you sure it is supported on the current version of pylint?") + - continue # drop unknown fields by inserting nothing + - conv = f"!{conversion}" if conversion else "" + - spec = f":{format_spec}" if format_spec else "" + - parts.append("{" + field_name + conv + spec + "}") + - self._fixed_template = "".join(parts) + +Notes: +- Using string.Formatter().parse correctly ignores escaped double braces and yields actual field names (e.g., 'category') without false positives. +- Preserves format specs and optional conversion flags if users supply them. +- Continues to warn and drop unknown fields, preserving existing behavior and tests like test_template_option_non_existing. +- This should also address the failing tests around header/template handling (e.g., tests/reporters/unittest_reporting.py::test_template_option_with_header) by preventing false field detection in strings containing literals with braces. + +Reports +- Files modified: none +- Commands run: none \ No newline at end of file diff --git a/benchmarks/results/swe_7993_rediagnosis.txt b/benchmarks/results/swe_7993_rediagnosis.txt new file mode 100644 index 0000000..4a4098f --- /dev/null +++ b/benchmarks/results/swe_7993_rediagnosis.txt @@ -0,0 +1,41 @@ +ROOT CAUSE +- Where: pylint/reporters/text.py, TextReporter.on_set_current_module. +- What goes wrong: The failed fix replaced the regex-based field scan with string.Formatter().parse() and then rebuilt self._fixed_template by concatenating: + - literal parts as returned by parse() + - valid fields re-inserted as {field_name[!conversion][:format_spec]}. +- Why it breaks: string.Formatter().parse() returns literal_text with escaped braces already unescaped. For example, the template '{{ "Category": "{category}" }}' is parsed with literal_text = '{ "Category": "' (single brace) before the field and '" }' (single brace) after. The fix appended those literals verbatim, so self._fixed_template contains single braces that are not escape-escaped. 
When write_message later calls self._fixed_template.format(**self_dict), str.format treats the leading single '{' as the start of a new replacement field and tries to look up a key named ' "Category"', leading to KeyError: ' "Category"'. This matches the observed failure. + +FIX PLAN +- File: pylint/reporters/text.py +- Function: TextReporter.on_set_current_module +- Change: When rebuilding _fixed_template from string.Formatter().parse(template), re-escape braces in literal segments before appending them. Keep the existing handling for known/unknown fields and the conversion/format_spec reconstitution. + +Concrete patch outline: +- At top of file (imports), ensure: + import string +- In on_set_current_module, replace the current regex logic with: + + formatter = string.Formatter() + parts: list[str] = [] + for literal, field_name, format_spec, conversion in formatter.parse(template): + # Re-escape braces in literals so a later .format() won’t re-interpret them + if literal: + parts.append(literal.replace("{", "{{").replace("}", "}}")) + if field_name is None: + continue # no field here (covers escaped braces) + if field_name not in MESSAGE_FIELDS: + warnings.warn( + f"Don't recognize the argument '{field_name}' in the --msg-template. " + "Are you sure it is supported on the current version of pylint?" + ) + continue # drop unknown fields entirely, as before + conv = f"!{conversion}" if conversion else "" + spec = f":{format_spec}" if format_spec else "" + parts.append("{" + field_name + conv + spec + "}") + self._fixed_template = "".join(parts) + +- Rationale: parse() collapses '{{' and '}}' into '{' and '}' in literal_text. Re-escaping them when reconstructing ensures the final template is safe for str.format(), preventing spurious field parsing and the KeyError. 
+ +Reports: +- Files modified: none +- Commands run: none \ No newline at end of file diff --git a/benchmarks/results/swe_preds.json b/benchmarks/results/swe_preds.json new file mode 100644 index 0000000..a951d2e --- /dev/null +++ b/benchmarks/results/swe_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "psf__requests-2317", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/requests/sessions.py b/requests/sessions.py\nindex d701ff2..7015eba 100644\n--- a/requests/sessions.py\n+++ b/requests/sessions.py\n@@ -425,6 +425,8 @@ class Session(SessionRedirectMixin):\n If Tuple, ('cert', 'key') pair.\n \"\"\"\n \n+ if isinstance(method, bytes):\n+ method = method.decode('utf-8')\n method = builtin_str(method)\n \n # Create the Request.\n" + } +] \ No newline at end of file diff --git a/benchmarks/results/swe_pylint_3models_preds.json b/benchmarks/results/swe_pylint_3models_preds.json new file mode 100644 index 0000000..f5dc2b1 --- /dev/null +++ b/benchmarks/results/swe_pylint_3models_preds.json @@ -0,0 +1 @@ +[{"instance_id": "pylint-dev__pylint-7080", "model_name_or_path": "openswarm-gpt5", "model_patch": "diff --git a/pylint/lint/expand_modules.py b/pylint/lint/expand_modules.py\nindex 5cacc037..afb48daa 100644\n--- a/pylint/lint/expand_modules.py\n+++ b/pylint/lint/expand_modules.py\n@@ -53,10 +53,19 @@ def _is_ignored_file(\n ignore_list_paths_re: list[Pattern[str]],\n ) -> bool:\n basename = os.path.basename(element)\n+ # Match path-based ignores against both the provided path and its\n+ # representation relative to the current working directory. This ensures\n+ # that patterns like \"^src/gen/\" also work when absolute paths are used\n+ # internally during recursive discovery/expansion.\n+ try:\n+ rel_element = os.path.relpath(element)\n+ except Exception: # pragma: no cover - platform-specific edge cases (e.g. 
different drives)\n+ rel_element = element\n return (\n basename in ignore_list\n or _is_in_ignore_list_re(basename, ignore_list_re)\n or _is_in_ignore_list_re(element, ignore_list_paths_re)\n+ or _is_in_ignore_list_re(rel_element, ignore_list_paths_re)\n )\n \n \n@@ -142,7 +151,18 @@ def expand_modules(\n continue\n if _is_in_ignore_list_re(\n os.path.basename(subfilepath), ignore_list_re\n- ) or _is_in_ignore_list_re(subfilepath, ignore_list_paths_re):\n+ ):\n+ continue\n+ # Also check path-based ignore patterns against a relative path\n+ # so that patterns anchored at project roots (e.g. '^src/gen/')\n+ # work even when absolute paths are used internally.\n+ try:\n+ rel_subfilepath = os.path.relpath(subfilepath)\n+ except Exception: # pragma: no cover - platform-specific edge cases\n+ rel_subfilepath = subfilepath\n+ if _is_in_ignore_list_re(subfilepath, ignore_list_paths_re) or _is_in_ignore_list_re(\n+ rel_subfilepath, ignore_list_paths_re\n+ ):\n continue\n \n modpath = _modpath_from_file(\n"}, {"instance_id": "pylint-dev__pylint-7080", "model_name_or_path": "openswarm-gemini", "model_patch": "diff --git a/pylint/lint/pylinter.py b/pylint/lint/pylinter.py\nindex e2bbf061..31e235ae 100644\n--- a/pylint/lint/pylinter.py\n+++ b/pylint/lint/pylinter.py\n@@ -13,6 +13,7 @@ import sys\n import tokenize\n import traceback\n import warnings\n+import re\n from collections import defaultdict\n from collections.abc import Callable, Iterable, Iterator, Sequence\n from io import TextIOWrapper\n@@ -771,10 +772,11 @@ class PyLinter(\n \n def _expand_files(self, modules: Sequence[str]) -> list[ModuleDescriptionDict]:\n \"\"\"Get modules and errors from a list of modules and handle errors.\"\"\"\n+ compiled_ignore_patterns = [re.compile(r) for r in self.config.ignore_patterns]\n result, errors = expand_modules(\n modules,\n self.config.ignore,\n- self.config.ignore_patterns,\n+ compiled_ignore_patterns,\n self._ignore_paths,\n )\n for error in errors:\n"}, {"instance_id": 
"pylint-dev__pylint-7080", "model_name_or_path": "openswarm-glm", "model_patch": "diff --git a/pylint/lint/expand_modules.py b/pylint/lint/expand_modules.py\nindex 5cacc037..886bd87e 100644\n--- a/pylint/lint/expand_modules.py\n+++ b/pylint/lint/expand_modules.py\n@@ -51,12 +51,13 @@ def _is_ignored_file(\n ignore_list: list[str],\n ignore_list_re: list[Pattern[str]],\n ignore_list_paths_re: list[Pattern[str]],\n+ is_directory: bool = False,\n ) -> bool:\n basename = os.path.basename(element)\n return (\n basename in ignore_list\n or _is_in_ignore_list_re(basename, ignore_list_re)\n- or _is_in_ignore_list_re(element, ignore_list_paths_re)\n+ or _is_in_ignore_list_re(element if is_directory else basename, ignore_list_paths_re)\n )\n \n \ndiff --git a/pylint/lint/pylinter.py b/pylint/lint/pylinter.py\nindex e2bbf061..6320f0c9 100644\n--- a/pylint/lint/pylinter.py\n+++ b/pylint/lint/pylinter.py\n@@ -602,6 +602,7 @@ class PyLinter(\n self.config.ignore,\n self.config.ignore_patterns,\n self.config.ignore_paths,\n+ is_directory=True,\n ):\n skip_subtrees.append(root)\n continue\n"}] \ No newline at end of file diff --git a/benchmarks/results/swe_pylint_7993_hybrid_RESOLVED_report.json b/benchmarks/results/swe_pylint_7993_hybrid_RESOLVED_report.json new file mode 100644 index 0000000..14324d5 --- /dev/null +++ b/benchmarks/results/swe_pylint_7993_hybrid_RESOLVED_report.json @@ -0,0 +1,23 @@ +{ + "total_instances": 1, + "submitted_instances": 1, + "completed_instances": 1, + "resolved_instances": 1, + "unresolved_instances": 0, + "empty_patch_instances": 0, + "error_instances": 0, + "completed_ids": [ + "pylint-dev__pylint-7993" + ], + "incomplete_ids": [], + "empty_patch_ids": [], + "submitted_ids": [ + "pylint-dev__pylint-7993" + ], + "resolved_ids": [ + "pylint-dev__pylint-7993" + ], + "unresolved_ids": [], + "error_ids": [], + "schema_version": 2 +} diff --git a/benchmarks/results/swe_pylint_7993_hybrid_retry_preds.json 
b/benchmarks/results/swe_pylint_7993_hybrid_retry_preds.json new file mode 100644 index 0000000..570710a --- /dev/null +++ b/benchmarks/results/swe_pylint_7993_hybrid_retry_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "pylint-dev__pylint-7993", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/reporters/text.py b/pylint/reporters/text.py\nindex 29bd46798..e1c59a059 100644\n--- a/pylint/reporters/text.py\n+++ b/pylint/reporters/text.py\n@@ -12,6 +12,7 @@ from __future__ import annotations\n \n import os\n import re\n+import string\n import sys\n import warnings\n from dataclasses import asdict, fields\n@@ -175,15 +176,24 @@ class TextReporter(BaseReporter):\n self._template = template\n \n # Check to see if all parameters in the template are attributes of the Message\n- arguments = re.findall(r\"\\{(.+?)(:.*)?\\}\", template)\n- for argument in arguments:\n- if argument[0] not in MESSAGE_FIELDS:\n+ formatter = string.Formatter()\n+ parts = []\n+ for literal, field_name, format_spec, conversion in formatter.parse(template):\n+ parts.append(literal)\n+ if field_name is None:\n+ # No field here (also covers escaped braces which are returned as literals)\n+ continue\n+ if field_name not in MESSAGE_FIELDS:\n warnings.warn(\n- f\"Don't recognize the argument '{argument[0]}' in the --msg-template. \"\n+ f\"Don't recognize the argument '{field_name}' in the --msg-template. 
\"\n \"Are you sure it is supported on the current version of pylint?\"\n )\n- template = re.sub(r\"\\{\" + argument[0] + r\"(:.*?)?\\}\", \"\", template)\n- self._fixed_template = template\n+ # Drop unknown fields by inserting nothing\n+ continue\n+ conv = f\"!{conversion}\" if conversion else \"\"\n+ spec = f\":{format_spec}\" if format_spec else \"\"\n+ parts.append(\"{\" + field_name + conv + spec + \"}\")\n+ self._fixed_template = \"\".join(parts)\n \n def write_message(self, msg: Message) -> None:\n \"\"\"Convenience method to write a formatted message with class default\n" + } +] \ No newline at end of file diff --git a/benchmarks/results/swe_pylint_7993_hybrid_v3_preds.json b/benchmarks/results/swe_pylint_7993_hybrid_v3_preds.json new file mode 100644 index 0000000..e1146b2 --- /dev/null +++ b/benchmarks/results/swe_pylint_7993_hybrid_v3_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "pylint-dev__pylint-7993", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/reporters/text.py b/pylint/reporters/text.py\nindex 29bd46798..5a85012e2 100644\n--- a/pylint/reporters/text.py\n+++ b/pylint/reporters/text.py\n@@ -14,6 +14,7 @@ import os\n import re\n import sys\n import warnings\n+import string\n from dataclasses import asdict, fields\n from typing import TYPE_CHECKING, Dict, NamedTuple, Optional, TextIO, cast, overload\n \n@@ -174,16 +175,19 @@ class TextReporter(BaseReporter):\n # Set template to the currently selected template\n self._template = template\n \n- # Check to see if all parameters in the template are attributes of the Message\n- arguments = re.findall(r\"\\{(.+?)(:.*)?\\}\", template)\n- for argument in arguments:\n- if argument[0] not in MESSAGE_FIELDS:\n- warnings.warn(\n- f\"Don't recognize the argument '{argument[0]}' in the --msg-template. 
\"\n- \"Are you sure it is supported on the current version of pylint?\"\n- )\n- template = re.sub(r\"\\{\" + argument[0] + r\"(:.*?)?\\}\", \"\", template)\n- self._fixed_template = template\n+ formatter = string.Formatter()\n+ parts = []\n+ for literal, field_name, format_spec, conversion in formatter.parse(template):\n+ parts.append(literal)\n+ if field_name is None:\n+ continue # no field here (also covers escaped braces which are returned as literals)\n+ if field_name not in MESSAGE_FIELDS:\n+ warnings.warn(f\"Don't recognize the argument '{field_name}' in the --msg-template. Are you sure it is supported on the current version of pylint?\")\n+ continue # drop unknown fields by inserting nothing\n+ conv = f\"!{conversion}\" if conversion else \"\"\n+ spec = f\":{format_spec}\" if format_spec else \"\"\n+ parts.append(\"{\" + field_name + conv + spec + \"}\")\n+ self._fixed_template = \"\".join(parts)\n \n def write_message(self, msg: Message) -> None:\n \"\"\"Convenience method to write a formatted message with class default\n" + } +] \ No newline at end of file diff --git a/benchmarks/results/swe_pylint_7993_hybrid_v4_preds.json b/benchmarks/results/swe_pylint_7993_hybrid_v4_preds.json new file mode 100644 index 0000000..ffb1a5e --- /dev/null +++ b/benchmarks/results/swe_pylint_7993_hybrid_v4_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "pylint-dev__pylint-7993", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/reporters/text.py b/pylint/reporters/text.py\nindex 29bd46798..fe193f9a1 100644\n--- a/pylint/reporters/text.py\n+++ b/pylint/reporters/text.py\n@@ -12,6 +12,7 @@ from __future__ import annotations\n \n import os\n import re\n+import string\n import sys\n import warnings\n from dataclasses import asdict, fields\n@@ -174,16 +175,21 @@ class TextReporter(BaseReporter):\n # Set template to the currently selected template\n self._template = template\n \n- # Check to see if all parameters in the template are attributes of the 
Message\n- arguments = re.findall(r\"\\{(.+?)(:.*)?\\}\", template)\n- for argument in arguments:\n- if argument[0] not in MESSAGE_FIELDS:\n+ formatter = string.Formatter()\n+ parts = []\n+ for literal, field_name, format_spec, conversion in formatter.parse(template):\n+ parts.append(literal)\n+ if field_name is None:\n+ continue # no field here (also covers escaped braces which are returned as literals)\n+ if field_name not in MESSAGE_FIELDS:\n warnings.warn(\n- f\"Don't recognize the argument '{argument[0]}' in the --msg-template. \"\n- \"Are you sure it is supported on the current version of pylint?\"\n+ f\"Don't recognize the argument '{field_name}' in the --msg-template. Are you sure it is supported on the current version of pylint?\"\n )\n- template = re.sub(r\"\\{\" + argument[0] + r\"(:.*?)?\\}\", \"\", template)\n- self._fixed_template = template\n+ continue # drop unknown fields by inserting nothing\n+ conv = f\"!{conversion}\" if conversion else \"\"\n+ spec = f\":{format_spec}\" if format_spec else \"\"\n+ parts.append(\"{\" + field_name + conv + spec + \"}\")\n+ self._fixed_template = \"\".join(parts)\n \n def write_message(self, msg: Message) -> None:\n \"\"\"Convenience method to write a formatted message with class default\n" + } +] \ No newline at end of file diff --git a/benchmarks/results/swe_pylint_7993_hybrid_v5_preds.json b/benchmarks/results/swe_pylint_7993_hybrid_v5_preds.json new file mode 100644 index 0000000..501078a --- /dev/null +++ b/benchmarks/results/swe_pylint_7993_hybrid_v5_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "pylint-dev__pylint-7993", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/reporters/text.py b/pylint/reporters/text.py\nindex 29bd46798..ff823ab27 100644\n--- a/pylint/reporters/text.py\n+++ b/pylint/reporters/text.py\n@@ -11,6 +11,7 @@\n from __future__ import annotations\n \n import os\n+import string\n import re\n import sys\n import warnings\n@@ -175,15 +176,21 @@ class 
TextReporter(BaseReporter):\n self._template = template\n \n # Check to see if all parameters in the template are attributes of the Message\n- arguments = re.findall(r\"\\{(.+?)(:.*)?\\}\", template)\n- for argument in arguments:\n- if argument[0] not in MESSAGE_FIELDS:\n+ formatter = string.Formatter()\n+ parts = []\n+ for literal, field_name, format_spec, conversion in formatter.parse(template):\n+ parts.append(literal)\n+ if field_name is None:\n+ continue # no field here (also covers escaped braces which are returned as literals)\n+ if field_name not in MESSAGE_FIELDS:\n warnings.warn(\n- f\"Don't recognize the argument '{argument[0]}' in the --msg-template. \"\n- \"Are you sure it is supported on the current version of pylint?\"\n+ f\"Don't recognize the argument '{field_name}' in the --msg-template. Are you sure it is supported on the current version of pylint?\"\n )\n- template = re.sub(r\"\\{\" + argument[0] + r\"(:.*?)?\\}\", \"\", template)\n- self._fixed_template = template\n+ continue # drop unknown fields by inserting nothing\n+ conv = f\"!{conversion}\" if conversion else \"\"\n+ spec = f\":{format_spec}\" if format_spec else \"\"\n+ parts.append(\"{\" + field_name + conv + spec + \"}\")\n+ self._fixed_template = \"\".join(parts)\n \n def write_message(self, msg: Message) -> None:\n \"\"\"Convenience method to write a formatted message with class default\n" + } +] \ No newline at end of file diff --git a/benchmarks/results/swe_pylint_7993_hybrid_v6_preds.json b/benchmarks/results/swe_pylint_7993_hybrid_v6_preds.json new file mode 100644 index 0000000..e6eaf23 --- /dev/null +++ b/benchmarks/results/swe_pylint_7993_hybrid_v6_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "pylint-dev__pylint-7993", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/reporters/text.py b/pylint/reporters/text.py\nindex 29bd46798..4e73ee871 100644\n--- a/pylint/reporters/text.py\n+++ b/pylint/reporters/text.py\n@@ -174,16 +174,24 @@ class 
TextReporter(BaseReporter):\n # Set template to the currently selected template\n self._template = template\n \n- # Check to see if all parameters in the template are attributes of the Message\n- arguments = re.findall(r\"\\{(.+?)(:.*)?\\}\", template)\n- for argument in arguments:\n- if argument[0] not in MESSAGE_FIELDS:\n+ formatter = string.Formatter()\n+ parts: list[str] = []\n+ for literal, field_name, format_spec, conversion in formatter.parse(template):\n+ # Re-escape braces in literals so a later .format() won’t re-interpret them\n+ if literal:\n+ parts.append(literal.replace(\"{\", \"{{\").replace(\"}\", \"}}\"))\n+ if field_name is None:\n+ continue # no field here (covers escaped braces)\n+ if field_name not in MESSAGE_FIELDS:\n warnings.warn(\n- f\"Don't recognize the argument '{argument[0]}' in the --msg-template. \"\n+ f\"Don't recognize the argument '{field_name}' in the --msg-template. \"\n \"Are you sure it is supported on the current version of pylint?\"\n )\n- template = re.sub(r\"\\{\" + argument[0] + r\"(:.*?)?\\}\", \"\", template)\n- self._fixed_template = template\n+ continue # drop unknown fields entirely, as before\n+ conv = f\"!{conversion}\" if conversion else \"\"\n+ spec = f\":{format_spec}\" if format_spec else \"\"\n+ parts.append(\"{\" + field_name + conv + spec + \"}\")\n+ self._fixed_template = \"\".join(parts)\n \n def write_message(self, msg: Message) -> None:\n \"\"\"Convenience method to write a formatted message with class default\n" + } +] \ No newline at end of file diff --git a/benchmarks/results/swe_pylint_7993_hybrid_v7_preds.json b/benchmarks/results/swe_pylint_7993_hybrid_v7_preds.json new file mode 100644 index 0000000..503ad61 --- /dev/null +++ b/benchmarks/results/swe_pylint_7993_hybrid_v7_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "pylint-dev__pylint-7993", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/reporters/text.py b/pylint/reporters/text.py\nindex 29bd46798..14595ad44 
100644\n--- a/pylint/reporters/text.py\n+++ b/pylint/reporters/text.py\n@@ -12,6 +12,7 @@ from __future__ import annotations\n \n import os\n import re\n+import string\n import sys\n import warnings\n from dataclasses import asdict, fields\n@@ -175,15 +176,24 @@ class TextReporter(BaseReporter):\n self._template = template\n \n # Check to see if all parameters in the template are attributes of the Message\n- arguments = re.findall(r\"\\{(.+?)(:.*)?\\}\", template)\n- for argument in arguments:\n- if argument[0] not in MESSAGE_FIELDS:\n+ formatter = string.Formatter()\n+ parts: list[str] = []\n+ for literal, field_name, format_spec, conversion in formatter.parse(template):\n+ # Re-escape braces in literals so a later .format() won't re-interpret them\n+ if literal:\n+ parts.append(literal.replace(\"{\", \"{{\").replace(\"}\", \"}}\"))\n+ if field_name is None:\n+ continue # no field here (covers escaped braces)\n+ if field_name not in MESSAGE_FIELDS:\n warnings.warn(\n- f\"Don't recognize the argument '{argument[0]}' in the --msg-template. \"\n+ f\"Don't recognize the argument '{field_name}' in the --msg-template. 
\"\n \"Are you sure it is supported on the current version of pylint?\"\n )\n- template = re.sub(r\"\\{\" + argument[0] + r\"(:.*?)?\\}\", \"\", template)\n- self._fixed_template = template\n+ continue # drop unknown fields entirely\n+ conv = f\"!{conversion}\" if conversion else \"\"\n+ spec = f\":{format_spec}\" if format_spec else \"\"\n+ parts.append(\"{\" + field_name + conv + spec + \"}\")\n+ self._fixed_template = \"\".join(parts)\n \n def write_message(self, msg: Message) -> None:\n \"\"\"Convenience method to write a formatted message with class default\ndiff --git a/tests/reporters/unittest_reporting.py b/tests/reporters/unittest_reporting.py\nindex 37f3e5fd9..9c853dabf 100644\n--- a/tests/reporters/unittest_reporting.py\n+++ b/tests/reporters/unittest_reporting.py\n@@ -375,6 +375,23 @@ def test_multi_reporter_independant_messages() -> None:\n ), \"Message object should not be changed by reporters.\"\n \n \n+def test_template_option_with_header(linter) -> None:\n+ \"\"\"Test the msg-template option with custom braces (e.g. 
JSON-like output).\"\"\"\n+ output = StringIO()\n+ linter.reporter.out = output\n+ linter.config.msg_template = '{{ \"Category\": \"{category}\" }}'\n+ linter.open()\n+ linter.set_current_module(\"my_mod\")\n+ linter.add_message(\"C0301\", line=1, args=(1, 2))\n+ linter.add_message(\n+ \"line-too-long\", line=2, end_lineno=2, end_col_offset=4, args=(3, 4)\n+ )\n+\n+ out_lines = output.getvalue().split(\"\\n\")\n+ assert out_lines[1] == '{ \"Category\": \"convention\" }'\n+ assert out_lines[2] == '{ \"Category\": \"convention\" }'\n+\n+\n def test_display_results_is_renamed() -> None:\n class CustomReporter(TextReporter):\n def _display(self, layout: Section) -> None:\n" + } +] \ No newline at end of file diff --git a/benchmarks/results/swe_pylint_RESOLVED_report.json b/benchmarks/results/swe_pylint_RESOLVED_report.json new file mode 100644 index 0000000..81b626d --- /dev/null +++ b/benchmarks/results/swe_pylint_RESOLVED_report.json @@ -0,0 +1,23 @@ +{ + "total_instances": 1, + "submitted_instances": 1, + "completed_instances": 1, + "resolved_instances": 1, + "unresolved_instances": 0, + "empty_patch_instances": 0, + "error_instances": 0, + "completed_ids": [ + "pylint-dev__pylint-7080" + ], + "incomplete_ids": [], + "empty_patch_ids": [], + "submitted_ids": [ + "pylint-dev__pylint-7080" + ], + "resolved_ids": [ + "pylint-dev__pylint-7080" + ], + "unresolved_ids": [], + "error_ids": [], + "schema_version": 2 +} diff --git a/benchmarks/results/swe_pylint_gemini_preds.json b/benchmarks/results/swe_pylint_gemini_preds.json new file mode 100644 index 0000000..d44b78a --- /dev/null +++ b/benchmarks/results/swe_pylint_gemini_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "pylint-dev__pylint-7080", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/lint/pylinter.py b/pylint/lint/pylinter.py\nindex e2bbf061..31e235ae 100644\n--- a/pylint/lint/pylinter.py\n+++ b/pylint/lint/pylinter.py\n@@ -13,6 +13,7 @@ import sys\n import tokenize\n import 
traceback\n import warnings\n+import re\n from collections import defaultdict\n from collections.abc import Callable, Iterable, Iterator, Sequence\n from io import TextIOWrapper\n@@ -771,10 +772,11 @@ class PyLinter(\n \n def _expand_files(self, modules: Sequence[str]) -> list[ModuleDescriptionDict]:\n \"\"\"Get modules and errors from a list of modules and handle errors.\"\"\"\n+ compiled_ignore_patterns = [re.compile(r) for r in self.config.ignore_patterns]\n result, errors = expand_modules(\n modules,\n self.config.ignore,\n- self.config.ignore_patterns,\n+ compiled_ignore_patterns,\n self._ignore_paths,\n )\n for error in errors:\n" + } +] \ No newline at end of file diff --git a/benchmarks/results/swe_pylint_glm_preds.json b/benchmarks/results/swe_pylint_glm_preds.json new file mode 100644 index 0000000..9b3427d --- /dev/null +++ b/benchmarks/results/swe_pylint_glm_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "pylint-dev__pylint-7080", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/lint/expand_modules.py b/pylint/lint/expand_modules.py\nindex 5cacc037..886bd87e 100644\n--- a/pylint/lint/expand_modules.py\n+++ b/pylint/lint/expand_modules.py\n@@ -51,12 +51,13 @@ def _is_ignored_file(\n ignore_list: list[str],\n ignore_list_re: list[Pattern[str]],\n ignore_list_paths_re: list[Pattern[str]],\n+ is_directory: bool = False,\n ) -> bool:\n basename = os.path.basename(element)\n return (\n basename in ignore_list\n or _is_in_ignore_list_re(basename, ignore_list_re)\n- or _is_in_ignore_list_re(element, ignore_list_paths_re)\n+ or _is_in_ignore_list_re(element if is_directory else basename, ignore_list_paths_re)\n )\n \n \ndiff --git a/pylint/lint/pylinter.py b/pylint/lint/pylinter.py\nindex e2bbf061..6320f0c9 100644\n--- a/pylint/lint/pylinter.py\n+++ b/pylint/lint/pylinter.py\n@@ -602,6 +602,7 @@ class PyLinter(\n self.config.ignore,\n self.config.ignore_patterns,\n self.config.ignore_paths,\n+ is_directory=True,\n ):\n 
skip_subtrees.append(root)\n continue\n" + } +] \ No newline at end of file diff --git a/benchmarks/results/swe_pylint_gpt5_preds.json b/benchmarks/results/swe_pylint_gpt5_preds.json new file mode 100644 index 0000000..fd4a0c0 --- /dev/null +++ b/benchmarks/results/swe_pylint_gpt5_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "pylint-dev__pylint-7080", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/lint/expand_modules.py b/pylint/lint/expand_modules.py\nindex 5cacc037..afb48daa 100644\n--- a/pylint/lint/expand_modules.py\n+++ b/pylint/lint/expand_modules.py\n@@ -53,10 +53,19 @@ def _is_ignored_file(\n ignore_list_paths_re: list[Pattern[str]],\n ) -> bool:\n basename = os.path.basename(element)\n+ # Match path-based ignores against both the provided path and its\n+ # representation relative to the current working directory. This ensures\n+ # that patterns like \"^src/gen/\" also work when absolute paths are used\n+ # internally during recursive discovery/expansion.\n+ try:\n+ rel_element = os.path.relpath(element)\n+ except Exception: # pragma: no cover - platform-specific edge cases (e.g. different drives)\n+ rel_element = element\n return (\n basename in ignore_list\n or _is_in_ignore_list_re(basename, ignore_list_re)\n or _is_in_ignore_list_re(element, ignore_list_paths_re)\n+ or _is_in_ignore_list_re(rel_element, ignore_list_paths_re)\n )\n \n \n@@ -142,7 +151,18 @@ def expand_modules(\n continue\n if _is_in_ignore_list_re(\n os.path.basename(subfilepath), ignore_list_re\n- ) or _is_in_ignore_list_re(subfilepath, ignore_list_paths_re):\n+ ):\n+ continue\n+ # Also check path-based ignore patterns against a relative path\n+ # so that patterns anchored at project roots (e.g. 
'^src/gen/')\n+ # work even when absolute paths are used internally.\n+ try:\n+ rel_subfilepath = os.path.relpath(subfilepath)\n+ except Exception: # pragma: no cover - platform-specific edge cases\n+ rel_subfilepath = subfilepath\n+ if _is_in_ignore_list_re(subfilepath, ignore_list_paths_re) or _is_in_ignore_list_re(\n+ rel_subfilepath, ignore_list_paths_re\n+ ):\n continue\n \n modpath = _modpath_from_file(\n" + } +] \ No newline at end of file diff --git a/benchmarks/results/swe_pylint_hybrid_RESOLVED_preds.json b/benchmarks/results/swe_pylint_hybrid_RESOLVED_preds.json new file mode 100644 index 0000000..9530395 --- /dev/null +++ b/benchmarks/results/swe_pylint_hybrid_RESOLVED_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "pylint-dev__pylint-7080", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/lint/expand_modules.py b/pylint/lint/expand_modules.py\nindex 5cacc037..35f44d34 100644\n--- a/pylint/lint/expand_modules.py\n+++ b/pylint/lint/expand_modules.py\n@@ -60,6 +60,14 @@ def _is_ignored_file(\n )\n \n \n+def _normalize_path(path: str) -> str:\n+ \"\"\"Normalize a path by stripping leading './' or '.\\\\'.\"\"\"\n+ if path.startswith(\"./\"):\n+ return path[2:]\n+ if path.startswith(\".\\\\\"):\n+ return path[3:]\n+ return path\n+\n def expand_modules(\n files_or_modules: Sequence[str],\n ignore_list: list[str],\n@@ -74,6 +82,7 @@ def expand_modules(\n path = sys.path.copy()\n \n for something in files_or_modules:\n+ something = _normalize_path(something)\n basename = os.path.basename(something)\n if _is_ignored_file(\n something, ignore_list, ignore_list_re, ignore_list_paths_re\ndiff --git a/pylint/lint/pylinter.py b/pylint/lint/pylinter.py\nindex e2bbf061..4b123a4b 100644\n--- a/pylint/lint/pylinter.py\n+++ b/pylint/lint/pylinter.py\n@@ -582,17 +582,27 @@ class PyLinter(\n if not msg.may_be_emitted():\n self._msgs_state[msg.msgid] = False\n \n+ def _normalize_path(self, path: str) -> str:\n+ \"\"\"Normalize a path by 
stripping leading './' or '.\\\\'.\"\"\"\n+ if path.startswith(\"./\"):\n+ return path[2:]\n+ if path.startswith(\".\\\\\"):\n+ return path[3:]\n+ return path\n+\n def _discover_files(self, files_or_modules: Sequence[str]) -> Iterator[str]:\n \"\"\"Discover python modules and packages in sub-directory.\n \n Returns iterator of paths to discovered modules and packages.\n \"\"\"\n for something in files_or_modules:\n+ something = self._normalize_path(something)\n if os.path.isdir(something) and not os.path.isfile(\n os.path.join(something, \"__init__.py\")\n ):\n skip_subtrees: list[str] = []\n for root, _, files in os.walk(something):\n+ root = self._normalize_path(root)\n if any(root.startswith(s) for s in skip_subtrees):\n # Skip subtree of already discovered package.\n continue\n@@ -611,7 +621,7 @@ class PyLinter(\n yield root\n else:\n yield from (\n- os.path.join(root, file)\n+ self._normalize_path(os.path.join(root, file))\n for file in files\n if file.endswith(\".py\")\n )\n@@ -1011,7 +1021,7 @@ class PyLinter(\n self.config.extension_pkg_whitelist\n )\n self.stats.reset_message_count()\n- self._ignore_paths = self.linter.config.ignore_paths\n+ self._ignore_paths = self.config.ignore_paths\n \n def generate_reports(self) -> int | None:\n \"\"\"Close the whole package /module, it's time to make reports !\n" + } +] \ No newline at end of file diff --git a/benchmarks/results/swe_pylint_hybrid_RESOLVED_report.json b/benchmarks/results/swe_pylint_hybrid_RESOLVED_report.json new file mode 100644 index 0000000..81b626d --- /dev/null +++ b/benchmarks/results/swe_pylint_hybrid_RESOLVED_report.json @@ -0,0 +1,23 @@ +{ + "total_instances": 1, + "submitted_instances": 1, + "completed_instances": 1, + "resolved_instances": 1, + "unresolved_instances": 0, + "empty_patch_instances": 0, + "error_instances": 0, + "completed_ids": [ + "pylint-dev__pylint-7080" + ], + "incomplete_ids": [], + "empty_patch_ids": [], + "submitted_ids": [ + "pylint-dev__pylint-7080" + ], + 
"resolved_ids": [ + "pylint-dev__pylint-7080" + ], + "unresolved_ids": [], + "error_ids": [], + "schema_version": 2 +} diff --git a/benchmarks/results/swe_pylint_hybrid_deepseek_preds.json b/benchmarks/results/swe_pylint_hybrid_deepseek_preds.json new file mode 100644 index 0000000..0e69bb2 --- /dev/null +++ b/benchmarks/results/swe_pylint_hybrid_deepseek_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "pylint-dev__pylint-7080", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/lint/expand_modules.py b/pylint/lint/expand_modules.py\nindex 5cacc037..de2053cf 100644\n--- a/pylint/lint/expand_modules.py\n+++ b/pylint/lint/expand_modules.py\n@@ -74,6 +74,11 @@ def expand_modules(\n path = sys.path.copy()\n \n for something in files_or_modules:\n+ # Normalize paths to remove leading \"./\" or \".\\\\\" so that\n+ # user-provided ignore-paths patterns (e.g. \"^src/...\") match\n+ # consistently when running from the current directory.\n+ if something.startswith(\"./\") or something.startswith(\".\\\\\"):\n+ something = something[2:]\n basename = os.path.basename(something)\n if _is_ignored_file(\n something, ignore_list, ignore_list_re, ignore_list_paths_re\ndiff --git a/pylint/lint/pylinter.py b/pylint/lint/pylinter.py\nindex e2bbf061..da742338 100644\n--- a/pylint/lint/pylinter.py\n+++ b/pylint/lint/pylinter.py\n@@ -593,6 +593,11 @@ class PyLinter(\n ):\n skip_subtrees: list[str] = []\n for root, _, files in os.walk(something):\n+ # Normalize paths to remove leading \"./\" or \".\\\\\" so that\n+ # user-provided ignore-paths patterns (e.g. 
\"^src/...\") match\n+ # consistently when running from the current directory.\n+ if root.startswith(\"./\") or root.startswith(\".\\\\\"):\n+ root = root[2:]\n if any(root.startswith(s) for s in skip_subtrees):\n # Skip subtree of already discovered package.\n continue\n@@ -1011,7 +1016,7 @@ class PyLinter(\n self.config.extension_pkg_whitelist\n )\n self.stats.reset_message_count()\n- self._ignore_paths = self.linter.config.ignore_paths\n+ self._ignore_paths = self.config.ignore_paths\n \n def generate_reports(self) -> int | None:\n \"\"\"Close the whole package /module, it's time to make reports !\n" + } +] \ No newline at end of file diff --git a/benchmarks/results/swe_pylint_hybrid_deepseek_report.json b/benchmarks/results/swe_pylint_hybrid_deepseek_report.json new file mode 100644 index 0000000..81b626d --- /dev/null +++ b/benchmarks/results/swe_pylint_hybrid_deepseek_report.json @@ -0,0 +1,23 @@ +{ + "total_instances": 1, + "submitted_instances": 1, + "completed_instances": 1, + "resolved_instances": 1, + "unresolved_instances": 0, + "empty_patch_instances": 0, + "error_instances": 0, + "completed_ids": [ + "pylint-dev__pylint-7080" + ], + "incomplete_ids": [], + "empty_patch_ids": [], + "submitted_ids": [ + "pylint-dev__pylint-7080" + ], + "resolved_ids": [ + "pylint-dev__pylint-7080" + ], + "unresolved_ids": [], + "error_ids": [], + "schema_version": 2 +} diff --git a/benchmarks/results/swe_pylint_hybrid_diagnosis.txt b/benchmarks/results/swe_pylint_hybrid_diagnosis.txt new file mode 100644 index 0000000..db44e09 --- /dev/null +++ b/benchmarks/results/swe_pylint_hybrid_diagnosis.txt @@ -0,0 +1,54 @@ +I cannot run more tools due to the tool budget being exhausted. 
I'll proceed with the final answer based on the current analysis.FILES TOUCHED +- None + +COMMANDS RUN +- None + +ROOT CAUSE +- The recursive walk and the “expand modules” flow both support ignore-paths, but there’s a mismatch in how the paths are formed and when filtering is applied, which causes patterns to be missed in common scenarios. +- Specifically: + 1) In PyLinter._discover_files (pylint/lint/pylinter.py), directories are filtered using _is_ignored_file(root, …, ignore_paths). This checks the directory path (e.g. src\gen), not its contents. If the user has a regex such as ^src/gen/.*$ (typical: match all files under src/gen), it does not match the directory itself (src\gen) and the subtree is not pruned. + - This is OK only if the second-stage filtering catches the files themselves. + 2) After recursive discovery, PyLinter._iterate_file_descrs -> _expand_files (pylint/lint/pylinter.py -> pylint/lint/expand_modules.py) applies ignore-paths again per element. That should normally catch files like src\gen\about.py against ^src/gen/.*$. + 3) The regression appears when the starting point is the current directory “.” with --recursive=y. os.walk('.') yields roots like '.' and './src', and files like './src/gen/about.py'. These have a “./” prefix, so a user regex such as ^src/gen/.*$ (based on docs and examples) won’t match './src/gen/about.py' (leading ‘./’ prevents ^src/... from matching). The ignore-paths option is documented to work with normalized paths (Posix/Windows), but it does not handle this “./” prefix case. As a result, ignore-paths are effectively ignored when running recursively from the current directory, which is exactly what the failing test (TestRunTC::test_ignore_path_recursive_current_dir) exercises. +- Secondary issue: In PyLinter.open(), _ignore_paths is populated from self.linter.config.ignore_paths instead of self.config.ignore_paths (pylint/lint/pylinter.py:1014). 
While PyLinter usually has self.linter == self for checkers, this is fragile and inconsistent with other uses of self.config. It risks _ignore_paths being stale/incorrect in some contexts. + +FIX PLAN +1) Normalize discovered file paths to remove a leading “./” so that user-provided ignore-paths which anchor at the start of the path (e.g. ^src/...) will match consistently when running from the current directory. + - File: pylint/lint/pylinter.py + - Function: _discover_files + - Change: Before yielding roots and file paths from os.walk, normalize the produced paths by stripping a leading “./” and “.\\” (Windows) to a consistent relative form. + - Pseudocode modification inside the os.walk loop: + - After computing ‘root’, add: + - if root.startswith("./"): root = root[2:] + - if root.startswith(".\\"): root = root[2:] + - When yielding files, ensure you normalize the join similarly: + - path = os.path.join(root, file) + - if path.startswith("./") or path.startswith(".\\"): strip the prefix + - This ensures elements passed to ignore matching start with the project-relative directory (e.g. src/...) instead of ./src/..., letting ^src/... patterns match. + - Also consider normalizing input arguments similarly at the beginning of _discover_files (for “something”): if something == "." or something starts with "./" or ".\\", strip the prefix before passing to os.walk. + +2) Ensure _ignore_paths is read from the correct config namespace consistently. + - File: pylint/lint/pylinter.py + - Function: open + - Change line assigning _ignore_paths: + - Replace: self._ignore_paths = self.linter.config.ignore_paths + - With: self._ignore_paths = self.config.ignore_paths + - This removes reliance on self.linter and keeps behavior consistent with other references (e.g. self.config.ignore_paths used in _discover_files). 
+ +3) Optional safety enhancement: In expand_modules (pylint/lint/expand_modules.py), when evaluating whether to ignore an element, we currently use the provided element string as-is. To align behavior across both flows, we can normalize a leading “./” (or “.\\”) before matching, in case other call paths also feed elements with such prefixes. + - File: pylint/lint/expand_modules.py + - Function: expand_modules + - Change at the top of the for-loop over files_or_modules: + - Before computing basename and calling _is_ignored_file, add: + - if something.startswith("./") or something.startswith(".\\"): something = something[2:] + - This is defensive and ensures consistency even if other entrypoints yield “./”-prefixed paths. + +Rationale +- This directly addresses why ignore-paths work when the target path is “src/...” but fail when the run is initiated from “.” with --recursive=y. +- It keeps ignore-pattern semantics intact (still matching start-of-string), but makes the internal paths consistent with user expectations and documentation. +- Correcting self._ignore_paths source prevents subtle bugs and matches how other config values are retrieved. + +Uncertainties +- Path normalization and platform behavior: While stripping “./” and “.\\” is safe for relative paths, ensure there’s no case where absolute paths are fed (those should remain untouched). The proposed checks only strip when the string literal begins with those prefixes. +- Tests depend on relative-path matching; confirm no tests assert that “./” should be preserved in diagnostics. We’re only normalizing for matching/selection, not for reporting, so this should be fine. 
\ No newline at end of file diff --git a/benchmarks/results/swe_pylint_hybrid_glm_preds.json b/benchmarks/results/swe_pylint_hybrid_glm_preds.json new file mode 100644 index 0000000..fe56e77 --- /dev/null +++ b/benchmarks/results/swe_pylint_hybrid_glm_preds.json @@ -0,0 +1,7 @@ +[ + { + "instance_id": "pylint-dev__pylint-7080", + "model_name_or_path": "openswarm", + "model_patch": "" + } +] \ No newline at end of file diff --git a/benchmarks/results/swe_pylint_new2_hybrid_preds.json b/benchmarks/results/swe_pylint_new2_hybrid_preds.json new file mode 100644 index 0000000..6225e11 --- /dev/null +++ b/benchmarks/results/swe_pylint_new2_hybrid_preds.json @@ -0,0 +1,12 @@ +[ + { + "instance_id": "pylint-dev__pylint-5859", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/checkers/misc.py b/pylint/checkers/misc.py\nindex 69149e61a..28978365f 100644\n--- a/pylint/checkers/misc.py\n+++ b/pylint/checkers/misc.py\n@@ -121,9 +121,9 @@ class EncodingChecker(BaseChecker):\n \n notes = \"|\".join(re.escape(note) for note in self.config.notes)\n if self.config.notes_rgx:\n- regex_string = rf\"#\\s*({notes}|{self.config.notes_rgx})\\b\"\n+ regex_string = rf\"#\\s*({notes}|{self.config.notes_rgx})(?=\\W|$)\"\n else:\n- regex_string = rf\"#\\s*({notes})\\b\"\n+ regex_string = rf\"#\\s*({notes})(?=\\W|$)\"\n \n self._fixme_pattern = re.compile(regex_string, re.I)\n \n" + }, + { + "instance_id": "pylint-dev__pylint-7993", + "model_name_or_path": "openswarm", + "model_patch": "diff --git a/pylint/reporters/text.py b/pylint/reporters/text.py\nindex 29bd46798..9f8b9c017 100644\n--- a/pylint/reporters/text.py\n+++ b/pylint/reporters/text.py\n@@ -14,6 +14,7 @@ import os\n import re\n import sys\n import warnings\n+import string\n from dataclasses import asdict, fields\n from typing import TYPE_CHECKING, Dict, NamedTuple, Optional, TextIO, cast, overload\n \n@@ -175,15 +176,19 @@ class TextReporter(BaseReporter):\n self._template = template\n \n # Check to see if 
all parameters in the template are attributes of the Message\n- arguments = re.findall(r\"\\{(.+?)(:.*)?\\}\", template)\n- for argument in arguments:\n- if argument[0] not in MESSAGE_FIELDS:\n- warnings.warn(\n- f\"Don't recognize the argument '{argument[0]}' in the --msg-template. \"\n- \"Are you sure it is supported on the current version of pylint?\"\n- )\n- template = re.sub(r\"\\{\" + argument[0] + r\"(:.*?)?\\}\", \"\", template)\n- self._fixed_template = template\n+ formatter = string.Formatter()\n+ parts = []\n+ for literal, field_name, format_spec, conversion in formatter.parse(template):\n+ parts.append(literal)\n+ if field_name is None:\n+ continue # no field here (also covers escaped braces which are returned as literals)\n+ if field_name not in MESSAGE_FIELDS:\n+ warnings.warn(f\"Don't recognize the argument '{field_name}' in the --msg-template. Are you sure it is supported on the current version of pylint?\")\n+ continue # drop unknown fields by inserting nothing\n+ conv = f\"!{conversion}\" if conversion else \"\"\n+ spec = f\":{format_spec}\" if format_spec else \"\"\n+ parts.append(\"{\" + field_name + conv + spec + \"}\")\n+ self._fixed_template = \"\".join(parts)\n \n def write_message(self, msg: Message) -> None:\n \"\"\"Convenience method to write a formatted message with class default\n" + } +] \ No newline at end of file diff --git a/benchmarks/results/swe_pylint_new2_hybrid_report.json b/benchmarks/results/swe_pylint_new2_hybrid_report.json new file mode 100644 index 0000000..cc642f5 --- /dev/null +++ b/benchmarks/results/swe_pylint_new2_hybrid_report.json @@ -0,0 +1,27 @@ +{ + "total_instances": 2, + "submitted_instances": 2, + "completed_instances": 2, + "resolved_instances": 1, + "unresolved_instances": 1, + "empty_patch_instances": 0, + "error_instances": 0, + "completed_ids": [ + "pylint-dev__pylint-5859", + "pylint-dev__pylint-7993" + ], + "incomplete_ids": [], + "empty_patch_ids": [], + "submitted_ids": [ + "pylint-dev__pylint-5859", 
+ "pylint-dev__pylint-7993" + ], + "resolved_ids": [ + "pylint-dev__pylint-5859" + ], + "unresolved_ids": [ + "pylint-dev__pylint-7993" + ], + "error_ids": [], + "schema_version": 2 +} diff --git a/benchmarks/sweBench.ts b/benchmarks/sweBench.ts new file mode 100644 index 0000000..29acaac --- /dev/null +++ b/benchmarks/sweBench.ts @@ -0,0 +1,214 @@ +#!/usr/bin/env tsx +// ============================================ +// OpenSwarm - SWE-bench Lite: solve with OpenSwarm, grade with official harness +// Created: 2026-06-09 +// Purpose: OpenSwarm worker(runAgenticLoop)가 실제 SWE-bench 버그를 풀고, 채점은 공식 +// swebench 하니스에 위임. 채점을 손으로 재현하다 함정(테스트 수집/gold/base 상태)에 +// 반복해 걸려서 — 해결은 OpenSwarm, 평가는 표준 도구로 분리. +// +// 흐름: +// 1. SWE-bench 이미지에서 /testbed(=base+test_patch) 호스트 추출 +// 2. OpenSwarm worker가 호스트 소스를 read/edit로 수정 (Codex CLI 아님 — 진짜 하네스) +// 자가검증: bash run_tests.sh → 컨테이너 conda env에서 FAIL_TO_PASS 실행 +// 3. git diff로 model_patch 추출 → prediction JSON 작성 +// → 채점은 별도: python -m swebench.harness.run_evaluation -p preds.json ... +// +// 실행: OPENROUTER_API=... npx tsx benchmarks/sweBench.ts [outPreds.json] +// ============================================ + +import { mkdtemp, rm, writeFile, readFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { execFile } from 'node:child_process'; +import { promisify } from 'node:util'; +import { runWorker } from '../src/agents/worker.js'; +import { setDefaultAdapter } from '../src/adapters/index.js'; +import { initLocale } from '../src/locale/index.js'; + +const exec = promisify(execFile); + +interface SweInstance { + instance_id: string; + repo: string; + base_commit: string; + problem_statement: string; + FAIL_TO_PASS: string; +} + +function imageFor(id: string): string { + return `swebench/sweb.eval.x86_64.${id.replace('__', '_1776_')}:latest`; +} + +/** + * Diagnosis section for hybrid stage 2 (implementer). 
Includes mechanical + * finishing instructions — in the first hybrid run the implementer had the + * diagnosis (correct location) yet failed on a missing import and `self` use + * inside a @staticmethod; this prevents that recurrence. + * Trust boundary: the diagnosis pseudocode itself may be buggy (measured on + * 7993 — a missing Formatter.parse literal re-escape propagated verbatim to + * the implementer twice). Stating that test results outrank the fix plan + * blocks blind copying. + */ +function buildDiagnosisSection(diagText: string): string { + return ( + `\n\n## Root-cause diagnosis (from a senior engineer — trust this analysis)\n` + + `${diagText}\n\n` + + `Apply the FIX PLAN above. Your job is the implementation and verification, not re-diagnosis.\n` + + `The ROOT CAUSE analysis is reliable, but the FIX PLAN pseudocode may contain mechanical bugs ` + + `of its own. THE TEST RESULT OUTRANKS THE PLAN: if you applied the plan faithfully and the ` + + `tests still fail, the pseudocode itself is buggy — debug from the actual test output and fix ` + + `the implementation (keeping the root-cause approach), do NOT re-apply the same code again.\n` + + `Implementation mechanics matter: if you call a function not imported in that file, ADD the ` + + `import. If you reference \`self\` inside a @staticmethod, convert it to an instance method ` + + `(add \`self\` param, remove the decorator) or pass the values in. If edit_file fails with ` + + `"old_string not found", re-read the exact lines and retry with the verbatim text — do NOT give up. ` + + `If the test output shows NameError / ImportError / AttributeError / TypeError, that is YOUR ` + + `mechanical bug — read the traceback and fix it, do not abandon the approach.\n` + ); +} + +async function sh(cmd: string, args: string[], opts: { cwd?: string; timeoutMs?: number } = {}) { + return exec(cmd, args, { cwd: opts.cwd, timeout: opts.timeoutMs ?? 
600_000, maxBuffer: 1024 * 1024 * 64 }); +} + +const MODEL_NAME = 'openswarm'; + +async function solveOne(inst: SweInstance, model: string): Promise<{ pred: Record; resolvedHint: string }> { + const image = imageFor(inst.instance_id); + const container = `swe-${inst.instance_id.replace(/[^a-z0-9]/gi, '-')}`; + const hostDir = await mkdtemp(join(tmpdir(), 'swe-')); + const log = (s: string) => console.log(s); + const failToPass: string[] = JSON.parse(inst.FAIL_TO_PASS); + + await sh('docker', ['rm', '-f', container], { timeoutMs: 30_000 }).catch(() => {}); + try { + log(`\n=== ${inst.instance_id} (${inst.repo}) ===`); + await sh('docker', ['run', '-d', '--name', container, '--platform', 'linux/amd64', image, 'sleep', 'infinity'], { timeoutMs: 60_000 }); + log(' extracting /testbed...'); + await sh('docker', ['cp', `${container}:/testbed/.`, hostDir], { timeoutMs: 120_000 }); + + // git baseline 고정 (patch 추출 기준). 추출된 /testbed는 이미 git repo. + await sh('git', ['add', '-A'], { cwd: hostDir, timeoutMs: 30_000 }).catch(() => {}); + await sh('git', ['-c', 'user.email=b@b', '-c', 'user.name=b', 'commit', '-qm', 'baseline', '--allow-empty'], { cwd: hostDir, timeoutMs: 30_000 }).catch(() => {}); + const baseSha = (await sh('git', ['rev-parse', 'HEAD'], { cwd: hostDir, timeoutMs: 10_000 })).stdout.trim(); + + // worker 자가검증 래퍼 — 변경분을 컨테이너에 sync 후 conda env에서 테스트. + const runTests = [ + '#!/usr/bin/env bash', + `docker cp . ${container}:/testbed >/dev/null 2>&1`, + `docker exec ${container} bash -lc "source /opt/miniconda3/bin/activate testbed && cd /testbed && python -m pytest ${failToPass.map((t) => `'${t}'`).join(' ')} -q --no-header -p no:warnings --tb=short 2>&1 | tail -25"`, + ].join('\n'); + await writeFile(join(hostDir, 'run_tests.sh'), runTests, { mode: 0o755 }); + + // ---- 하이브리드 모드 (SWE_DIAG_MODEL 설정 시) ---- + // Stage 1: frontier가 read-only 진단 → root cause + 수정 계획 텍스트. + // Stage 2: 경량 모델이 진단서를 받아 구현 + 검증 루프. 
+ // 가설: 경량의 천장은 "진단 깊이"이므로, 그 부분만 frontier가 메우면 + // 긴 edit-test 루프(토큰 대부분)는 싼 모델로 충분할 것. + const diagModel = process.env.SWE_DIAG_MODEL; + const diagFile = process.env.SWE_DIAG_FILE; // 저장된 진단 재사용 (stage 2만 재시도) + let diagnosisSection = ''; + if (diagFile) { + const diagText = (await readFile(diagFile, 'utf-8')).trim(); + log(` stage 1 skipped — reusing diagnosis from ${diagFile} (${diagText.length} chars)`); + diagnosisSection = buildDiagnosisSection(diagText); + } else if (diagModel) { + log(` stage 1: diagnosing with ${diagModel} (read-only)...`); + const diag = await runWorker({ + taskTitle: `Diagnose ${inst.instance_id}`, + taskDescription: + `${inst.problem_statement}\n\n` + + `You are a DIAGNOSTICIAN, not an implementer. Explore the ${inst.repo} source ` + + `(search_files + read_file) and produce a precise root-cause diagnosis. ` + + `Do NOT edit any files. Do NOT run run_tests.sh. Read-only.\n\n` + + `Your final message MUST contain:\n` + + `1. ROOT CAUSE: the exact mechanism of the bug (which function, what goes wrong, why).\n` + + `2. FIX PLAN: the exact file + function to change, and precisely what the change should be ` + + `(describe the code to add/modify — concrete enough that a junior developer could apply it ` + + `without re-deriving the analysis).\n` + + `Failing tests (for context):\n` + failToPass.map((t) => ` - ${t}`).join('\n'), + projectPath: hostDir, + adapterName: 'openrouter', + model: diagModel, + timeoutMs: 900_000, + maxTurns: 50, + onLog: process.env.SWE_VERBOSE ? 
(l) => console.log(` [diag] ${l}`) : () => {}, + }); + // 진단자가 실수로 수정했어도 구현 단계는 깨끗한 베이스에서 시작 + await sh('git', ['checkout', '--', '.'], { cwd: hostDir, timeoutMs: 30_000 }).catch(() => {}); + const diagText = (diag.output || diag.summary || '').trim(); + log(` stage 1 done — diagnosis ${diagText.length} chars`); + await writeFile(`/tmp/swe_diagnosis_${inst.instance_id}.txt`, diagText).catch(() => {}); + diagnosisSection = buildDiagnosisSection(diagText); + } + + log(` ${diagnosisSection ? 'stage 2: implementing' : 'worker solving'} (OpenSwarm harness)...`); + const result = await runWorker({ + taskTitle: `Fix ${inst.instance_id}`, + taskDescription: + `${inst.problem_statement}${diagnosisSection}\n\n` + + `This is a real bug in ${inst.repo}. Your job: locate the root cause in the SOURCE files ` + + `(search_files + read_file) and FIX it with edit_file. Do NOT edit test files.\n\n` + + `Do NOT try to set up a Python environment (no pip install, no venv) — the test environment ` + + `is already managed inside a container.\n\n` + + `MANDATORY verification loop: after EVERY edit, run \`bash run_tests.sh\` — it executes the ` + + `failing tests in the correct environment and prints pass/fail. If tests still fail, read the ` + + `failure output, refine your diagnosis, and edit again. Repeat edit→test until the tests pass. ` + + `Do NOT finish while the tests are failing — an unverified patch is worthless. A plausible-looking ` + + `fix in the wrong place is the most common failure mode; only the test output proves correctness.\n\n` + + `Failing tests:\n` + failToPass.map((t) => ` - ${t}`).join('\n'), + projectPath: hostDir, + adapterName: 'openrouter', + model, + timeoutMs: 1_200_000, + maxTurns: 80, + // SWE 작업은 수정이 필수 — 모델이 분석만 하고 끝내려 하면 2회까지 되민다. + nudgeMaxOnNoEdit: 2, + // Verification-harness protection — on 7993 the implementer blamed test + // failures on run_tests.sh and edited it 5 times, dismantling verification. 
+ protectedFiles: ['run_tests.sh'], + // run_tests.sh = docker cp + in-container pytest — the 30s default times + // out into a silent no-output failure the model reads as a broken env. + bashTimeoutMs: 240_000, + onLog: process.env.SWE_VERBOSE ? (l) => console.log(` ${l}`) : () => {}, + }); + + // model_patch 추출 — run_tests.sh는 제외(평가 노이즈 방지) + await sh('git', ['rm', '--cached', '-q', 'run_tests.sh'], { cwd: hostDir, timeoutMs: 10_000 }).catch(() => {}); + const diff = (await sh('git', ['diff', baseSha, '--', '.', ':(exclude)run_tests.sh'], { cwd: hostDir, timeoutMs: 30_000 })).stdout; + + log(` worker done — ${result.filesChanged?.length ?? 0} files, patch ${diff.split('\n').length} lines`); + return { + pred: { instance_id: inst.instance_id, model_name_or_path: MODEL_NAME, model_patch: diff }, + resolvedHint: `${result.filesChanged?.length ?? 0} files`, + }; + } finally { + await sh('docker', ['rm', '-f', container], { timeoutMs: 30_000 }).catch(() => {}); + await rm(hostDir, { recursive: true, force: true }); + } +} + +async function main() { + initLocale('en'); + setDefaultAdapter('openrouter'); + const file = process.argv[2]; + const outPreds = process.argv[3] ?? join(tmpdir(), 'swe-preds.json'); + if (!file) { console.error('usage: sweBench.ts [outPreds.json]'); process.exit(1); } + const instances: SweInstance[] = JSON.parse(await readFile(file, 'utf-8')); + const model = process.env.SWE_MODEL ?? 
'deepseek/deepseek-v4-flash'; + console.log(`[swe] ${instances.length} instances, model=${model} (OpenSwarm harness → official grading)`); + + const preds = []; + for (const inst of instances) { + const { pred } = await solveOne(inst, model); + preds.push(pred); + } + await writeFile(outPreds, JSON.stringify(preds, null, 2)); + console.log(`\npredictions → ${outPreds}`); + console.log(`\nGrade with official harness:`); + console.log(` /tmp/swebench-env/bin/python -m swebench.harness.run_evaluation \\`); + console.log(` --dataset_name SWE-bench/SWE-bench_Lite --predictions_path ${outPreds} \\`); + console.log(` --run_id openswarm-run --instance_ids ${instances.map((i) => i.instance_id).join(' ')} --cache_level instance`); +} + +main().catch((e) => { console.error('FATAL', e); process.exit(1); }); diff --git a/benchmarks/tasks/codingTasks.ts b/benchmarks/tasks/codingTasks.ts new file mode 100644 index 0000000..675b1d2 --- /dev/null +++ b/benchmarks/tasks/codingTasks.ts @@ -0,0 +1,491 @@ +// ============================================ +// OpenSwarm - Coding Benchmark Task Set +// Created: 2026-06-09 +// Purpose: 모델 라우팅 파레토 측정용 코딩 태스크. 각 태스크는 임시 git repo를 +// 만들고(setup), runWorker로 작업시킨 뒤, 객관적 기준(check)으로 채점한다. +// VEGA benchmarks의 Track A(결정적 매처) 철학을 코딩 도메인에 이식. +// ============================================ + +export interface BenchTask { + id: string; + /** + * L0=단일 수정, L1=탐색+수정, L2=다중 파일/추론, L3=테스트 통과, + * L4=고난도(연쇄 의존성/edge case 완전성/숨은 버그 추적), + * L5=난해(알고리즘 정확성/상태기계/미묘한 경계·타입 — 약한 모델이 실패하는 변별 영역). + * L6(실전 GitHub 버그, SWE-bench)은 self-contained가 아니라 Docker+공식 채점이 필요해 + * 이 파일이 아니라 `benchmarks/sweBench.ts`가 담당한다. 루브릭 전체는 RUBRIC.md 참조. + */ + level: 'L0' | 'L1' | 'L2' | 'L3' | 'L4' | 'L5'; + title: string; + description: string; + /** repo 초기 파일 셋 (path → content) */ + files: Record; + /** + * 채점: 작업 후 repo 상태를 받아 통과 여부 판정. 객관적이어야 한다. 
+ * - read: repo 내 파일 내용 읽기 (포매팅·구현방식 무관 검증용) + * - repoDir: repo 절대경로 (테스트 실제 실행 등 행위 검증용) + * 가능하면 정규식 휴리스틱보다 실행(테스트 통과)으로 채점해 false negative를 피한다. + */ + check: ( + read: (path: string) => string | null, + repoDir: string, + ) => { passed: boolean; reason: string }; +} + +import { execFileSync } from 'node:child_process'; +import { readdirSync } from 'node:fs'; + +/** + * 행위 검증 공통 헬퍼: repo 안의 테스트 파일을 실제 실행해 PASS 마커를 확인한다. + * 정규식 휴리스틱 대비 false negative가 없다(구현 방식과 무관, 통과 여부만 본다). + */ +function runTestFile(repoDir: string, testFile: string): { passed: boolean; reason: string } { + try { + const out = execFileSync('npx', ['tsx', testFile], { + cwd: repoDir, + timeout: 60_000, + encoding: 'utf-8', + stdio: ['ignore', 'pipe', 'pipe'], + }); + if (out.includes('PASS')) return { passed: true, reason: 'test passed (executed)' }; + return { passed: false, reason: `ran but no PASS marker: ${out.slice(0, 80)}` }; + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + return { passed: false, reason: `test failed: ${msg.slice(0, 140)}` }; + } +} + +/** + * 타입체크 검증: repo의 모든 .ts를 strict tsc로 컴파일해 타입 에러가 없는지 본다. + * 타입 변경 태스크(런타임만으론 검증 불가)에 쓴다. tsx는 타입을 무시하므로 별도 필요. + */ +function runTypeCheck(repoDir: string, includeTests = false): { passed: boolean; reason: string } { + const files = readdirSync(repoDir).filter( + (f) => f.endsWith('.ts') && (includeTests || !f.endsWith('.test.ts')), + ); + try { + // npx -p typescript 로 명시해 repo-local tsc 부재 시에도 정확한 tsc를 받는다. + execFileSync('npx', ['-y', '-p', 'typescript@5.6.3', 'tsc', '--noEmit', '--strict', '--skipLibCheck', '--moduleResolution', 'bundler', '--module', 'esnext', '--target', 'es2022', ...files], { + cwd: repoDir, + timeout: 120_000, + encoding: 'utf-8', + stdio: ['ignore', 'pipe', 'pipe'], + }); + return { passed: true, reason: 'typecheck clean' }; + } catch (err) { + const out = ((err as { stdout?: string }).stdout ?? '') + ((err as { stderr?: string }).stderr ?? 
''); + return { passed: false, reason: `typecheck failed: ${(out || String(err)).slice(0, 140)}` }; + } +} + +export const CODING_TASKS: BenchTask[] = [ + { + id: 'L0-fix-multiply', + level: 'L0', + title: 'Fix the multiply() bug in calc.ts', + description: + 'multiply(a, b) in calc.ts currently returns a + b, which is wrong. ' + + 'It must return the product a * b. Fix only multiply; leave add() untouched.', + files: { + 'calc.ts': + 'export function add(a: number, b: number): number {\n return a + b;\n}\n\n' + + '// BUG: multiply adds instead of multiplying\n' + + 'export function multiply(a: number, b: number): number {\n return a + b;\n}\n', + 'README.md': '# scratch\n\nMath utilities.\n', + }, + check: (read) => { + const c = read('calc.ts'); + if (!c) return { passed: false, reason: 'calc.ts missing' }; + const mul = /function multiply[\s\S]*?return\s+a\s*\*\s*b/.test(c); + const addOk = /function add[\s\S]*?return\s+a\s*\+\s*b/.test(c); + if (!mul) return { passed: false, reason: 'multiply not fixed to a*b' }; + if (!addOk) return { passed: false, reason: 'add() was altered' }; + return { passed: true, reason: 'multiply=a*b, add intact' }; + }, + }, + + { + id: 'L1-add-null-guard', + level: 'L1', + title: 'Add a null/empty guard to parseConfig', + description: + 'parseConfig(raw) in config.ts calls JSON.parse(raw) directly and throws on ' + + 'null/undefined/empty input. Add a guard: if raw is null, undefined, or an empty ' + + 'string, return an empty object {} instead of throwing. Keep valid JSON parsing intact.', + files: { + 'config.ts': + 'export function parseConfig(raw: string): Record {\n' + + ' return JSON.parse(raw);\n' + + '}\n', + }, + check: (read) => { + const c = read('config.ts'); + if (!c) return { passed: false, reason: 'config.ts missing' }; + // 가드가 있어야 함: raw falsy → {} 반환. 다양한 표현 허용. 
+ const hasGuard = + /if\s*\(\s*!raw/.test(c) || + /raw\s*===?\s*null/.test(c) || + /raw\s*==\s*null/.test(c) || + /!raw\s*\|\|/.test(c) || + /raw\?\?/.test(c) || + /raw\.length\s*===?\s*0/.test(c) || + /raw\.trim\(\)/.test(c); + const returnsEmpty = /\{\s*\}/.test(c); // empty-object literal present somewhere + const stillParses = /JSON\.parse/.test(c); + if (!hasGuard) return { passed: false, reason: 'no null/empty guard found' }; + if (!returnsEmpty) return { passed: false, reason: 'no empty-object return' }; + if (!stillParses) return { passed: false, reason: 'JSON.parse removed' }; + return { passed: true, reason: 'guard + empty return + parse intact' }; + }, + }, + + { + id: 'L2-rename-across-files', + level: 'L2', + title: 'Rename getUserName to getDisplayName across the module', + description: + 'Rename the function getUserName to getDisplayName. It is defined in user.ts and ' + + 'called in greet.ts. Update BOTH the definition and the call site so the code stays ' + + 'consistent. 
Do not change behavior, only the name.', + files: { + 'user.ts': + 'export function getUserName(id: string): string {\n' + + ' return `user-${id}`;\n' + + '}\n', + 'greet.ts': + "import { getUserName } from './user.js';\n\n" + + 'export function greet(id: string): string {\n' + + ' return `Hello, ${getUserName(id)}`;\n' + + '}\n', + }, + check: (read) => { + const u = read('user.ts'); + const g = read('greet.ts'); + if (!u || !g) return { passed: false, reason: 'user.ts or greet.ts missing' }; + const defRenamed = /function getDisplayName/.test(u) && !/function getUserName/.test(u); + const callRenamed = /getDisplayName\(/.test(g) && !/getUserName/.test(g); + const importRenamed = /import\s*\{\s*getDisplayName\s*\}/.test(g); + if (!defRenamed) return { passed: false, reason: 'definition not renamed' }; + if (!callRenamed) return { passed: false, reason: 'call site not renamed' }; + if (!importRenamed) return { passed: false, reason: 'import not updated' }; + return { passed: true, reason: 'def + import + call all renamed' }; + }, + }, + + { + id: 'L3-implement-to-pass-test', + level: 'L3', + title: 'Implement isPalindrome so the existing test passes', + description: + 'isPalindrome(s) in palindrome.ts is a stub that always returns false. Implement it ' + + 'so it returns true iff the string reads the same forwards and backwards (case-sensitive, ' + + 'comparing the raw characters). The test file palindrome.test.ts already exists — make it pass. 
' + + 'Run the test to verify.', + files: { + 'palindrome.ts': + 'export function isPalindrome(s: string): boolean {\n' + + ' return false; // TODO: implement\n' + + '}\n', + 'palindrome.test.ts': + "import { isPalindrome } from './palindrome.js';\n" + + "if (isPalindrome('racecar') !== true) throw new Error('racecar should be palindrome');\n" + + "if (isPalindrome('hello') !== false) throw new Error('hello is not palindrome');\n" + + "if (isPalindrome('') !== true) throw new Error('empty is palindrome');\n" + + "if (isPalindrome('ab') !== false) throw new Error('ab is not palindrome');\n" + + "console.log('PASS');\n", + }, + check: (read, repoDir) => { + const p = read('palindrome.ts'); + if (!p) return { passed: false, reason: 'palindrome.ts missing' }; + // 행위 검증: 실제로 테스트를 실행한다. 구현 방식(reverse/투포인터/재귀)과 + // 무관하게 "테스트가 통과하는가"만 본다 — 정규식 false negative 제거. + try { + const out = execFileSync('npx', ['tsx', 'palindrome.test.ts'], { + cwd: repoDir, + timeout: 60_000, + encoding: 'utf-8', + stdio: ['ignore', 'pipe', 'pipe'], + }); + if (out.includes('PASS')) return { passed: true, reason: 'test passed (executed)' }; + return { passed: false, reason: `test ran but no PASS marker: ${out.slice(0, 80)}` }; + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + return { passed: false, reason: `test failed: ${msg.slice(0, 120)}` }; + } + }, + }, + + // ============ L4 — 고난도 (경량 모델 변별용) ============ + + { + id: 'L4-cascading-signature-change', + level: 'L4', + title: 'Change formatPrice to take a currency arg, fix ALL call sites', + description: + 'formatPrice(amount) in money.ts must become formatPrice(amount, currency) where currency ' + + 'is a string like "USD" prepended to the output (e.g. formatPrice(5, "USD") → "USD 5.00"). ' + + 'This function is called in THREE other files: cart.ts, invoice.ts, and receipt.ts. Update the ' + + 'signature AND every call site so all callers pass a sensible currency. 
The test file checks ' + + 'every module — run it to verify nothing was missed.', + files: { + 'money.ts': + 'export function formatPrice(amount: number): string {\n' + + ' return amount.toFixed(2);\n' + + '}\n', + 'cart.ts': + "import { formatPrice } from './money.js';\n" + + 'export function cartLine(qty: number, price: number): string {\n' + + ' return `${qty} x ${formatPrice(price)}`;\n' + + '}\n', + 'invoice.ts': + "import { formatPrice } from './money.js';\n" + + 'export function invoiceTotal(sum: number): string {\n' + + ' return `Total: ${formatPrice(sum)}`;\n' + + '}\n', + 'receipt.ts': + "import { formatPrice } from './money.js';\n" + + 'export function receiptLine(label: string, amt: number): string {\n' + + ' return `${label}: ${formatPrice(amt)}`;\n' + + '}\n', + 'check.test.ts': + "import { formatPrice } from './money.js';\n" + + "import { cartLine } from './cart.js';\n" + + "import { invoiceTotal } from './invoice.js';\n" + + "import { receiptLine } from './receipt.js';\n" + + "if (formatPrice(5, 'USD') !== 'USD 5.00') throw new Error('formatPrice signature/output wrong: ' + formatPrice(5,'USD'));\n" + + "if (!cartLine(2, 3).includes('USD')) throw new Error('cart.ts call site not updated');\n" + + "if (!invoiceTotal(9).includes('USD')) throw new Error('invoice.ts call site not updated');\n" + + "if (!receiptLine('Tax', 1).includes('USD')) throw new Error('receipt.ts call site not updated');\n" + + "console.log('PASS');\n", + }, + check: (_read, repoDir) => runTestFile(repoDir, 'check.test.ts'), + }, + + { + id: 'L4-edge-case-completeness', + level: 'L4', + title: 'Implement median() correctly for all edge cases', + description: + 'Implement median(nums) in stats.ts. The median of a sorted list is the middle value, OR the ' + + 'average of the two middle values when the count is even. 
Handle: empty array (return 0), ' + + 'single element, even count (average the two middle values), and UNSORTED input (you must sort first — ' + + 'the input is not guaranteed sorted). The test exercises all of these. Run it to verify.', + files: { + 'stats.ts': + 'export function median(nums: number[]): number {\n' + + ' return nums[Math.floor(nums.length / 2)]; // naive + wrong for even/unsorted/empty\n' + + '}\n', + 'stats.test.ts': + "import { median } from './stats.js';\n" + + "function eq(a: number, b: number, m: string){ if (!(Math.abs(a-b) <= 1e-9)) throw new Error(m + ' got ' + a); }\n" + + "eq(median([]), 0, 'empty → 0');\n" + + "eq(median([5]), 5, 'single');\n" + + "eq(median([1,2,3]), 2, 'odd sorted');\n" + + "eq(median([1,2,3,4]), 2.5, 'even → average');\n" + + "eq(median([3,1,2]), 2, 'UNSORTED odd');\n" + + "eq(median([4,1,3,2]), 2.5, 'UNSORTED even');\n" + + "eq(median([7,7,7]), 7, 'duplicates');\n" + + "console.log('PASS');\n", + }, + check: (_read, repoDir) => runTestFile(repoDir, 'stats.test.ts'), + }, + + { + id: 'L4-hidden-bug-debug', + level: 'L4', + title: 'Find and fix the bug making the cart total wrong', + description: + 'cartTotal() in shop.ts returns the wrong total and test.ts fails. The symptom is a wrong number, ' + + 'but the root cause is NOT in cartTotal itself — it is in a helper it calls. Investigate the helpers ' + + '(applyDiscount, lineSubtotal), find the actual bug, and fix it. Do not just patch cartTotal to mask ' + + 'the symptom — fix the real cause. Run test.ts to verify.', + files: { + // 진짜 버그: applyDiscount가 percent를 0-1로 기대하는데 lineSubtotal은 0-100으로 넘긴다. + // cartTotal 자체는 멀쩡하다 — 증상은 cartTotal에서, 원인은 applyDiscount/호출 규약에서. + 'shop.ts': + 'function lineSubtotal(price: number, qty: number): number {\n' + + ' return price * qty;\n' + + '}\n' + + '// discount is a percentage 0-100 (e.g. 
10 = 10% off)\n' + + 'function applyDiscount(amount: number, discount: number): number {\n' + + ' return amount - amount * discount; // BUG: treats discount as a fraction, not a percent\n' + + '}\n' + + 'export function cartTotal(price: number, qty: number, discountPercent: number): number {\n' + + ' return applyDiscount(lineSubtotal(price, qty), discountPercent);\n' + + '}\n', + 'test.ts': + "import { cartTotal } from './shop.js';\n" + + "// 100 * 2 = 200, 10% off → 180\n" + + "const got = cartTotal(100, 2, 10);\n" + + "if (Math.abs(got - 180) > 1e-9) throw new Error('expected 180, got ' + got);\n" + + "// 50 * 1 = 50, 0% off → 50\n" + + "if (Math.abs(cartTotal(50, 1, 0) - 50) > 1e-9) throw new Error('0% case wrong: ' + cartTotal(50,1,0));\n" + + "console.log('PASS');\n", + }, + check: (_read, repoDir) => runTestFile(repoDir, 'test.ts'), + }, + + { + id: 'L4-deep-dependency-chain', + level: 'L4', + title: 'Change the User.id type from number to string across the chain', + description: + 'User.id is currently a number. Change it to a string everywhere it flows: the User interface ' + + '(types.ts), the factory makeUser (factory.ts), the lookup findUser (repo.ts which compares ids), ' + + 'and the formatter userLabel (format.ts). Every layer touches the id — update all of them so types ' + + 'stay consistent and the test passes. 
Run check.test.ts.', + files: { + 'types.ts': 'export interface User {\n id: number;\n name: string;\n}\n', + 'factory.ts': + "import type { User } from './types.js';\n" + + 'export function makeUser(id: number, name: string): User {\n' + + ' return { id, name };\n' + + '}\n', + 'repo.ts': + "import type { User } from './types.js';\n" + + 'export function findUser(users: User[], id: number): User | undefined {\n' + + ' return users.find((u) => u.id === id);\n' + + '}\n', + 'format.ts': + "import type { User } from './types.js';\n" + + 'export function userLabel(u: User): string {\n' + + ' return `#${u.id} ${u.name}`;\n' + + '}\n', + 'check.test.ts': + "import { makeUser } from './factory.js';\n" + + "import { findUser } from './repo.js';\n" + + "import { userLabel } from './format.js';\n" + + "const u = makeUser('abc', 'Ann');\n" + + "if (typeof u.id !== 'string') throw new Error('id should be string, got ' + typeof u.id);\n" + + "const found = findUser([u], 'abc');\n" + + "if (!found) throw new Error('findUser failed with string id');\n" + + "if (userLabel(u) !== '#abc Ann') throw new Error('label wrong: ' + userLabel(u));\n" + + "console.log('PASS');\n", + }, + // 타입 변경 태스크는 런타임만으로 검증 불가(tsx는 타입을 무시) — tsc 타입체크를 + // 먼저 통과해야 하고(그래야 id:number stub이 string 호출에서 걸림), 그 다음 런타임 테스트. + check: (_read, repoDir) => { + // test 파일까지 타입체크에 포함 — stub의 id:number가 string 호출에서 걸리게. + const typecheck = runTypeCheck(repoDir, true); + if (!typecheck.passed) return typecheck; + return runTestFile(repoDir, 'check.test.ts'); + }, + }, + + // ============ L5 — 난해 (알고리즘/상태/경계 — 강한 변별) ============ + + { + id: 'L5-merge-intervals', + level: 'L5', + title: 'Implement mergeIntervals correctly (overlap + sort + touch)', + description: + 'Implement mergeIntervals(intervals) in intervals.ts. Given an array of [start, end] pairs, merge ' + + 'all overlapping intervals and return them sorted by start. Tricky cases the test checks: unsorted ' + + 'input, intervals that merely touch (e.g. 
[1,2] and [2,3] → [1,3]), fully nested intervals, and an ' + + 'empty array. Return [] for empty. Run intervals.test.ts.', + files: { + 'intervals.ts': + 'export function mergeIntervals(intervals: number[][]): number[][] {\n' + + ' return intervals; // TODO: implement\n' + + '}\n', + 'intervals.test.ts': + "import { mergeIntervals } from './intervals.js';\n" + + "function eq(a: number[][], b: number[][], m: string){ if (JSON.stringify(a)!==JSON.stringify(b)) throw new Error(m+' got '+JSON.stringify(a)); }\n" + + "eq(mergeIntervals([]), [], 'empty');\n" + + "eq(mergeIntervals([[1,3],[2,6],[8,10]]), [[1,6],[8,10]], 'overlap');\n" + + "eq(mergeIntervals([[1,2],[2,3]]), [[1,3]], 'touching');\n" + + "eq(mergeIntervals([[1,10],[2,3],[4,5]]), [[1,10]], 'nested');\n" + + "eq(mergeIntervals([[8,10],[1,3],[2,6]]), [[1,6],[8,10]], 'unsorted');\n" + + "console.log('PASS');\n", + }, + check: (_read, repoDir) => runTestFile(repoDir, 'intervals.test.ts'), + }, + + { + id: 'L5-lru-cache', + level: 'L5', + title: 'Implement an LRU cache with capacity eviction', + description: + 'Implement the LRUCache class in lru.ts with a constructor(capacity), get(key) returning the value ' + + 'or -1 if absent, and put(key, value). On put beyond capacity, evict the LEAST recently used entry. ' + + 'A get OR a put counts as a use (makes the key most-recently-used). The test drives a precise ' + + 'eviction sequence — order matters. 
Run lru.test.ts.', + files: { + 'lru.ts': + 'export class LRUCache {\n' + + ' constructor(capacity: number) { /* TODO */ }\n' + + ' get(key: number): number { return -1; }\n' + + ' put(key: number, value: number): void { /* TODO */ }\n' + + '}\n', + 'lru.test.ts': + "import { LRUCache } from './lru.js';\n" + + "const c = new LRUCache(2);\n" + + "c.put(1, 1); c.put(2, 2);\n" + + "if (c.get(1) !== 1) throw new Error('get(1) should be 1');\n" + + "c.put(3, 3); // evicts 2 (1 was just used)\n" + + "if (c.get(2) !== -1) throw new Error('2 should be evicted');\n" + + "c.put(4, 4); // evicts 1 (3 and... 1 is LRU now? order: get1, put3, get2(miss), put4 → evict 1)\n" + + "if (c.get(1) !== -1) throw new Error('1 should be evicted');\n" + + "if (c.get(3) !== 3) throw new Error('3 should remain');\n" + + "if (c.get(4) !== 4) throw new Error('4 should remain');\n" + + "console.log('PASS');\n", + }, + check: (_read, repoDir) => runTestFile(repoDir, 'lru.test.ts'), + }, + + { + id: 'L5-tokenizer-state-machine', + level: 'L5', + title: 'Implement a tokenizer that respects quoted strings', + description: + 'Implement tokenize(input) in tokenizer.ts. Split the input on spaces into tokens, BUT text inside ' + + 'double quotes is a single token with the quotes removed, and may contain spaces. Example: ' + + 'tokenize(\'a "b c" d\') → ["a", "b c", "d"]. Also handle: empty input → [], multiple spaces ' + + 'collapse, and an escaped quote \\" inside a quoted string stays a literal quote. 
Run tokenizer.test.ts.', + files: { + 'tokenizer.ts': + 'export function tokenize(input: string): string[] {\n' + + ' return input.split(" "); // TODO: handle quotes, escapes, empties\n' + + '}\n', + 'tokenizer.test.ts': + "import { tokenize } from './tokenizer.js';\n" + + "function eq(a: string[], b: string[], m: string){ if (JSON.stringify(a)!==JSON.stringify(b)) throw new Error(m+' got '+JSON.stringify(a)); }\n" + + "eq(tokenize(''), [], 'empty');\n" + + "eq(tokenize('a b c'), ['a','b','c'], 'simple');\n" + + "eq(tokenize('a \"b c\" d'), ['a','b c','d'], 'quoted');\n" + + "eq(tokenize('a b'), ['a','b'], 'multi-space');\n" + + "eq(tokenize('\"hi \\\\\"there\\\\\"\"'), ['hi \"there\"'], 'escaped quote');\n" + + "console.log('PASS');\n", + }, + check: (_read, repoDir) => runTestFile(repoDir, 'tokenizer.test.ts'), + }, + + { + id: 'L5-generic-groupby', + level: 'L5', + title: 'Implement a correctly-typed generic groupBy', + description: + 'Implement groupBy(items, keyFn) in groupby.ts. It groups an array of T by the key returned ' + + 'by keyFn (a string or number), returning a Map where insertion order within each group is ' + + 'preserved. The signature must be properly generic (no any). The test groups objects and numbers and ' + + 'checks both grouping correctness and that the return is a Map. Run groupby.test.ts.', + files: { + 'groupby.ts': + 'export function groupBy(items: any, keyFn: any): any {\n' + + ' return new Map(); // TODO: implement, and make it properly generic (no any)\n' + + '}\n', + 'groupby.test.ts': + "import { groupBy } from './groupby.js';\n" + + "const nums = [1,2,3,4,5,6];\n" + + "const byParity = groupBy(nums, (n: number) => n % 2 === 0 ? 
'even' : 'odd');\n" + + "if (!(byParity instanceof Map)) throw new Error('must return a Map');\n" + + "if (JSON.stringify(byParity.get('odd')) !== JSON.stringify([1,3,5])) throw new Error('odd group wrong: ' + JSON.stringify(byParity.get('odd')));\n" + + "if (JSON.stringify(byParity.get('even')) !== JSON.stringify([2,4,6])) throw new Error('even group wrong');\n" + + "const people = [{n:'a',age:30},{n:'b',age:30},{n:'c',age:40}];\n" + + "const byAge = groupBy(people, (p: {age:number}) => p.age);\n" + + "if (byAge.get(30).length !== 2) throw new Error('age-30 group should have 2');\n" + + "console.log('PASS');\n", + }, + check: (_read, repoDir) => runTestFile(repoDir, 'groupby.test.ts'), + }, +]; diff --git a/benchmarks/throughputProbe.ts b/benchmarks/throughputProbe.ts new file mode 100644 index 0000000..135ea5a --- /dev/null +++ b/benchmarks/throughputProbe.ts @@ -0,0 +1,76 @@ +#!/usr/bin/env tsx +// Created: 2026-06-09 +// Purpose: ZDR(data_collection:deny) 조건에서 후보 모델의 provider/throughput 변동 측정. +// 실제 운영(provider 자동선택)과 동일 조건. tok/s + provider 분포 + TTFT. +// Dependencies: tsx, OPENROUTER_API +// Test Status: profiling +// +// 실행: source ~/dev/VEGA/.env && npx tsx benchmarks/throughputProbe.ts + +const API = 'https://openrouter.ai/api/v1/chat/completions'; +const PROMPT = + 'Write a TypeScript function validateEmail(s) using a regex, with a null/empty guard ' + + 'and JSDoc, then 3 unit test cases. 
Output code only.'; + +const MODELS = [ + 'z-ai/glm-4.7-flash', + 'qwen/qwen3-coder-30b-a3b-instruct', + 'deepseek/deepseek-v4-flash', + 'google/gemini-2.5-flash', +]; +const SAMPLES = 4; + +interface Sample { provider: string; tokens: number; sec: number; tps: number; err?: string } + +async function probe(apiKey: string, model: string): Promise { + const t0 = Date.now(); + try { + const res = await fetch(API, { + method: 'POST', + headers: { Authorization: `Bearer ${apiKey}`, 'Content-Type': 'application/json' }, + body: JSON.stringify({ + model, + messages: [{ role: 'user', content: PROMPT }], + max_tokens: 700, + provider: { data_collection: 'deny' }, // ZDR 유지 조건 + }), + }); + const sec = (Date.now() - t0) / 1000; + const d = await res.json() as { + provider?: string; + usage?: { completion_tokens?: number }; + error?: { message?: string }; + }; + if (d.error) return { provider: '-', tokens: 0, sec, tps: 0, err: d.error.message?.slice(0, 50) }; + const tokens = d.usage?.completion_tokens ?? 0; + return { provider: d.provider ?? '?', tokens, sec, tps: sec > 0 ? 
tokens / sec : 0 }; + } catch (e) { + return { provider: '-', tokens: 0, sec: (Date.now() - t0) / 1000, tps: 0, err: String(e).slice(0, 50) }; + } +} + +async function main() { + const apiKey = process.env.OPENROUTER_API; + if (!apiKey) { console.error('OPENROUTER_API not set'); process.exit(1); } + + for (const model of MODELS) { + console.log(`\n=== ${model} (ZDR, ${SAMPLES} samples) ===`); + const samples: Sample[] = []; + // 직렬 — 같은 모델 동시호출은 provider 큐를 왜곡 + for (let i = 0; i < SAMPLES; i++) { + const s = await probe(apiKey, model); + samples.push(s); + if (s.err) console.log(` sample${i + 1}: ERR ${s.err}`); + else console.log(` sample${i + 1}: ${s.provider.padEnd(14)} ${String(s.tokens).padStart(4)}tok ${s.sec.toFixed(1)}s ${s.tps.toFixed(1)} tok/s`); + } + const ok = samples.filter(s => !s.err && s.tokens > 0); + if (ok.length) { + const avgTps = ok.reduce((a, s) => a + s.tps, 0) / ok.length; + const avgSec = ok.reduce((a, s) => a + s.sec, 0) / ok.length; + const provs = [...new Set(ok.map(s => s.provider))].join(', '); + console.log(` → avg ${avgTps.toFixed(1)} tok/s, ${avgSec.toFixed(1)}s | providers: ${provs}`); + } + } +} + +main().catch(e => { console.error('FATAL', e); process.exit(1); }); From d3d6a66272749f082c881f49fb0ebcf3f8ffa960 Mon Sep 17 00:00:00 2001 From: unohee Date: Thu, 11 Jun 2026 00:24:49 +0900 Subject: [PATCH 4/7] chore(support): web dashboard/chat updates, repo metadata mapping, example configs - repoMetadata.ts (+tests): per-repo openswarm.json for explicit Linear project mapping, consumed by projectMapper (+tests). - Web dashboard/chat backend/TUI updates and event hub/service plumbing. - .env.example / config.example.yaml refreshed for the OpenRouter adapter and model routing defaults. - .gitignore: exclude local experiments (testing/), SWE-bench evaluation logs, and root grading report artifacts. 
--- .env.example | 13 +- .gitignore | 5 + config.example.yaml | 19 +- src/core/eventHub.ts | 38 ++- src/core/service.ts | 4 +- src/core/types.ts | 4 +- src/discord/discordCore.ts | 134 ++--------- src/support/chat.ts | 34 +-- src/support/chatBackend.ts | 110 +++------ src/support/chatTui.ts | 36 ++- src/support/dashboardHtml.ts | 383 +++++++++++++++++++++++++----- src/support/projectMapper.test.ts | 81 +++++++ src/support/projectMapper.ts | 32 ++- src/support/repoMetadata.test.ts | 71 ++++++ src/support/repoMetadata.ts | 92 +++++++ src/support/web.ts | 62 ++++- 16 files changed, 788 insertions(+), 330 deletions(-) create mode 100644 src/support/projectMapper.test.ts create mode 100644 src/support/repoMetadata.test.ts create mode 100644 src/support/repoMetadata.ts diff --git a/.env.example b/.env.example index 542242a..75b44ce 100644 --- a/.env.example +++ b/.env.example @@ -33,11 +33,20 @@ LINEAR_TEAM_ID="00000000-0000-0000-0000-000000000000" NPM_TOKENS="" # ----------------------------------------------------------------------------- -# OpenAI / Codex CLI auth (optional) +# OpenAI / ChatGPT OAuth (optional) # ----------------------------------------------------------------------------- -# Overrides the default OAuth client ID used by the PKCE flow +# `openswarm auth login --provider gpt` uses the public @openai/codex OAuth +# client out of the box. Only set this to pin a custom OAuth app. OPENAI_CLIENT_ID="" +# ----------------------------------------------------------------------------- +# OpenRouter (required if adapter=openrouter) +# ----------------------------------------------------------------------------- +# Option 1: Set OPENROUTER_API directly — adapter reads this env var automatically. 
+# Can source from VEGA .env: `source ~/dev/VEGA/.env` +# Option 2: PKCE login — `openswarm auth login --provider openrouter` +OPENROUTER_API="" + # ----------------------------------------------------------------------------- # Task state (optional) # ----------------------------------------------------------------------------- diff --git a/.gitignore b/.gitignore index 6c4cb30..e81809d 100644 --- a/.gitignore +++ b/.gitignore @@ -71,5 +71,10 @@ models/ # Trash trash/ +# Local experiments & SWE-bench evaluation artifacts +testing/ +logs/run_evaluation/ +openswarm.orb-*.json + # Subprojects openclaw/ diff --git a/config.example.yaml b/config.example.yaml index 2d0664b..0d78456 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -3,8 +3,11 @@ # Copy this file to config.yaml to use # Default CLI adapter for worker/reviewer stages -# Options: claude, codex, gpt, local, lmstudio -# For GPT: run `openswarm auth login --provider gpt` first +# Options: claude, codex, gpt, local, lmstudio, openrouter +# For GPT: run `openswarm auth login --provider gpt` +# (uses the public Codex OAuth client by default — no extra config needed) +# For OpenRouter: run `openswarm auth login --provider openrouter` +# (PKCE browser flow → stores a sk-or-* API key; falls back to manual paste) # For local: start Ollama, LMStudio, or llama.cpp server # For lmstudio: start LM Studio Local Server (default http://localhost:1234) # Optional env: LMSTUDIO_BASE_URL, LMSTUDIO_MODEL, LMSTUDIO_API_KEY @@ -52,7 +55,7 @@ autonomous: decomposition: enabled: true # Enable decomposition thresholdMinutes: 30 # Decompose if estimated time exceeds this - plannerModel: claude-sonnet-4-20250514 # Planner model + plannerModel: claude-opus-4-7 # Planner model (Opus for deep decomposition) # Per-role settings # Hybrid config: Claude for complex coding, local model for review/docs ($0) @@ -60,15 +63,15 @@ autonomous: worker: enabled: true adapter: claude - model: claude-sonnet-4-20250514 # Sonnet for coding 
tasks - escalateModel: claude-opus-4-6 # On failure: Opus + model: claude-sonnet-4-6 # Sonnet for coding tasks + escalateModel: claude-opus-4-7 # On failure: Opus escalateAfterIteration: 3 timeoutMs: 1800000 # 30 minutes reviewer: enabled: true adapter: local # Local model — free, 7s response model: gemma-4-e4b-it # Gemma 4 e4b via LMStudio - escalateModel: claude-sonnet-4-20250514 # Spot check: Sonnet reviews after N revisions + escalateModel: claude-sonnet-4-6 # Spot check: Sonnet reviews after N revisions escalateAfterIteration: 3 # Escalate from 3rd iteration timeoutMs: 60000 # 1 minute (local models are slower) tester: @@ -108,8 +111,8 @@ autonomous: - name: deep-engineering minMinutes: 16 roles: - worker: claude-sonnet-4-20250514 # Sonnet for complex work - reviewer: claude-sonnet-4-20250514 # Sonnet reviews Sonnet + worker: claude-sonnet-4-6 # Sonnet for complex work + reviewer: claude-sonnet-4-6 # Sonnet reviews Sonnet # Long-running task monitoring (RunPod training, batch processing, etc.) # diff --git a/src/core/eventHub.ts b/src/core/eventHub.ts index 27f6522..8053fa1 100644 --- a/src/core/eventHub.ts +++ b/src/core/eventHub.ts @@ -23,7 +23,43 @@ export type HubEvent = | { type: 'task:queued'; data: { taskId: string; title: string; projectPath: string; issueIdentifier?: string } } | { type: 'task:started'; data: { taskId: string; title: string; issueIdentifier?: string } } | { type: 'task:completed'; data: { taskId: string; success: boolean; duration: number } } - | { type: 'pipeline:stage'; data: { taskId: string; stage: string; status: 'start' | 'complete' | 'fail'; model?: string; inputTokens?: number; outputTokens?: number; costUsd?: number } } + | { type: 'pipeline:stage'; data: { + taskId: string; + stage: string; + status: 'start' | 'complete' | 'fail'; + model?: string; + inputTokens?: number; + outputTokens?: number; + costUsd?: number; + durationMs?: number; + // What the agent actually produced — populated for `status: 'complete'`. 
+ summary?: string; + filesChanged?: string[]; + filesChangedCount?: number; + commands?: string[]; + commandsCount?: number; + decision?: 'approve' | 'revise' | 'reject'; + feedback?: string; + issues?: string[]; + issuesCount?: number; + suggestionsCount?: number; + // Tester + passed?: number; + failed?: number; + coverage?: number; + failedTests?: string[]; + // Documenter + changelogEntry?: string; + // Auditor + bsScore?: number; + criticalCount?: number; + warningCount?: number; + // Worker confidence-gate + confidencePercent?: number; + haltReason?: string; + // Errors + error?: string; + } } | { type: 'pipeline:iteration'; data: { taskId: string; iteration: number } } | { type: 'pipeline:escalation'; data: { taskId: string; iteration: number; fromModel?: string; toModel: string } } | { type: 'log'; data: { taskId: string; stage: string; line: string } } diff --git a/src/core/service.ts b/src/core/service.ts index 0416ffb..10164c4 100644 --- a/src/core/service.ts +++ b/src/core/service.ts @@ -52,8 +52,8 @@ export async function startService(config: SwarmConfig): Promise { initLocale(config.language); // Default CLI adapter - setDefaultAdapter(config.adapter ?? 'claude'); - console.log(`🛠️ CLI adapter: ${config.adapter ?? 'claude'}`); + setDefaultAdapter(config.adapter ?? 'codex'); + console.log(`🛠️ CLI adapter: ${config.adapter ?? 
'codex'}`); // Rate limiter initialization console.log('⚡ Initializing rate limiters...'); diff --git a/src/core/types.ts b/src/core/types.ts index b1a9794..5ab8443 100644 --- a/src/core/types.ts +++ b/src/core/types.ts @@ -92,7 +92,7 @@ export type SwarmEvent = { */ export type SwarmConfig = { /** Default CLI adapter */ - adapter?: 'claude' | 'codex' | 'gpt' | 'local' | 'lmstudio'; + adapter?: 'codex' | 'gpt' | 'local' | 'lmstudio' | 'openrouter'; /** UI language: 'en' | 'ko' (default: 'en') */ language: 'en' | 'ko'; /** Discord bot token */ @@ -256,7 +256,7 @@ export type RoleConfig = { /** Whether role is enabled */ enabled: boolean; /** CLI adapter name */ - adapter?: 'claude' | 'codex' | 'gpt' | 'local' | 'lmstudio'; + adapter?: 'codex' | 'gpt' | 'local' | 'lmstudio' | 'openrouter'; /** Model ID */ model: string; /** Timeout (ms), 0 = unlimited */ diff --git a/src/discord/discordCore.ts b/src/discord/discordCore.ts index dd7bd45..86d3c31 100644 --- a/src/discord/discordCore.ts +++ b/src/discord/discordCore.ts @@ -16,7 +16,7 @@ import { import { spawn } from 'node:child_process'; import fs from 'node:fs/promises'; import type { SwarmEvent, AgentStatus } from '../core/types.js'; -import { extractCostFromJson, formatCost } from '../support/costTracker.js'; +import { getAdapter, spawnCli } from '../adapters/index.js'; import * as memory from '../memory/index.js'; import { t, getPrompts, getDateLocale } from '../locale/index.js'; @@ -660,8 +660,8 @@ export async function handleChat(msg: Message): Promise { console.log(`[OpenSwarm] History context: ${channelHistoryMap.get(channelId)?.length ?? 
0} messages`); - // Run Claude CLI - const { result: response, toolCalls } = await runClaude(prompt, { cwd: projectPath || undefined }); + // Run via adapter + const { result: response, toolCalls } = await runWithAdapter(prompt, { cwd: projectPath || undefined }); if (typingInterval) clearInterval(typingInterval); @@ -696,127 +696,31 @@ export async function handleChat(msg: Message): Promise { } } -// Currently running OpenSwarm Claude process -let currentClaudeProcess: ReturnType | null = null; - /** - * Run Claude CLI + * Run via the default adapter (codex / openrouter / lmstudio / local) */ -async function runClaude( +async function runWithAdapter( prompt: string, - options?: { cwd?: string } + options?: { cwd?: string }, ): Promise<{ result: string; toolCalls: string[] }> { - if (currentClaudeProcess) { - console.log('[Claude CLI] Killing previous process...'); - currentClaudeProcess.kill('SIGKILL'); - currentClaudeProcess = null; - } - - const workingDir = options?.cwd || process.cwd(); - - return new Promise((resolve, reject) => { - console.log(`[Claude CLI] Starting in ${workingDir}...`); - const proc = spawn('claude', [ - '-p', prompt, - '--output-format', 'json', - '--permission-mode', 'bypassPermissions', - ], { - shell: false, - cwd: workingDir, - env: process.env, - stdio: ['ignore', 'pipe', 'pipe'], - }); - - currentClaudeProcess = proc; - - let stdout = ''; - let stderr = ''; - - proc.stdout?.on('data', (data) => { stdout += data.toString(); }); - proc.stderr?.on('data', (data) => { stderr += data.toString(); }); - - proc.on('close', (code) => { - currentClaudeProcess = null; - if (code !== 0 && code !== null) { - console.error('[Claude CLI] Error:', stderr.slice(0, 200)); - reject(new Error(`Claude CLI failed with code ${code}`)); - return; - } - resolve(parseClaudeJson(stdout)); - }); - - proc.on('error', (err) => { - currentClaudeProcess = null; - reject(new Error(`Claude CLI spawn error: ${err.message}`)); - }); + const adapter = getAdapter(); + 
const cwd = options?.cwd ?? process.cwd(); + console.log(`[Adapter:${adapter.name}] Starting in ${cwd}...`); + + const raw = await spawnCli(adapter, { + prompt, + cwd, + timeoutMs: 120_000, + maxTurns: 10, }); -} -// Destructive command patterns -const DESTRUCTIVE_PATTERNS = [ - /\brm\s+(-[rf]+\s+)*.*(-[rf]+|--recursive|--force)/i, - /\bgit\s+(reset\s+--hard|clean\s+-[fd])/i, - /\b(drop|truncate)\s+(database|table)/i, - /\bchmod\s+777/i, - /\bdd\s+if=/i, - />\s*\/dev\/sd[a-z]/i, -]; + const workerResult = adapter.parseWorkerOutput(raw); + const toolCalls = workerResult.commands ?? []; + return { result: workerResult.summary ?? raw.stdout.trim(), toolCalls }; +} -/** - * Parse Claude JSON output - */ -function parseClaudeJson(output: string): { result: string; toolCalls: string[] } { - const toolCalls: string[] = []; - // Extract cost - const costInfo = extractCostFromJson(output); - if (costInfo) { - console.log(`[Discord] Claude cost: ${formatCost(costInfo)}`); - } - - try { - const match = output.match(/\[[\s\S]*\]/); - if (!match) return { result: output.trim() || t('common.fallback.noResponse'), toolCalls }; - - const arr = JSON.parse(match[0]); - let result = t('common.fallback.noResponse'); - - for (const item of arr) { - if (item.type === 'tool_use') { - const toolName = item.name || 'unknown'; - let toolSummary = toolName; - - if (toolName === 'Bash' && item.input?.command) { - const cmd = item.input.command.slice(0, 80); - toolSummary = `Bash: \`${cmd}${item.input.command.length > 80 ? '...' 
: ''}\``; - - for (const pattern of DESTRUCTIVE_PATTERNS) { - if (pattern.test(item.input.command)) { - toolSummary = `⛔ BLOCKED: ${cmd}`; - console.warn(`[OpenSwarm] Destructive command detected: ${item.input.command}`); - break; - } - } - } else if (['Read', 'Write', 'Edit'].includes(toolName) && item.input?.file_path) { - const path = item.input.file_path.split('/').slice(-2).join('/'); - toolSummary = `${toolName}: \`${path}\``; - } else if (toolName === 'Grep' && item.input?.pattern) { - toolSummary = `Grep: \`${item.input.pattern}\``; - } - - toolCalls.push(toolSummary); - } - if (item.type === 'result' && item.result) { - result = item.result; - } - } - - return { result, toolCalls }; - } catch { - return { result: output.trim() || t('common.fallback.noResponse'), toolCalls }; - } -} /** * Split message diff --git a/src/support/chat.ts b/src/support/chat.ts index 8423a78..901b258 100644 --- a/src/support/chat.ts +++ b/src/support/chat.ts @@ -30,7 +30,6 @@ type Session = { provider: AdapterName; model: string; messages: Message[]; - claudeSessionId?: string; createdAt: string; updatedAt: string; }; @@ -56,7 +55,6 @@ async function loadSession(id: string): Promise { provider, model: raw.model || getDefaultChatModel(provider), messages: Array.isArray(raw.messages) ? raw.messages : [], - claudeSessionId: raw.claudeSessionId, createdAt: raw.createdAt || new Date().toISOString(), updatedAt: raw.updatedAt || new Date().toISOString(), }; @@ -85,17 +83,12 @@ async function chat(session: Session, userMessage: string): Promise { prompt: userMessage, provider: session.provider, model: session.model, - sessionId: session.provider === 'claude' ? 
session.claudeSessionId : undefined, onText: (text, isThinking) => { if (!isThinking) process.stdout.write(text); }, }); process.stdout.write('\n\n'); - if (session.provider === 'claude' && result.sessionId) { - session.claudeSessionId = result.sessionId; - } - if (result.response) { session.messages.push({ role: 'assistant', content: result.response }); } else { @@ -126,7 +119,6 @@ async function handleCommand( case 'clear': case 'c': session.messages = []; - session.claudeSessionId = undefined; console.log(`${GREEN}Conversation cleared.${RESET}`); return 'handled'; @@ -149,9 +141,8 @@ async function handleCommand( for (const s of sessions.slice(-10)) { const data = await loadSession(s); const msgCount = data?.messages.length ?? 0; - const hasResume = data?.claudeSessionId ? ' (resumable)' : ''; const provider = data?.provider ?? inferProvider(undefined, data?.model); - console.log(` ${CYAN}${s}${RESET} ${msgCount} msgs ${DIM}[${provider}]${RESET}${hasResume}`); + console.log(` ${CYAN}${s}${RESET} ${msgCount} msgs ${DIM}[${provider}]${RESET}`); } } return 'handled'; @@ -170,16 +161,11 @@ async function handleCommand( const next = args[0]; if (!next) { console.log(`${BOLD}Provider:${RESET} ${session.provider}`); - console.log(`${DIM} claude | codex${RESET}`); - return 'handled'; - } - if (next !== 'claude' && next !== 'codex') { - console.log(`${RED}Unknown provider: ${next}${RESET}`); + console.log(`${DIM} codex | openrouter | lmstudio | local | gpt${RESET}`); return 'handled'; } - session.provider = next; - session.model = getDefaultChatModel(next); - session.claudeSessionId = undefined; + session.provider = next as AdapterName; + session.model = getDefaultChatModel(session.provider); console.log(`${GREEN}Provider: ${session.provider}${RESET}`); console.log(`${GREEN}Model: ${session.model}${RESET}`); return 'handled'; @@ -190,17 +176,16 @@ async function handleCommand( if (!newModel) { console.log(`${BOLD}Provider:${RESET} ${session.provider}`); 
console.log(`${BOLD}Model:${RESET} ${session.model}`); - if (session.provider === 'claude') { - console.log(`${DIM} sonnet → claude-sonnet-4-5-20250929${RESET}`); - console.log(`${DIM} haiku → claude-haiku-4-5-20251001${RESET}`); - console.log(`${DIM} opus → claude-opus-4-6${RESET}`); + if (session.provider === 'openrouter') { + console.log(`${DIM} sonnet → anthropic/claude-sonnet-4${RESET}`); + console.log(`${DIM} gemini → google/gemini-2.5-pro${RESET}`); + console.log(`${DIM} gpt-5 → openai/gpt-5${RESET}`); } else { console.log(`${DIM} codex → gpt-5-codex${RESET}`); } return 'handled'; } session.model = resolveChatModel(newModel, session.provider); - session.claudeSessionId = undefined; console.log(`${GREEN}Model: ${session.model}${RESET}`); return 'handled'; } @@ -211,7 +196,6 @@ async function handleCommand( console.log(`${BOLD}Provider:${RESET} ${session.provider}`); console.log(`${BOLD}Model:${RESET} ${session.model}`); console.log(`${BOLD}Messages:${RESET} ${session.messages.length}`); - console.log(`${BOLD}Claude resume:${RESET} ${session.claudeSessionId ? 
'active' : 'none'}`); return 'handled'; case 'help': @@ -222,7 +206,7 @@ ${BOLD}Commands:${RESET} ${CYAN}/clear${RESET} Clear conversation ${CYAN}/save [name]${RESET} Save session ${CYAN}/load [name]${RESET} List/load sessions - ${CYAN}/provider [id]${RESET} Change provider (claude/codex) + ${CYAN}/provider [id]${RESET} Change provider (codex/openrouter/lmstudio/local/gpt) ${CYAN}/model [id]${RESET} Change model ${CYAN}/info${RESET} Session info ${CYAN}/exit${RESET} Exit (Ctrl+D) diff --git a/src/support/chatBackend.ts b/src/support/chatBackend.ts index f6cc6fd..4f95419 100644 --- a/src/support/chatBackend.ts +++ b/src/support/chatBackend.ts @@ -1,10 +1,7 @@ import { spawn } from 'node:child_process'; import { writeFile, unlink } from 'node:fs/promises'; import type { AdapterName } from '../adapters/index.js'; -import { getAdapter } from '../adapters/index.js'; -import { getDefaultAdapterName } from '../adapters/index.js'; -import { extractResultFromStreamJson } from '../agents/cliStreamParser.js'; -import { extractCostFromStreamJson } from './costTracker.js'; +import { getAdapter, getDefaultAdapterName } from '../adapters/index.js'; export interface ChatCompletionOptions { prompt: string; @@ -26,11 +23,6 @@ export interface ChatCompletionResult { } export const CHAT_MODEL_ALIASES: Record> = { - claude: { - sonnet: 'claude-sonnet-4-5-20250929', - haiku: 'claude-haiku-4-5-20251001', - opus: 'claude-opus-4-6', - }, codex: { codex: 'gpt-5-codex', gpt5: 'gpt-5-codex', @@ -60,15 +52,28 @@ export const CHAT_MODEL_ALIASES: Record> = { local: process.env.LMSTUDIO_MODEL ?? 'local-model', lmstudio: process.env.LMSTUDIO_MODEL ?? 'local-model', }, + openrouter: { + // Short aliases — full IDs (e.g. 'anthropic/claude-sonnet-4') pass through unchanged. 
+ sonnet: 'anthropic/claude-sonnet-4', + opus: 'anthropic/claude-opus-4', + haiku: 'anthropic/claude-haiku-4-5', + 'gpt-4o': 'openai/gpt-4o', + 'gpt-5': 'openai/gpt-5', + 'o4-mini': 'openai/o4-mini', + gemini: 'google/gemini-2.5-pro', + kimi: 'moonshotai/kimi-k2', + glm: 'z-ai/glm-4.6', + }, }; export function inferProviderFromModel(model?: string): AdapterName { if (!model) return getDefaultAdapterName(); if (model.includes('codex')) return 'codex'; if (model.startsWith('gpt-') || model.startsWith('o3') || model.startsWith('o4')) return 'gpt'; + if (model.includes('/')) return 'openrouter'; // 로컬 모델 패턴: ollama 태그 형식 (name:tag) 또는 알려진 오픈소스 모델 if (model.includes(':') || /^(gemma|llama|mistral|codestral|qwen|deepseek|phi|starcoder)/i.test(model)) return 'local'; - return 'claude'; + return getDefaultAdapterName(); } export function getDefaultChatModel(provider: AdapterName): string { @@ -76,7 +81,8 @@ export function getDefaultChatModel(provider: AdapterName): string { if (provider === 'gpt') return 'gpt-4o'; if (provider === 'local') return 'gemma3:4b'; if (provider === 'lmstudio') return process.env.LMSTUDIO_MODEL ?? 'local-model'; - return 'claude-sonnet-4-5-20250929'; + if (provider === 'openrouter') return 'openai/gpt-5'; + return 'gpt-5-codex'; } export function resolveChatModel(input: string | undefined, provider: AdapterName): string { @@ -86,9 +92,8 @@ export function resolveChatModel(input: string | undefined, provider: AdapterNam } export function shortenChatModel(model: string): string { - if (model.startsWith('claude-')) { - return model.replace('claude-', '').replace(/-\d{8}$/, ''); - } + // OpenRouter: "anthropic/claude-sonnet-4" → "claude-sonnet-4" + if (model.includes('/')) return model.split('/').pop() ?? 
model; return model; } @@ -140,28 +145,13 @@ export async function runChatCompletion(options: ChatCompletionOptions): Promise if (!line) continue; try { const event = JSON.parse(line); - if (provider === 'claude') { - if (event.session_id && !capturedSessionId) { - capturedSessionId = event.session_id; - } - if (event.type === 'assistant' && event.message?.content) { - for (const block of event.message.content) { - if (block.type === 'text' && block.text) { - startedStreaming = true; - options.onText?.(block.text, false); - resetThinkingTimer(); - } - } - } - } else { - if (event.type === 'item.completed' && event.item?.type === 'agent_message' && typeof event.item.text === 'string') { - startedStreaming = true; - options.onText?.(event.item.text, false); - resetThinkingTimer(); - } - if (event.type === 'item.completed' && event.item?.type === 'reasoning') { - options.onText?.('', true); - } + if (event.type === 'item.completed' && event.item?.type === 'agent_message' && typeof event.item.text === 'string') { + startedStreaming = true; + options.onText?.(event.item.text, false); + resetThinkingTimer(); + } + if (event.type === 'item.completed' && event.item?.type === 'reasoning') { + options.onText?.('', true); } } catch { // Ignore malformed lines. @@ -197,11 +187,9 @@ export async function runChatCompletion(options: ChatCompletionOptions): Promise return; } - const response = provider === 'claude' - ? extractClaudeChatResponse(stdout) - : extractCodexChatResponse(stdout); - const cost = provider === 'claude' ? extractCostFromStreamJson(stdout)?.costUsd : undefined; - const tokens = provider === 'claude' ? 
extractClaudeTokens(stdout) : undefined; + const response = extractCodexChatResponse(stdout); + const cost = undefined; + const tokens = undefined; resolve({ response: response || '[No response]', @@ -228,30 +216,6 @@ export async function runChatCompletion(options: ChatCompletionOptions): Promise } } -function extractClaudeChatResponse(stdout: string): string { - const resultText = extractResultFromStreamJson(stdout); - if (resultText?.trim()) return resultText.trim(); - - const assistantTexts: string[] = []; - for (const line of stdout.split('\n')) { - const trimmed = line.trim(); - if (!trimmed) continue; - try { - const event = JSON.parse(trimmed); - if (event.type === 'assistant' && event.message?.content) { - for (const block of event.message.content) { - if (block.type === 'text' && block.text?.trim()) { - assistantTexts.push(block.text.trim()); - } - } - } - } catch { - // Ignore malformed lines. - } - } - return assistantTexts.join('\n\n').trim(); -} - function extractCodexChatResponse(stdout: string): string { let lastMessage = ''; for (const line of stdout.split('\n')) { @@ -269,18 +233,4 @@ function extractCodexChatResponse(stdout: string): string { return lastMessage; } -function extractClaudeTokens(stdout: string): number | undefined { - for (const line of stdout.split('\n')) { - const trimmed = line.trim(); - if (!trimmed) continue; - try { - const event = JSON.parse(trimmed); - if (event.type === 'result') { - return (event.input_tokens ?? 0) + (event.output_tokens ?? 0); - } - } catch { - // Ignore malformed lines. 
- } - } - return undefined; -} + diff --git a/src/support/chatTui.ts b/src/support/chatTui.ts index 89d7955..0af493c 100644 --- a/src/support/chatTui.ts +++ b/src/support/chatTui.ts @@ -43,7 +43,6 @@ type Session = { provider: AdapterName; model: string; messages: Message[]; - claudeSessionId?: string; totalCost: number; totalTokens: number; createdAt: string; @@ -113,14 +112,12 @@ async function callChatModel( prompt: string, provider: AdapterName, model: string, - sessionId: string | undefined, onStream: (text: string, isThinking: boolean) => void, ): Promise<{ response: string; sessionId: string; cost: number; tokens: number }> { const result = await runChatCompletion({ prompt, provider, model, - sessionId: provider === 'claude' ? sessionId : undefined, timeoutMs: 180000, onText: onStream, }); @@ -584,8 +581,10 @@ async function loadTasksData(box: blessed.Widgets.BoxElement) { const [icon, color] = statusMap[ev.status] || ['○', '#718096']; let model = ''; - if (ev.model?.includes('sonnet-4-5')) model = 'sonnet-4.5'; + if (ev.model?.includes('sonnet-4-6')) model = 'sonnet-4.6'; + else if (ev.model?.includes('sonnet-4-5')) model = 'sonnet-4.5'; else if (ev.model?.includes('haiku-4-5')) model = 'haiku-4.5'; + else if (ev.model?.includes('opus-4-7')) model = 'opus-4.7'; else if (ev.model?.includes('opus-4')) model = 'opus-4'; else if (ev.model) model = ev.model.split('-').pop() || ''; model = model.padEnd(12).slice(0, 12); @@ -825,8 +824,7 @@ async function sendMessage(state: AppState, ui: ReturnType, mes message, state.session.provider, state.session.model, - state.session.claudeSessionId, - (chunk, isThinking) => { + (chunk, isThinking) => { // Handle thinking notification (show/resume spinner) if (isThinking) { if (spinnerStopped) { @@ -877,10 +875,6 @@ async function sendMessage(state: AppState, ui: ReturnType, mes spinnerStopped = true; } - if (state.session.provider === 'claude' && result.sessionId) { - state.session.claudeSessionId = result.sessionId; - } - 
// Update session stats state.session.totalCost += result.cost; state.session.totalTokens += result.tokens; @@ -961,7 +955,6 @@ async function handleCommand( case 'clear': case 'c': state.session.messages = []; - state.session.claudeSessionId = undefined; state.session.totalCost = 0; state.session.totalTokens = 0; ui.chatLog.setContent(''); @@ -979,14 +972,14 @@ async function handleCommand( if (!next) { ui.chatLog.log(` {bold}Current provider:{/bold} {#c084fc-fg}${state.session.provider}{/}`); ui.chatLog.log(' {#718096-fg}Available providers:{/}'); - ui.chatLog.log(' {#a0aec0-fg}claude{/}'); ui.chatLog.log(' {#a0aec0-fg}codex{/}'); - } else if (next !== 'claude' && next !== 'codex') { - ui.chatLog.log(` {#ef4444-fg}Unknown provider: ${next}{/}`); + ui.chatLog.log(' {#a0aec0-fg}openrouter{/}'); + ui.chatLog.log(' {#a0aec0-fg}lmstudio{/}'); + ui.chatLog.log(' {#a0aec0-fg}local{/}'); + ui.chatLog.log(' {#a0aec0-fg}gpt{/}'); } else { - state.session.provider = next; - state.session.model = getDefaultChatModel(next); - state.session.claudeSessionId = undefined; + state.session.provider = next as AdapterName; + state.session.model = getDefaultChatModel(state.session.provider); ui.chatLog.log(` {#34d399-fg}✓ Provider changed to {bold}${next}{/bold}{/}`); ui.chatLog.log(` {#34d399-fg}✓ Model changed to {bold}${state.session.model}{/bold}{/}`); updateStatusBar(state, ui); @@ -1005,16 +998,15 @@ async function handleCommand( ui.chatLog.log(` {bold}Current model:{/bold} {#60a5fa-fg}${shortenChatModel(state.session.model)}{/}`); ui.chatLog.log(''); ui.chatLog.log(' {#718096-fg}Available models:{/}'); - if (state.session.provider === 'claude') { - ui.chatLog.log(' {#a0aec0-fg}sonnet{/} {#718096-fg}→{/} claude-sonnet-4-5'); - ui.chatLog.log(' {#a0aec0-fg}haiku{/} {#718096-fg}→{/} claude-haiku-4-5'); - ui.chatLog.log(' {#a0aec0-fg}opus{/} {#718096-fg}→{/} claude-opus-4-6'); + if (state.session.provider === 'openrouter') { + ui.chatLog.log(' {#a0aec0-fg}sonnet{/} {#718096-fg}→{/} 
anthropic/claude-sonnet-4'); + ui.chatLog.log(' {#a0aec0-fg}gemini{/} {#718096-fg}→{/} google/gemini-2.5-pro'); + ui.chatLog.log(' {#a0aec0-fg}gpt-5{/} {#718096-fg}→{/} openai/gpt-5'); } else { ui.chatLog.log(' {#a0aec0-fg}codex{/} {#718096-fg}→{/} gpt-5-codex'); } } else { state.session.model = resolveChatModel(newModel, state.session.provider); - state.session.claudeSessionId = undefined; const shortName = shortenChatModel(state.session.model); ui.chatLog.log(` {#34d399-fg}✓ Model changed to {bold}${shortName}{/bold}{/}`); updateStatusBar(state, ui); diff --git a/src/support/dashboardHtml.ts b/src/support/dashboardHtml.ts index 1bca788..60f25d8 100644 --- a/src/support/dashboardHtml.ts +++ b/src/support/dashboardHtml.ts @@ -6,37 +6,53 @@ const DASHBOARD_HTML = ` OpenSwarm :: Supervisor