/**
 * Persists the outcome of one `run_local_tests` invocation for `slug`.
 *
 * The stored session gets its `attempts` counter incremented by one, its
 * `lastLocalRunPassed` flag overwritten with `passed`, and a fresh ISO
 * `updatedAt` timestamp. A session whose status is still "started" moves
 * to "attempting"; any other status is carried over unchanged.
 *
 * @param slug - Problem slug whose session should be updated.
 * @param passed - Outcome of the local run being recorded.
 * @returns The session state that was written to the store.
 * @throws Whatever `requireSession` throws when no session exists for `slug`.
 */
async recordLocalRun(slug: string, passed: boolean): Promise<SessionState> {
    const current = await this.requireSession(slug);

    // Only the very first run promotes the lifecycle state.
    let status = current.status;
    if (status === "started") {
        status = "attempting";
    }

    const updated: SessionState = {
        ...current,
        attempts: current.attempts + 1,
        lastLocalRunPassed: passed,
        status,
        updatedAt: new Date().toISOString()
    };
    await this.store.save(updated);
    return updated;
}
+ const runner = new SubprocessRunner(); + // Register MCP prompts for learning mode and workspace guidance registerLearningPrompts(server, leetcodeService); @@ -158,8 +165,9 @@ async function main() { registerContestTools(server, leetcodeService); registerSessionTools(server, leetcodeService, sessions); registerSolutionTools(server, leetcodeService, sessions); + registerRunnerTools(server, leetcodeService, sessions, runner); registerAuthTools(server, leetcodeService); - registerSubmissionTools(server, leetcodeService); + registerSubmissionTools(server, leetcodeService, sessions); registerProblemResources(server, leetcodeService); registerSolutionResources(server, leetcodeService); diff --git a/src/mcp/tools/runner-tools.ts b/src/mcp/tools/runner-tools.ts new file mode 100644 index 0000000..7c7fa33 --- /dev/null +++ b/src/mcp/tools/runner-tools.ts @@ -0,0 +1,188 @@ +import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; +import { z } from "zod"; +import type { SessionService } from "../../domain/session-service.js"; +import { LeetcodeServiceInterface } from "../../leetcode/leetcode-service-interface.js"; +import { + IMPLEMENTED_LANGUAGES, + SUPPORTED_LANGUAGES, + type LocalRunner +} from "../../runner/runner.js"; +import type { RunnerLanguage } from "../../types/index.js"; +import { ErrorCode, LeetCodeError } from "../../types/index.js"; +import { errorEnvelope } from "./session-tools.js"; +import { ToolRegistry } from "./tool-registry.js"; + +/** + * Local-runner tools introduced in Phase 4. + * + * `run_local_tests` is the inner-loop primitive: agent passes code, + * runner spawns a sandboxed subprocess, captures stdout/stderr/exit + * code, and reports back. The session's `lastLocalRunPassed` flag is + * updated as a side effect so `submit_solution`'s strict-mode gate + * (Phase 6) and any future analytics have a stable hook. + * + * v1 deliberately does *not* parse `exampleTestcases` server-side or + * synthesize a per-problem harness. 
/**
 * Registers the local-runner MCP tools: `run_local_tests` and
 * `runner_doctor`.
 *
 * `run_local_tests` hands the caller's code to the injected
 * {@link LocalRunner}, returns the runner's result as JSON text, and
 * records the outcome on the problem's session. The caller is expected to
 * include its own test invocations in the submitted code; nothing here
 * builds a per-problem harness.
 */
export class RunnerToolRegistry extends ToolRegistry {
    constructor(
        server: McpServer,
        leetcodeService: LeetcodeServiceInterface,
        private readonly sessions: SessionService,
        private readonly runner: LocalRunner
    ) {
        super(server, leetcodeService);
    }

    protected registerPublic(): void {
        this.registerRunLocalTests();
        this.registerDoctor();
    }

    /**
     * Enforces `LEETCODE_MCP_REQUIRE_SANDBOX`. When the variable is "1" or
     * "true" and the runner reports no available sandbox, the run is
     * refused *before* any user code is spawned.
     *
     * @throws LeetCodeError with code `SANDBOX_REQUIRED` when a sandbox is
     * demanded but none was detected.
     */
    private async assertSandboxRequirement(): Promise<void> {
        const flag = process.env.LEETCODE_MCP_REQUIRE_SANDBOX;
        if (flag !== "1" && flag !== "true") {
            return;
        }
        const { sandbox } = await this.runner.capabilities();
        if (!sandbox.available) {
            throw new LeetCodeError(
                ErrorCode.SANDBOX_REQUIRED,
                "LEETCODE_MCP_REQUIRE_SANDBOX is set but no OS sandbox (bwrap, firejail, sandbox-exec) was detected on this host; refusing to run code without isolation. Use runner_doctor to inspect what was detected."
            );
        }
    }

    /** Registers `run_local_tests`. */
    private registerRunLocalTests(): void {
        // `SUPPORTED_LANGUAGES` is typed as a readonly array, while
        // `z.enum` wants a non-empty tuple of string literals — hence the
        // cast. The handler narrows back to `RunnerLanguage` below.
        const supportedLiteral = z.enum(
            SUPPORTED_LANGUAGES as unknown as [string, ...string[]]
        );
        this.server.registerTool(
            "run_local_tests",
            {
                description:
                    "Runs the user's code locally in an isolated subprocess, captures stdout / stderr / exit code, and updates the session's lastLocalRunPassed flag. Use this in the inner loop instead of submit_solution — it costs no LeetCode submission and turns around in seconds. The agent is responsible for including test invocations (e.g. `print(Solution().twoSum([2,7,11,15], 9))`) in the code passed in. Phase 4a ships python3; go and java land in Phase 4b/4c.",
                inputSchema: {
                    titleSlug: z
                        .string()
                        .min(1)
                        .describe(
                            "The URL slug of the problem (must match an active session opened with start_problem)."
                        ),
                    language: supportedLiteral.describe(
                        `Language to execute as. Currently runnable: ${IMPLEMENTED_LANGUAGES.join(
                            ", "
                        )}. Other LeetCode languages remain valid for submit_solution.`
                    ),
                    code: z
                        .string()
                        .min(1)
                        .describe(
                            "Complete source code to execute. Should include test invocations that print results / raise on failure."
                        ),
                    timeoutMs: z
                        .number()
                        .int()
                        .min(100)
                        .max(60_000)
                        .optional()
                        .describe(
                            "Optional wall-clock budget in milliseconds. Defaults to 5000."
                        )
                }
            },
            async ({ titleSlug, language, code, timeoutMs }) => {
                try {
                    // Fail fast when no session exists for the slug, so
                    // nothing is executed for a problem that was never
                    // opened.
                    await this.sessions.requireSession(titleSlug);

                    // Honour the opt-in "never run unsandboxed" policy
                    // before spawning anything.
                    await this.assertSandboxRequirement();

                    const result = await this.runner.run({
                        titleSlug,
                        language: language as RunnerLanguage,
                        code,
                        timeoutMs
                    });

                    await this.sessions.recordLocalRun(
                        titleSlug,
                        result.passed
                    );

                    return {
                        content: [
                            {
                                type: "text" as const,
                                text: JSON.stringify({
                                    titleSlug,
                                    language,
                                    result
                                })
                            }
                        ]
                    };
                } catch (error) {
                    return errorEnvelope(
                        "Failed to run local tests",
                        wrapTimeout(error)
                    );
                }
            }
        );
    }

    /** Registers `runner_doctor`, which returns `runner.capabilities()` as JSON text. */
    private registerDoctor(): void {
        this.server.registerTool(
            "runner_doctor",
            {
                description:
                    "Reports which language runtimes (python3, go, java) and OS sandbox tools (bwrap, firejail, sandbox-exec) are detected on this host. Useful for diagnosing 'LANGUAGE_RUNTIME_NOT_FOUND' errors and confirming whether run_local_tests will be sandboxed.",
                inputSchema: {}
            },
            async () => {
                try {
                    const capabilities = await this.runner.capabilities();
                    return {
                        content: [
                            {
                                type: "text" as const,
                                text: JSON.stringify(capabilities)
                            }
                        ]
                    };
                } catch (error) {
                    return errorEnvelope(
                        "Failed to inspect runner capabilities",
                        error
                    );
                }
            }
        );
    }
}
/**
 * Normalises an error caught on the `run_local_tests` path before it is
 * handed to the shared error envelope.
 *
 * - A `LeetCodeError` is returned as-is.
 * - Any other `Error` whose message matches `/timed out/i` is re-wrapped
 *   as a `LeetCodeError` with code `RUNNER_TIMEOUT`, reusing the original
 *   message and passing the original error as the third constructor
 *   argument.
 * - Every other value is returned untouched.
 *
 * @param error - The value caught by the tool handler.
 * @returns The value to forward to `errorEnvelope`.
 */
function wrapTimeout(error: unknown): unknown {
    // Non-Error values and already-typed LeetCode errors pass straight through.
    if (!(error instanceof Error) || error instanceof LeetCodeError) {
        return error;
    }
    return /timed out/i.test(error.message)
        ? new LeetCodeError(ErrorCode.RUNNER_TIMEOUT, error.message, error)
        : error;
}
*/ export class SubmissionToolRegistry extends ToolRegistry { + constructor( + server: McpServer, + leetcodeService: LeetcodeServiceInterface, + private readonly sessions?: SessionService + ) { + super(server, leetcodeService); + } + + private isStrictMode(): boolean { + const value = process.env.LEETCODE_MCP_STRICT_MODE; + return value === "1" || value === "true"; + } + protected registerPublic(): void { // Submission tool this.server.registerTool( "submit_solution", { description: - "Submit a solution to a LeetCode problem and get results. Returns acceptance status, runtime/memory stats, or failed test case details.", + "Submit a solution to a LeetCode problem and get results. Returns acceptance status, runtime/memory stats, or failed test case details. When LEETCODE_MCP_STRICT_MODE=1 is set, requires `run_local_tests` to have last passed for the problem first — saves real LeetCode submissions for solutions that pass examples locally.", inputSchema: { problemSlug: z .string() @@ -51,6 +73,21 @@ export class SubmissionToolRegistry extends ToolRegistry { }, async ({ problemSlug, code, language }) => { try { + if (this.isStrictMode() && this.sessions) { + // The strict gate only fires when the user has + // actually opened a session for this slug. If + // they never called `start_problem`, the + // pre-strict-mode behaviour is preserved (so + // strict mode is non-disruptive for ad-hoc + // calls outside the tutoring flow). + const session = await this.sessions.get(problemSlug); + if (session && session.lastLocalRunPassed !== true) { + throw new LeetCodeError( + ErrorCode.LOCAL_TESTS_NOT_PASSED, + "Strict mode is enabled and the most recent run_local_tests for this problem did not pass. Run it again and submit only when locals are green." 
/**
 * Creates a {@link SubmissionToolRegistry} and registers its tools on the
 * given server.
 *
 * @param server - The MCP server instance to register tools with
 * @param leetcodeService - The LeetCode service implementation to use for API calls
 * @param sessions - Optional session service used for the strict-mode gate
 */
export function registerSubmissionTools(
    server: McpServer,
    leetcodeService: LeetcodeServiceInterface,
    sessions?: SessionService
): void {
    new SubmissionToolRegistry(server, leetcodeService, sessions).register();
}
*/ + run(input: RunInput): Promise; + /** Snapshot of what the runner detected on this host — drives the `doctor` command. */ + capabilities(): Promise; +} + +/** + * Languages the runner currently knows about. Used by the tool layer + * for early validation before spawning anything. + */ +export const SUPPORTED_LANGUAGES: readonly RunnerLanguage[] = [ + "python3", + "go", + "java" +] as const; + +/** + * The languages this build of the runner has *implemented*. Phase 4a + * ships `python3` only. Phase 4b/4c grow this list. + * + * Kept distinct from `SUPPORTED_LANGUAGES` so the wire-level + * `RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE` error has a single source of + * truth: anything in `SUPPORTED_LANGUAGES` but not in this list is a + * "coming soon" language. + */ +export const IMPLEMENTED_LANGUAGES: readonly RunnerLanguage[] = [ + "python3" +] as const; diff --git a/src/runner/sandbox.ts b/src/runner/sandbox.ts new file mode 100644 index 0000000..5fc866c --- /dev/null +++ b/src/runner/sandbox.ts @@ -0,0 +1,181 @@ +/** + * Detect the strongest OS-level sandbox available on this host and turn + * a plain command into a sandbox-wrapped command. + * + * We deliberately ship no JS-level sandbox; the threat model is + * "user-running-their-own-code", not "untrusted multi-tenant input". The + * sandbox reduces blast radius of accidental rm-rf or runaway loops, not + * malicious code escapes. + * + * Priority: + * - Linux: bwrap > firejail > none + * - macOS: sandbox-exec > none + * - Windows: none (native AppContainer wrappers are too platform- + * specific to ship in v1) + * + * If nothing is detected the runner falls back to a plain subprocess and + * surfaces a `warning` in the `RunResult`. Users who want to refuse to + * run without a sandbox can set `LEETCODE_MCP_REQUIRE_SANDBOX=1`; the + * tool layer enforces this — the runner only reports. 
/**
 * Reports whether running `bin` with `args` (default `--version`)
 * completes successfully within 1500 ms.
 *
 * The command goes through the no-shell `execFile`, so neither `bin` nor
 * `args` is interpreted by a shell. Any failure — missing binary, non-zero
 * exit, timeout — yields `false`; nothing is thrown.
 *
 * @param bin - Executable name or path to try.
 * @param args - Arguments passed to the executable.
 * @returns `true` when the command succeeded, otherwise `false`.
 */
async function probe(
    bin: string,
    args: string[] = ["--version"]
): Promise<boolean> {
    let succeeded = false;
    try {
        await execFile(bin, args, { timeout: 1500 });
        succeeded = true;
    } catch {
        // Any failure simply means "not usable"; the reason is not needed.
    }
    return succeeded;
}
/**
 * Resolves `path` to its canonical form. Falls back to the input unchanged
 * when resolution fails, so sandbox wrapping never fails on this step.
 */
async function canonicalizePath(path: string): Promise<string> {
    try {
        const { realpath } = await import("node:fs/promises");
        return await realpath(path);
    } catch {
        return path;
    }
}

/**
 * Wraps a command with the sandbox reported by `detectSandbox()`.
 *
 * @param cmd - Executable to run inside the sandbox.
 * @param args - Arguments for `cmd`.
 * @param cwdAllowed - Directory the wrapped command may write to.
 * @returns The command/argument pair to spawn, plus the sandbox kind that
 * was applied. When no sandbox is detected the input pair is returned
 * untouched with `kind: "none"`.
 */
export async function wrapWithSandbox(
    cmd: string,
    args: string[],
    cwdAllowed: string
): Promise<{ cmd: string; args: string[]; kind: SandboxKind }> {
    const detected = await detectSandbox();

    if (detected.kind === "bwrap") {
        return {
            cmd: "bwrap",
            args: [
                "--ro-bind",
                "/",
                "/",
                "--tmpfs",
                "/tmp",
                // Bound after the tmpfs so the work dir stays visible even
                // when it lives under /tmp.
                "--bind",
                cwdAllowed,
                cwdAllowed,
                "--proc",
                "/proc",
                "--dev",
                "/dev",
                "--unshare-all",
                "--die-with-parent",
                "--",
                cmd,
                ...args
            ],
            kind: "bwrap"
        };
    }

    if (detected.kind === "firejail") {
        return {
            cmd: "firejail",
            args: [
                "--quiet",
                "--noprofile",
                "--net=none",
                "--private-tmp",
                `--whitelist=${cwdAllowed}`,
                "--",
                cmd,
                ...args
            ],
            kind: "firejail"
        };
    }

    if (detected.kind === "sandbox-exec") {
        // macOS temp directories are typically reached through a symlink
        // (/var -> /private/var), and sandbox path filters are matched
        // against the resolved path. Allow both spellings so the write
        // allowance applies whichever one the kernel reports.
        const resolved = await canonicalizePath(cwdAllowed);
        const writableRoots =
            resolved === cwdAllowed ? [cwdAllowed] : [cwdAllowed, resolved];

        // Escape backslashes as well as quotes so a path can never
        // terminate the profile's string literal early.
        const quote = (path: string): string =>
            path.replace(/["\\]/g, "\\$&");

        // Minimal profile: deny by default, allow process primitives and
        // reads everywhere, writes only under the allowed roots.
        const profile = [
            "(version 1)",
            "(deny default)",
            "(allow process-fork)",
            "(allow process-exec)",
            "(allow file-read*)",
            ...writableRoots.map(
                (root) => `(allow file-write* (subpath "${quote(root)}"))`
            ),
            '(allow file-write* (regex #"^/dev/null$"))',
            '(allow file-write* (regex #"^/dev/dtracehelper$"))',
            "(allow sysctl-read)",
            "(allow mach-lookup)",
            "(allow signal (target self))",
            "(allow ipc-posix-shm)",
            "(deny network*)"
        ].join("\n");

        return {
            cmd: detected.path ?? "/usr/bin/sandbox-exec",
            args: ["-p", profile, cmd, ...args],
            kind: "sandbox-exec"
        };
    }

    return { cmd, args, kind: "none" };
}
+ * + * Safety nets every run gets, even with no OS sandbox: + * - per-process wall-clock timeout (default 5_000 ms; configurable + * per `RunInput`) + * - clean env (just PATH / HOME / LANG forwarded — secrets in the + * user's shell never leak in) + * - cwd is a freshly-mkdtemp'd directory under the OS tmp; it is + * removed after the run regardless of outcome + * - stdout/stderr captured with a 1 MB ceiling; runaway output gets + * truncated with a marker rather than blowing memory + */ +import { execFile as execFileCb, spawn } from "node:child_process"; +import { mkdtemp, rm, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { performance } from "node:perf_hooks"; +import { promisify } from "node:util"; + +import type { + RunInput, + RunResult, + RunnerCapabilities, + RunnerLanguage, + SandboxKind +} from "../types/index.js"; +import { ErrorCode, LeetCodeError } from "../types/index.js"; +import logger from "../utils/logger.js"; +import type { LocalRunner } from "./runner.js"; +import { IMPLEMENTED_LANGUAGES, SUPPORTED_LANGUAGES } from "./runner.js"; +import { wrapWithSandbox } from "./sandbox.js"; + +// `execFile` (no shell) — never `promisify(exec)`, which routes through +// `/bin/sh -c` and is a shell-expansion foot-gun if anyone interpolates +// a dynamic value into a probe in the future. +const execFile = promisify(execFileCb); + +const MAX_OUTPUT_BYTES = 1_000_000; // 1 MB per stream +const DEFAULT_TIMEOUT_MS = 5_000; +const TRUNCATION_MARKER = "\n[...output truncated at 1 MB...]"; + +interface LanguageSpec { + /** File extension (without dot) used for the temp source file. */ + extension: string; + /** `[binary, args]` to probe — exit code 0 means available. */ + probe: { cmd: string; args: string[] }; + /** + * Build the spawn args given the path of the source file we wrote + * for this run. Compiled languages (Go, Java) will hook in extra + * compile steps via subclassing later. 
/**
 * Builds a `buildArgs` placeholder for a language whose harness is not
 * wired yet: calling it always throws
 * `RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE` with the given message.
 */
const harnessNotShipped =
    (message: string) =>
    (): never => {
        throw new LeetCodeError(
            ErrorCode.RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE,
            message
        );
    };

/**
 * Per-language registry: source-file extension, the command used to probe
 * for the runtime, and how to build the spawn command for a source file.
 *
 * `go` and `java` are listed so the record stays exhaustive over
 * `RunnerLanguage`; their `buildArgs` throw until a harness exists.
 */
const LANGUAGES: Record<RunnerLanguage, LanguageSpec> = {
    python3: {
        extension: "py",
        probe: { cmd: "python3", args: ["--version"] },
        buildArgs(sourcePath) {
            return { cmd: "python3", args: [sourcePath] };
        }
    },
    go: {
        extension: "go",
        probe: { cmd: "go", args: ["version"] },
        buildArgs: harnessNotShipped("Go runner ships in Phase 4b")
    },
    java: {
        extension: "java",
        probe: { cmd: "java", args: ["-version"] },
        buildArgs: harnessNotShipped("Java runner ships in Phase 4c")
    }
};
/**
 * `LocalRunner` that executes code in a plain child process, wrapped by
 * whatever `wrapWithSandbox` returns for this host.
 */
export class SubprocessRunner implements LocalRunner {
    /**
     * Probes every language in `SUPPORTED_LANGUAGES` and reports the
     * detected sandbox.
     *
     * @returns Per-language availability/version/path plus the sandbox kind.
     */
    async capabilities(): Promise<RunnerCapabilities> {
        const languages = await Promise.all(
            SUPPORTED_LANGUAGES.map(async (language) => {
                const probe = await probeLanguage(language);
                return {
                    language,
                    available: probe.available,
                    version: probe.version,
                    path: probe.path
                };
            })
        );
        // NOTE(review): `./sandbox.js` is already a static dependency of
        // this module (via `wrapWithSandbox`), so this dynamic import is
        // not what prevents an import cycle; it is kept only to leave the
        // file's import block untouched.
        const { detectSandbox } = await import("./sandbox.js");
        const detected = await detectSandbox();
        return {
            languages,
            sandbox: {
                kind: detected.kind,
                available: detected.kind !== "none"
            }
        };
    }

    /**
     * Writes `input.code` to a fresh temp directory, runs it with the
     * language's runtime (sandbox-wrapped when available) and returns the
     * captured result. The temp directory is removed afterwards regardless
     * of outcome.
     *
     * @param input - Language, source code and optional timeout.
     * @returns The run result; failures of the user's code are reported in
     * the result, not thrown.
     * @throws LeetCodeError `RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE` when the
     * language is not in `IMPLEMENTED_LANGUAGES`.
     * @throws LeetCodeError `LANGUAGE_RUNTIME_NOT_FOUND` when the runtime
     * probe fails.
     */
    async run(input: RunInput): Promise<RunResult> {
        if (!IMPLEMENTED_LANGUAGES.includes(input.language)) {
            throw new LeetCodeError(
                ErrorCode.RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE,
                `Local runner has no harness for ${input.language} yet`
            );
        }

        const probe = await probeLanguage(input.language);
        if (!probe.available) {
            throw new LeetCodeError(
                ErrorCode.LANGUAGE_RUNTIME_NOT_FOUND,
                `Required runtime for ${input.language} not found on PATH`
            );
        }

        const spec = LANGUAGES[input.language];
        const timeoutMs = input.timeoutMs ?? DEFAULT_TIMEOUT_MS;
        const workDir = await mkdtemp(join(tmpdir(), "leetcode-mcp-run-"));
        const sourcePath = join(workDir, `solution.${spec.extension}`);

        try {
            await writeFile(sourcePath, input.code, "utf-8");
            const baseArgs = spec.buildArgs(sourcePath);
            const wrapped = await wrapWithSandbox(
                baseArgs.cmd,
                baseArgs.args,
                workDir
            );

            return await this.spawnAndCapture({
                cmd: wrapped.cmd,
                args: wrapped.args,
                cwd: workDir,
                timeoutMs,
                sandbox: wrapped.kind
            });
        } finally {
            // Best-effort cleanup: a failed removal is logged, never thrown.
            await rm(workDir, { recursive: true, force: true }).catch(
                (error) => {
                    logger.debug(
                        { error: (error as Error)?.message, workDir },
                        "Failed to clean up runner workdir"
                    );
                }
            );
        }
    }

    /**
     * Spawns the command, captures at most `MAX_OUTPUT_BYTES` per stream,
     * enforces the wall-clock timeout and resolves exactly once with the
     * result. Never rejects.
     */
    private spawnAndCapture(options: {
        cmd: string;
        args: string[];
        cwd: string;
        timeoutMs: number;
        sandbox: SandboxKind;
    }): Promise<RunResult> {
        return new Promise<RunResult>((resolve) => {
            type Capture = {
                chunks: Buffer[];
                bytes: number;
                truncated: boolean;
            };

            const start = performance.now();
            const child = spawn(options.cmd, options.args, {
                cwd: options.cwd,
                // Clean env: only PATH / HOME / LANG are forwarded.
                env: {
                    PATH: process.env.PATH ?? "",
                    HOME: options.cwd,
                    LANG: process.env.LANG ?? "C.UTF-8"
                },
                stdio: ["ignore", "pipe", "pipe"]
            });

            const stdout: Capture = { chunks: [], bytes: 0, truncated: false };
            const stderr: Capture = { chunks: [], bytes: 0, truncated: false };
            let timedOut = false;
            let settled = false;
            let killTimer: NodeJS.Timeout | undefined;

            // Keeps the buffered total at or below `MAX_OUTPUT_BYTES` by
            // slicing the overflowing chunk to the remaining headroom, and
            // remembers that data was dropped so the marker can be added.
            const capture = (target: Capture, chunk: Buffer): void => {
                const remaining = MAX_OUTPUT_BYTES - target.bytes;
                if (chunk.length > remaining) {
                    target.truncated = true;
                }
                if (remaining <= 0) {
                    return;
                }
                const kept =
                    chunk.length <= remaining
                        ? chunk
                        : chunk.subarray(0, remaining);
                target.chunks.push(kept);
                target.bytes += kept.length;
            };

            // `clampOutput` stays as a final cap; the marker is appended
            // here because the per-chunk cap above means `clampOutput`
            // itself never sees an oversized buffer.
            const render = (target: Capture): string => {
                const text = clampOutput(Buffer.concat(target.chunks));
                return target.truncated ? text + TRUNCATION_MARKER : text;
            };

            child.stdout?.on("data", (chunk: Buffer) => {
                capture(stdout, chunk);
            });
            child.stderr?.on("data", (chunk: Buffer) => {
                capture(stderr, chunk);
            });

            const timer = setTimeout(() => {
                timedOut = true;
                // SIGTERM first; if the child ignores it, hard SIGKILL
                // 500 ms later.
                child.kill("SIGTERM");
                killTimer = setTimeout(() => child.kill("SIGKILL"), 500);
            }, options.timeoutMs);

            // `error` and `close` can both fire for one child; only the
            // first one settles the promise and tears down the timers.
            const settle = (result: RunResult): void => {
                if (settled) {
                    return;
                }
                settled = true;
                clearTimeout(timer);
                if (killTimer) {
                    clearTimeout(killTimer);
                }
                resolve(result);
            };

            child.on("close", (code) => {
                // `code` is null when the process was ended by a signal.
                settle({
                    passed: !timedOut && code === 0,
                    exitCode: code,
                    stdout: render(stdout),
                    stderr: render(stderr),
                    timedOut,
                    durationMs: Math.round(performance.now() - start),
                    sandbox: options.sandbox,
                    warning:
                        options.sandbox === "none"
                            ? "No OS sandbox available on this host; ran without isolation."
                            : undefined
                });
            });

            child.on("error", (error) => {
                logger.warn(
                    { error: error.message, cmd: options.cmd },
                    "Runner subprocess errored before exit"
                );
                settle({
                    passed: false,
                    exitCode: null,
                    stdout: render(stdout),
                    stderr:
                        render(stderr) + `\n[runner error: ${error.message}]`,
                    timedOut: false,
                    durationMs: Math.round(performance.now() - start),
                    sandbox: options.sandbox,
                    warning: undefined
                });
            });
        });
    }
}
+ */ + SANDBOX_REQUIRED: "SANDBOX_REQUIRED", + /** + * Strict mode is enabled (`LEETCODE_MCP_STRICT_MODE=1`) and + * `submit_solution` was called before `run_local_tests` last passed. + * Drives the recommended local-first practice loop. + */ + LOCAL_TESTS_NOT_PASSED: "LOCAL_TESTS_NOT_PASSED" } as const; export type ErrorCodeValue = (typeof ErrorCode)[keyof typeof ErrorCode]; diff --git a/src/types/index.ts b/src/types/index.ts index 7e56d5f..e9f0d85 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -7,6 +7,7 @@ export * from "./credentials.js"; export * from "./errors.js"; export * from "./problem.js"; +export * from "./runner.js"; export * from "./session.js"; export * from "./solution.js"; export * from "./submission.js"; diff --git a/src/types/runner.ts b/src/types/runner.ts new file mode 100644 index 0000000..8f3078c --- /dev/null +++ b/src/types/runner.ts @@ -0,0 +1,94 @@ +/** + * Wire types for the local code runner introduced in Phase 4. + * + * The runner is intentionally simple: callers hand it a string of code + * plus a language tag, and get back a result envelope describing what the + * subprocess did. There is no per-problem harness logic at this layer — + * harnesses live one floor up, in `src/runner/harnesses/*`, and inject + * test scaffolding into the source before it reaches the runner. + */ + +/** + * Languages the local runner knows how to execute. + * + * Phase 4a ships `python3` only; Phase 4b/4c add `go` and `java`. Other + * LeetCode languages remain valid for `submit_solution` but + * `run_local_tests` will reject them with + * `RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE`. + */ +export type RunnerLanguage = "python3" | "go" | "java"; + +/** + * What the runner detected when it tried to spawn an isolated subprocess. 
+ * + * - `none` — plain subprocess, no OS-level sandbox (always available) + * - `bwrap` — Linux: bubblewrap with read-only fs + writable tmp + no net + * - `firejail` — Linux fallback when bwrap isn't installed + * - `sandbox-exec` — macOS: built-in `sandbox-exec` profile + * + * Reported alongside every `RunResult` so callers can show "ran in + * bwrap sandbox" without parsing logs. + */ +export type SandboxKind = "none" | "bwrap" | "firejail" | "sandbox-exec"; + +export interface RunInput { + /** + * LeetCode problem slug. Used by the tool layer to look up the + * active session and update `lastLocalRunPassed`. Not consumed by + * the runner itself. + */ + titleSlug: string; + /** Language to run as. */ + language: RunnerLanguage; + /** + * Source code to execute, exactly as the runner should receive it. + * The harness layer is responsible for any wrapping, scaffolding, or + * test-driver injection before this string is built. + */ + code: string; + /** + * Wall-clock budget in milliseconds. Defaults to 5_000 if omitted. + * The runner kills the subprocess when this elapses and returns + * `timedOut: true` with whatever partial output was captured. + */ + timeoutMs?: number; +} + +export interface RunResult { + /** Convenience flag: `exitCode === 0 && !timedOut`. */ + passed: boolean; + /** Subprocess exit code, or `null` when the process was killed by a signal or emitted an `error` event instead of exiting. */ + exitCode: number | null; + /** Captured stdout, truncated to ~1 MB. */ + stdout: string; + /** Captured stderr, truncated to ~1 MB; a `[runner error: …]` line is appended when the subprocess emits `error`. */ + stderr: string; + /** Whether the wall-clock budget was hit. */ + timedOut: boolean; + /** Wall-clock time the subprocess ran for, in milliseconds. */ + durationMs: number; + /** Which sandbox (if any) was used. See {@link SandboxKind}. */ + sandbox: SandboxKind; + /** + * Human-readable note when something interesting happened that the + * caller should know about — e.g. "no OS sandbox available on this + * host; ran without isolation". Omitted on the happy path.
+ */ + warning?: string; +} + +/** Capability snapshot the `doctor` subcommand renders to the user. */ +export interface RunnerCapabilities { + /** One entry per runner language; `available` reports whether its runtime was detected on PATH. */ + languages: Array<{ + language: RunnerLanguage; + available: boolean; + version?: string; + path?: string; + }>; + /** Sandbox backend reported for this host — a single entry, not a prioritized list. */ + sandbox: { + kind: SandboxKind; + available: boolean; + }; +} diff --git a/tests/domain/session-service.test.ts b/tests/domain/session-service.test.ts new file mode 100644 index 0000000..bea77eb --- /dev/null +++ b/tests/domain/session-service.test.ts @@ -0,0 +1,82 @@ +/** + * Unit tests for SessionService methods that don't already have + * coverage via the e2e/integration suites — primarily the Phase 4 + * additions (`requireSession`, `recordLocalRun`). + */ +import { mkdtemp, rm } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, beforeEach, describe, expect, it } from "vitest"; +import { SessionService } from "../../src/domain/session-service.js"; +import { FileSessionStore } from "../../src/domain/session-store.js"; +import { ErrorCode, isLeetCodeError } from "../../src/types/index.js"; + +describe("SessionService — Phase 4 additions", () => { + let dir: string; + let service: SessionService; + + beforeEach(async () => { + dir = await mkdtemp(join(tmpdir(), "leetcode-mcp-svc-")); + service = new SessionService(new FileSessionStore({ dir })); + }); + + afterEach(async () => { + await rm(dir, { recursive: true, force: true }); + }); + + describe("requireSession", () => { + it("returns the session when present", async () => { + const session = await service.startOrResume({ slug: "two-sum" }); + const fetched = await service.requireSession("two-sum"); + expect(fetched.slug).toBe(session.slug); + }); + + it("throws SESSION_NOT_FOUND when no session exists", async () => { + await expect(async () => { + await 
service.requireSession("never-opened"); + }).rejects.toSatisfy( + (error: unknown) => + isLeetCodeError(error) && + error.code === ErrorCode.SESSION_NOT_FOUND + ); + }); + }); + + describe("recordLocalRun", () => { + it("increments attempts and stores lastLocalRunPassed", async () => { + await service.startOrResume({ slug: "two-sum" }); + + const after1 = await service.recordLocalRun("two-sum", false); + expect(after1.attempts).toBe(1); + expect(after1.lastLocalRunPassed).toBe(false); + expect(after1.status).toBe("attempting"); + + const after2 = await service.recordLocalRun("two-sum", true); + expect(after2.attempts).toBe(2); + expect(after2.lastLocalRunPassed).toBe(true); + // Status should not regress from "attempting". + expect(after2.status).toBe("attempting"); + }); + + it("persists across service instances", async () => { + await service.startOrResume({ slug: "two-sum" }); + await service.recordLocalRun("two-sum", true); + + // Reload from disk via a fresh service. + const reloaded = new SessionService(new FileSessionStore({ dir })); + const session = await reloaded.requireSession("two-sum"); + expect(session.attempts).toBe(1); + expect(session.lastLocalRunPassed).toBe(true); + }); + + it("throws SESSION_NOT_FOUND when no session exists", async () => { + await expect(async () => { + await service.recordLocalRun("never-opened", true); + }).rejects.toSatisfy( + (error: unknown) => + isLeetCodeError(error) && + error.code === ErrorCode.SESSION_NOT_FOUND + ); + }); + }); +}); diff --git a/tests/e2e/lifecycle.test.ts b/tests/e2e/lifecycle.test.ts index a7851d3..aac2cd7 100644 --- a/tests/e2e/lifecycle.test.ts +++ b/tests/e2e/lifecycle.test.ts @@ -51,6 +51,8 @@ describe("e2e: server lifecycle", () => { "list_problem_solutions", "request_hint", "reset_session", + "run_local_tests", + "runner_doctor", "save_leetcode_credentials", "search_problems", "start_leetcode_auth", diff --git a/tests/e2e/runner.test.ts b/tests/e2e/runner.test.ts new file mode 100644 index 
0000000..2c2bb39 --- /dev/null +++ b/tests/e2e/runner.test.ts @@ -0,0 +1,258 @@ +/** + * Local-runner e2e: spawn the real `build/index.js`, drive + * `runner_doctor` and `run_local_tests` over the wire, and assert the + * runner actually executes Python on the host. + * + * Skipped automatically on hosts without `python3` so the suite stays + * portable; the project's CI image has it. + */ +import { execFileSync } from "node:child_process"; +import { afterEach, describe, expect, it } from "vitest"; +import { spawnServer, type SpawnedServer } from "./harness/spawn-server.js"; + +interface ToolTextResult { + content: Array<{ type: string; text: string }>; +} + +const TWO_SUM_PROBLEM = { + questionId: "1", + questionFrontendId: "1", + title: "Two Sum", + titleSlug: "two-sum", + difficulty: "Easy", + isPaidOnly: false, + content: "

Two Sum problem

", + topicTags: [{ name: "Array", slug: "array" }], + codeSnippets: [ + { + lang: "Python3", + langSlug: "python3", + code: "class Solution:\n def twoSum(self, nums, target):\n pass\n" + } + ], + similarQuestions: "[]", + exampleTestcases: "[2,7,11,15]\n9", + hints: [], + stats: '{"totalAccepted":"10M","totalSubmission":"20M","acRate":"50.0%"}' +}; + +const FIXTURE = { + graphql: [ + { + operationContains: "question(titleSlug:", + response: { data: { question: TWO_SUM_PROBLEM } } + } + ] +}; + +function pythonAvailable(): boolean { + try { + execFileSync("python3", ["--version"], { stdio: "ignore" }); + return true; + } catch { + return false; + } +} + +const PYTHON_PRESENT = pythonAvailable(); + +describe.skipIf(!PYTHON_PRESENT)("e2e: local runner (python3)", () => { + let spawned: SpawnedServer | undefined; + + afterEach(async () => { + if (spawned) { + await spawned.cleanup(); + spawned = undefined; + } + }); + + it("runner_doctor reports python3 availability", async () => { + spawned = await spawnServer({ fixture: FIXTURE }); + + const doctor = (await spawned.client.callTool({ + name: "runner_doctor", + arguments: {} + })) as ToolTextResult; + + const payload = JSON.parse(doctor.content[0].text); + expect(payload.languages).toBeDefined(); + const py = payload.languages.find( + (l: { language: string }) => l.language === "python3" + ); + expect(py?.available).toBe(true); + expect(payload.sandbox).toBeDefined(); + }); + + it("rejects run_local_tests when no session is open", async () => { + spawned = await spawnServer({ fixture: FIXTURE }); + + const result = (await spawned.client.callTool({ + name: "run_local_tests", + arguments: { + titleSlug: "two-sum", + language: "python3", + code: "print('ok')" + } + })) as ToolTextResult; + + const payload = JSON.parse(result.content[0].text); + expect(payload.code).toBe("SESSION_NOT_FOUND"); + }); + + it("executes a passing python script and updates the session", async () => { + spawned = await spawnServer({ fixture: 
FIXTURE }); + + await spawned.client.callTool({ + name: "start_problem", + arguments: { titleSlug: "two-sum", language: "python3" } + }); + + const run = (await spawned.client.callTool({ + name: "run_local_tests", + arguments: { + titleSlug: "two-sum", + language: "python3", + code: 'print("hi")\nassert 1 + 1 == 2' + } + })) as ToolTextResult; + + const payload = JSON.parse(run.content[0].text); + expect(payload.titleSlug).toBe("two-sum"); + expect(payload.result.passed).toBe(true); + expect(payload.result.exitCode).toBe(0); + expect(payload.result.timedOut).toBe(false); + expect(payload.result.stdout).toContain("hi"); + + // Session state is observable via get_session_state. + const state = (await spawned.client.callTool({ + name: "get_session_state", + arguments: { titleSlug: "two-sum" } + })) as ToolTextResult; + const sessionPayload = JSON.parse(state.content[0].text); + expect(sessionPayload.session.lastLocalRunPassed).toBe(true); + expect(sessionPayload.session.attempts).toBe(1); + }); + + it("captures non-zero exit code without throwing", async () => { + spawned = await spawnServer({ fixture: FIXTURE }); + + await spawned.client.callTool({ + name: "start_problem", + arguments: { titleSlug: "two-sum", language: "python3" } + }); + + const run = (await spawned.client.callTool({ + name: "run_local_tests", + arguments: { + titleSlug: "two-sum", + language: "python3", + code: "raise SystemExit(2)" + } + })) as ToolTextResult; + + const payload = JSON.parse(run.content[0].text); + expect(payload.result.passed).toBe(false); + expect(payload.result.exitCode).toBe(2); + + const state = (await spawned.client.callTool({ + name: "get_session_state", + arguments: { titleSlug: "two-sum" } + })) as ToolTextResult; + const sessionPayload = JSON.parse(state.content[0].text); + expect(sessionPayload.session.lastLocalRunPassed).toBe(false); + }); + + it("kills runaway processes after the timeout budget", async () => { + spawned = await spawnServer({ fixture: FIXTURE }); + + 
await spawned.client.callTool({ + name: "start_problem", + arguments: { titleSlug: "two-sum", language: "python3" } + }); + + const run = (await spawned.client.callTool({ + name: "run_local_tests", + arguments: { + titleSlug: "two-sum", + language: "python3", + code: "while True: pass", + timeoutMs: 500 + } + })) as ToolTextResult; + + const payload = JSON.parse(run.content[0].text); + expect(payload.result.timedOut).toBe(true); + expect(payload.result.passed).toBe(false); + }); + + it("rejects unimplemented languages with RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE", async () => { + spawned = await spawnServer({ fixture: FIXTURE }); + + await spawned.client.callTool({ + name: "start_problem", + arguments: { titleSlug: "two-sum", language: "go" } + }); + + const run = (await spawned.client.callTool({ + name: "run_local_tests", + arguments: { + titleSlug: "two-sum", + language: "go", + code: "package main\nfunc main() {}" + } + })) as ToolTextResult; + + const payload = JSON.parse(run.content[0].text); + expect(payload.code).toBe("RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE"); + }); + + it("blocks submit_solution under strict mode until run_local_tests passes", async () => { + spawned = await spawnServer({ + fixture: FIXTURE, + env: { LEETCODE_MCP_STRICT_MODE: "1" } + }); + + await spawned.client.callTool({ + name: "start_problem", + arguments: { titleSlug: "two-sum", language: "python3" } + }); + + // First submit attempt: no run_local_tests yet → rejected. + const blocked = (await spawned.client.callTool({ + name: "submit_solution", + arguments: { + problemSlug: "two-sum", + code: "def twoSum(nums, target): pass", + language: "python3" + } + })) as ToolTextResult; + const blockedPayload = JSON.parse(blocked.content[0].text); + expect(blockedPayload.code).toBe("LOCAL_TESTS_NOT_PASSED"); + + // Run locals successfully. 
+ const run = (await spawned.client.callTool({ + name: "run_local_tests", + arguments: { + titleSlug: "two-sum", + language: "python3", + code: 'print("ok")' + } + })) as ToolTextResult; + const runPayload = JSON.parse(run.content[0].text); + expect(runPayload.result.passed).toBe(true); + + // Submit again: strict mode now permits it (the upstream + // request itself will fail via nock — we don't care; the gate + // is what we're locking down here). + const allowed = (await spawned.client.callTool({ + name: "submit_solution", + arguments: { + problemSlug: "two-sum", + code: "def twoSum(nums, target): pass", + language: "python3" + } + })) as ToolTextResult; + const allowedPayload = JSON.parse(allowed.content[0].text); + expect(allowedPayload.code).not.toBe("LOCAL_TESTS_NOT_PASSED"); + }); +}); diff --git a/tests/integration/runner-tools-integration.test.ts b/tests/integration/runner-tools-integration.test.ts new file mode 100644 index 0000000..b563913 --- /dev/null +++ b/tests/integration/runner-tools-integration.test.ts @@ -0,0 +1,251 @@ +/** + * Runner Tools Integration Tests + * + * Drives `run_local_tests` and `runner_doctor` through the MCP wire, + * with a fake `LocalRunner` that records what it was called with so we + * can assert the tool layer's behaviour without depending on `python3` + * being installed where these tests run. 
+ */ +import { mkdtemp, rm } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { SessionService } from "../../src/domain/session-service.js"; +import { FileSessionStore } from "../../src/domain/session-store.js"; +import { registerRunnerTools } from "../../src/mcp/tools/runner-tools.js"; +import type { LocalRunner } from "../../src/runner/runner.js"; +import { + ErrorCode, + LeetCodeError, + type RunInput, + type RunResult, + type RunnerCapabilities +} from "../../src/types/index.js"; +import { createMockLeetCodeService } from "../helpers/mock-leetcode.js"; +import type { TestClientPair } from "../helpers/test-client.js"; +import { createTestClient } from "../helpers/test-client.js"; +import { INTEGRATION_TEST_TIMEOUT, assertions } from "./setup.js"; + +const HAPPY_RESULT: RunResult = { + passed: true, + exitCode: 0, + stdout: "ok\n", + stderr: "", + timedOut: false, + durationMs: 42, + sandbox: "none", + warning: "No OS sandbox available on this host; ran without isolation." +}; + +const FAKE_CAPS: RunnerCapabilities = { + languages: [ + { language: "python3", available: true, version: "Python 3.12.0" }, + { language: "go", available: false }, + { language: "java", available: false } + ], + sandbox: { kind: "none", available: false } +}; + +interface FakeRunnerOptions { + nextResult?: RunResult; + runError?: unknown; +} + +function createFakeRunner(options: FakeRunnerOptions = {}): LocalRunner & { + runs: RunInput[]; +} { + const runs: RunInput[] = []; + return { + runs, + async run(input: RunInput): Promise { + runs.push(input); + if (options.runError) { + throw options.runError; + } + return options.nextResult ?? 
HAPPY_RESULT; + }, + async capabilities(): Promise { + return FAKE_CAPS; + } + }; +} + +describe("Runner Tools Integration", () => { + let testClient: TestClientPair; + let mockService: ReturnType; + let sessions: SessionService; + let sessionDir: string; + let runner: ReturnType; + + beforeEach(async () => { + mockService = createMockLeetCodeService(); + sessionDir = await mkdtemp(join(tmpdir(), "leetcode-mcp-runner-")); + sessions = new SessionService( + new FileSessionStore({ dir: sessionDir }) + ); + runner = createFakeRunner(); + + testClient = await createTestClient({}, (server) => { + registerRunnerTools(server, mockService as any, sessions, runner); + }); + }, INTEGRATION_TEST_TIMEOUT); + + afterEach(async () => { + if (testClient) { + await testClient.cleanup(); + } + await rm(sessionDir, { recursive: true, force: true }); + vi.restoreAllMocks(); + }); + + describe("run_local_tests", () => { + it( + "rejects with SESSION_NOT_FOUND when no session has been opened", + async () => { + const result: any = await testClient.client.callTool({ + name: "run_local_tests", + arguments: { + titleSlug: "two-sum", + language: "python3", + code: "print('hi')" + } + }); + + assertions.hasToolResultStructure(result); + const payload = JSON.parse(result.content[0].text); + expect(payload.code).toBe(ErrorCode.SESSION_NOT_FOUND); + expect(runner.runs).toHaveLength(0); + }, + INTEGRATION_TEST_TIMEOUT + ); + + it( + "delegates to the runner and records lastLocalRunPassed", + async () => { + await sessions.startOrResume({ slug: "two-sum" }); + + const result: any = await testClient.client.callTool({ + name: "run_local_tests", + arguments: { + titleSlug: "two-sum", + language: "python3", + code: 'print("hi")' + } + }); + + assertions.hasToolResultStructure(result); + const payload = JSON.parse(result.content[0].text); + expect(payload.titleSlug).toBe("two-sum"); + expect(payload.result.passed).toBe(true); + expect(runner.runs).toHaveLength(1); + 
expect(runner.runs[0].language).toBe("python3"); + expect(runner.runs[0].code).toBe('print("hi")'); + + const session = await sessions.requireSession("two-sum"); + expect(session.lastLocalRunPassed).toBe(true); + expect(session.attempts).toBe(1); + }, + INTEGRATION_TEST_TIMEOUT + ); + + it( + "records lastLocalRunPassed=false on a failing run", + async () => { + await sessions.startOrResume({ slug: "two-sum" }); + const failing = createFakeRunner({ + nextResult: { ...HAPPY_RESULT, passed: false, exitCode: 1 } + }); + // Re-build the test client with the failing runner. + await testClient.cleanup(); + testClient = await createTestClient({}, (server) => { + registerRunnerTools( + server, + mockService as any, + sessions, + failing + ); + }); + + await testClient.client.callTool({ + name: "run_local_tests", + arguments: { + titleSlug: "two-sum", + language: "python3", + code: "raise SystemExit(1)" + } + }); + + const session = await sessions.requireSession("two-sum"); + expect(session.lastLocalRunPassed).toBe(false); + expect(session.attempts).toBe(1); + }, + INTEGRATION_TEST_TIMEOUT + ); + + it( + "surfaces RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE thrown from the runner", + async () => { + await sessions.startOrResume({ slug: "two-sum" }); + const broken = createFakeRunner({ + runError: new LeetCodeError( + ErrorCode.RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE, + "Go runner ships in Phase 4b" + ) + }); + await testClient.cleanup(); + testClient = await createTestClient({}, (server) => { + registerRunnerTools( + server, + mockService as any, + sessions, + broken + ); + }); + + const result: any = await testClient.client.callTool({ + name: "run_local_tests", + arguments: { + titleSlug: "two-sum", + language: "go", + code: "package main" + } + }); + + assertions.hasToolResultStructure(result); + const payload = JSON.parse(result.content[0].text); + expect(payload.code).toBe( + ErrorCode.RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE + ); + + // The session attempt counter should NOT bump on a 
+ // pre-run rejection. + const session = await sessions.requireSession("two-sum"); + expect(session.attempts).toBe(0); + }, + INTEGRATION_TEST_TIMEOUT + ); + }); + + describe("runner_doctor", () => { + it( + "returns the capabilities snapshot", + async () => { + const result: any = await testClient.client.callTool({ + name: "runner_doctor", + arguments: {} + }); + + assertions.hasToolResultStructure(result); + const payload = JSON.parse(result.content[0].text); + expect(payload.languages).toBeDefined(); + expect(payload.sandbox).toBeDefined(); + expect( + payload.languages.find( + (l: { language: string }) => l.language === "python3" + )?.available + ).toBe(true); + }, + INTEGRATION_TEST_TIMEOUT + ); + }); +}); diff --git a/tests/integration/submission-tools-integration.test.ts b/tests/integration/submission-tools-integration.test.ts index aa60787..720ebf4 100644 --- a/tests/integration/submission-tools-integration.test.ts +++ b/tests/integration/submission-tools-integration.test.ts @@ -2,8 +2,14 @@ * Submission Tools Integration Tests * Tests all submission-related tools through MCP protocol */ +import { mkdtemp, rm } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; import { afterEach, beforeEach, describe, expect, it } from "vitest"; +import { SessionService } from "../../src/domain/session-service.js"; +import { FileSessionStore } from "../../src/domain/session-store.js"; import { registerSubmissionTools } from "../../src/mcp/tools/submission-tools.js"; +import { ErrorCode } from "../../src/types/index.js"; import { createMockAuthenticatedService } from "../helpers/mock-leetcode.js"; import type { TestClientPair } from "../helpers/test-client.js"; import { createTestClient } from "../helpers/test-client.js"; @@ -12,13 +18,19 @@ import { INTEGRATION_TEST_TIMEOUT, assertions } from "./setup.js"; describe("Submission Tools Integration", () => { let testClient: TestClientPair; let mockService: ReturnType; + let 
sessions: SessionService; + let sessionDir: string; beforeEach(async () => { // Use authenticated service since submission requires authentication mockService = createMockAuthenticatedService(); + sessionDir = await mkdtemp(join(tmpdir(), "leetcode-mcp-sub-")); + sessions = new SessionService( + new FileSessionStore({ dir: sessionDir }) + ); testClient = await createTestClient({}, (server) => { - registerSubmissionTools(server, mockService as any); + registerSubmissionTools(server, mockService as any, sessions); }); }, INTEGRATION_TEST_TIMEOUT); @@ -26,6 +38,8 @@ describe("Submission Tools Integration", () => { if (testClient) { await testClient.cleanup(); } + await rm(sessionDir, { recursive: true, force: true }); + delete process.env.LEETCODE_MCP_STRICT_MODE; }); describe("submit_solution", () => { @@ -98,4 +112,100 @@ describe("Submission Tools Integration", () => { INTEGRATION_TEST_TIMEOUT ); }); + + describe("submit_solution — strict mode", () => { + it( + "blocks submission when LEETCODE_MCP_STRICT_MODE=1 and session has not passed locals", + async () => { + process.env.LEETCODE_MCP_STRICT_MODE = "1"; + await sessions.startOrResume({ slug: "two-sum" }); + // No recordLocalRun call → lastLocalRunPassed is null. 
+ + const result: any = await testClient.client.callTool({ + name: "submit_solution", + arguments: { + problemSlug: "two-sum", + code: "def twoSum(nums, target): pass", + language: "python3" + } + }); + + assertions.hasToolResultStructure(result); + const payload = JSON.parse(result.content[0].text as string); + expect(payload.code).toBe(ErrorCode.LOCAL_TESTS_NOT_PASSED); + }, + INTEGRATION_TEST_TIMEOUT + ); + + it( + "permits submission when strict mode is on and locals have passed", + async () => { + process.env.LEETCODE_MCP_STRICT_MODE = "1"; + await sessions.startOrResume({ slug: "two-sum" }); + await sessions.recordLocalRun("two-sum", true); + + const result: any = await testClient.client.callTool({ + name: "submit_solution", + arguments: { + problemSlug: "two-sum", + code: "def twoSum(nums, target): pass", + language: "python3" + } + }); + + assertions.hasToolResultStructure(result); + const payload = JSON.parse(result.content[0].text as string); + // Mock service returns a normal submission envelope — + // we just need to confirm we didn't get the error code. + expect(payload.code).not.toBe(ErrorCode.LOCAL_TESTS_NOT_PASSED); + }, + INTEGRATION_TEST_TIMEOUT + ); + + it( + "permits submission when strict mode is on but no session was opened", + async () => { + process.env.LEETCODE_MCP_STRICT_MODE = "1"; + // Deliberately no startOrResume — strict mode should + // not block ad-hoc submissions outside the tutoring + // flow. 
+ + const result: any = await testClient.client.callTool({ + name: "submit_solution", + arguments: { + problemSlug: "two-sum", + code: "def twoSum(nums, target): pass", + language: "python3" + } + }); + + assertions.hasToolResultStructure(result); + const payload = JSON.parse(result.content[0].text as string); + expect(payload.code).not.toBe(ErrorCode.LOCAL_TESTS_NOT_PASSED); + }, + INTEGRATION_TEST_TIMEOUT + ); + + it( + "does not block by default (LEETCODE_MCP_STRICT_MODE unset)", + async () => { + // No env var; session exists with lastLocalRunPassed === null. + await sessions.startOrResume({ slug: "two-sum" }); + + const result: any = await testClient.client.callTool({ + name: "submit_solution", + arguments: { + problemSlug: "two-sum", + code: "def twoSum(nums, target): pass", + language: "python3" + } + }); + + assertions.hasToolResultStructure(result); + const payload = JSON.parse(result.content[0].text as string); + expect(payload.code).not.toBe(ErrorCode.LOCAL_TESTS_NOT_PASSED); + }, + INTEGRATION_TEST_TIMEOUT + ); + }); }); diff --git a/tests/runner/subprocess-runner.test.ts b/tests/runner/subprocess-runner.test.ts new file mode 100644 index 0000000..27969dc --- /dev/null +++ b/tests/runner/subprocess-runner.test.ts @@ -0,0 +1,161 @@ +/** + * Unit tests for the subprocess runner. + * + * These tests assume `python3` is available on PATH (the project's own + * CI image already has it). No case here is skipped when it is missing: + * the python `run` cases need the real interpreter, and none of them asserts + * `LANGUAGE_RUNTIME_NOT_FOUND`; only the `capabilities` cases tolerate its absence. 
+ */ +import { afterEach, beforeEach, describe, expect, it } from "vitest"; +import { __resetSandboxCacheForTest } from "../../src/runner/sandbox.js"; +import { + SubprocessRunner, + __resetProbeCacheForTest +} from "../../src/runner/subprocess-runner.js"; +import { + ErrorCode, + isLeetCodeError, + type RunnerLanguage +} from "../../src/types/index.js"; + +describe("SubprocessRunner", () => { + let runner: SubprocessRunner; + + beforeEach(() => { + // Force re-probing per test so mutations to PATH (none here, but + // future tests may) don't leak between cases. + __resetProbeCacheForTest(); + __resetSandboxCacheForTest(); + runner = new SubprocessRunner(); + }); + + afterEach(() => { + __resetProbeCacheForTest(); + __resetSandboxCacheForTest(); + }); + + describe("capabilities", () => { + it("reports python3 as a supported language", async () => { + const caps = await runner.capabilities(); + const py = caps.languages.find((l) => l.language === "python3"); + expect(py).toBeDefined(); + // Don't assert availability — environments without python3 + // should still produce a coherent envelope. 
+ expect(typeof py?.available).toBe("boolean"); + }); + + it("reports go and java as supported languages even before they are implemented", async () => { + const caps = await runner.capabilities(); + const langs = caps.languages.map((l) => l.language).sort(); + expect(langs).toEqual(["go", "java", "python3"]); + }); + + it("includes a sandbox descriptor", async () => { + const caps = await runner.capabilities(); + expect(caps.sandbox).toBeDefined(); + expect(["none", "bwrap", "firejail", "sandbox-exec"]).toContain( + caps.sandbox.kind + ); + }); + }); + + describe("run", () => { + it("executes a happy-path python script", async () => { + const result = await runner.run({ + titleSlug: "two-sum", + language: "python3", + code: 'print("hello"); assert 1 + 1 == 2' + }); + + expect(result.passed).toBe(true); + expect(result.exitCode).toBe(0); + expect(result.timedOut).toBe(false); + expect(result.stdout).toContain("hello"); + expect(result.stderr).toBe(""); + expect(result.durationMs).toBeGreaterThanOrEqual(0); + }); + + it("captures non-zero exit code without throwing", async () => { + const result = await runner.run({ + titleSlug: "two-sum", + language: "python3", + code: "raise SystemExit(7)" + }); + + expect(result.passed).toBe(false); + expect(result.exitCode).toBe(7); + expect(result.timedOut).toBe(false); + }); + + it("captures stderr from raised exceptions", async () => { + const result = await runner.run({ + titleSlug: "two-sum", + language: "python3", + code: 'raise ValueError("boom")' + }); + + expect(result.passed).toBe(false); + expect(result.exitCode).not.toBe(0); + expect(result.stderr).toContain("ValueError"); + expect(result.stderr).toContain("boom"); + }); + + it("kills runaway processes after the timeout budget", async () => { + const start = Date.now(); + const result = await runner.run({ + titleSlug: "two-sum", + language: "python3", + code: "while True: pass", + timeoutMs: 400 + }); + const elapsed = Date.now() - start; + + 
expect(result.timedOut).toBe(true); + expect(result.passed).toBe(false); + // Tolerate slow CI: budget + the 500 ms SIGTERM-then-SIGKILL + // grace + scheduler jitter. Should not run for full 5s. + expect(elapsed).toBeLessThan(2_500); + }); + + it("rejects unsupported languages with RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE", async () => { + await expect(async () => { + await runner.run({ + titleSlug: "two-sum", + language: "go" as RunnerLanguage, + code: 'package main\nfunc main() { println("hi") }' + }); + }).rejects.toSatisfy((error: unknown) => { + if (!isLeetCodeError(error)) { + return false; + } + return ( + error.code === ErrorCode.RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE + ); + }); + }); + + it("forwards a clean env (no leaking secrets)", async () => { + // Ask the child to print one of its env vars. We never set + // SECRET_ON_PARENT in the child env, so it should print the + // "MISSING" fallback even if it is defined on the parent. + const before = process.env.SECRET_ON_PARENT; + process.env.SECRET_ON_PARENT = "leak-me"; + try { + const result = await runner.run({ + titleSlug: "two-sum", + language: "python3", + code: 'import os; print(os.environ.get("SECRET_ON_PARENT", "MISSING"))' + }); + + expect(result.passed).toBe(true); + expect(result.stdout.trim()).toBe("MISSING"); + } finally { + if (before === undefined) { + delete process.env.SECRET_ON_PARENT; + } else { + process.env.SECRET_ON_PARENT = before; + } + } + }); + }); +});