From 0d0daae5fbfa07ca67abc2758ed964b149c81fa7 Mon Sep 17 00:00:00 2001
From: Owl <32782746+SPerekrestova@users.noreply.github.com>
Date: Fri, 8 May 2026 08:44:52 +0000
Subject: [PATCH 1/6] Phase 4a: add runner type contracts + error codes

- New types: RunnerLanguage, SandboxKind, RunInput, RunResult, RunnerCapabilities
- New error codes: RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE, LANGUAGE_RUNTIME_NOT_FOUND,
  RUNNER_TIMEOUT, SANDBOX_REQUIRED, LOCAL_TESTS_NOT_PASSED
- Re-export hub now exposes runner.ts
---
 src/types/errors.ts | 32 ++++++++++++++-
 src/types/index.ts  |  1 +
 src/types/runner.ts | 94 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 126 insertions(+), 1 deletion(-)
 create mode 100644 src/types/runner.ts

diff --git a/src/types/errors.ts b/src/types/errors.ts
index 971b640..b3ffad1 100644
--- a/src/types/errors.ts
+++ b/src/types/errors.ts
@@ -38,7 +38,37 @@ export const ErrorCode = {
      * particular problem slug, but no `start_problem` has been called for
      * it (or the session was reset).
      */
-    SESSION_NOT_FOUND: "SESSION_NOT_FOUND"
+    SESSION_NOT_FOUND: "SESSION_NOT_FOUND",
+    /**
+     * `run_local_tests` was asked for a language the local runner has no
+     * harness for. `submit_solution` keeps working for these languages —
+     * the runner is purely additive.
+     */
+    RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE: "RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE",
+    /**
+     * The language is supported in principle but the required runtime
+     * binary (e.g. `python3`, `go`, `java`) was not found on PATH. The
+     * `doctor` subcommand reports which runtimes are detected.
+     */
+    LANGUAGE_RUNTIME_NOT_FOUND: "LANGUAGE_RUNTIME_NOT_FOUND",
+    /**
+     * The user's code exceeded the per-run wall-clock budget. The runner
+     * killed the process; partial output (if any) is included in the
+     * result envelope.
+     */
+    RUNNER_TIMEOUT: "RUNNER_TIMEOUT",
+    /**
+     * `LEETCODE_MCP_REQUIRE_SANDBOX=1` is set but no OS sandbox tool was
+     * found on this host. The runner refuses to fall back to the unsandboxed
+     * subprocess path.
+     */
+    SANDBOX_REQUIRED: "SANDBOX_REQUIRED",
+    /**
+     * Strict mode is enabled (`LEETCODE_MCP_STRICT_MODE=1`) and
+     * `submit_solution` was called before `run_local_tests` last passed.
+     * Drives the recommended local-first practice loop.
+     */
+    LOCAL_TESTS_NOT_PASSED: "LOCAL_TESTS_NOT_PASSED"
 } as const;
 
 export type ErrorCodeValue = (typeof ErrorCode)[keyof typeof ErrorCode];
diff --git a/src/types/index.ts b/src/types/index.ts
index 7e56d5f..e9f0d85 100644
--- a/src/types/index.ts
+++ b/src/types/index.ts
@@ -7,6 +7,7 @@
 export * from "./credentials.js";
 export * from "./errors.js";
 export * from "./problem.js";
+export * from "./runner.js";
 export * from "./session.js";
 export * from "./solution.js";
 export * from "./submission.js";
diff --git a/src/types/runner.ts b/src/types/runner.ts
new file mode 100644
index 0000000..8f3078c
--- /dev/null
+++ b/src/types/runner.ts
@@ -0,0 +1,94 @@
+/**
+ * Wire types for the local code runner introduced in Phase 4.
+ *
+ * The runner is intentionally simple: callers hand it a string of code
+ * plus a language tag, and get back a result envelope describing what the
+ * subprocess did. There is no per-problem harness logic at this layer —
+ * harnesses live one floor up, in `src/runner/harnesses/*`, and inject
+ * test scaffolding into the source before it reaches the runner.
+ */
+
+/**
+ * Language tags accepted by the local runner's `RunInput.language`.
+ *
+ * Phase 4a executes `python3` only; `go` and `java` are reserved for
+ * Phase 4b/4c and are currently rejected with
+ * `RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE`. Languages outside this union
+ * stay valid for `submit_solution` but cannot be run locally.
+ */
+export type RunnerLanguage = "python3" | "go" | "java";
+
+/**
+ * Which OS-level sandbox wrapper (if any) a run was executed under.
+ *
+ * - `none`     — plain subprocess, no OS-level sandbox (always available)
+ * - `bwrap`    — Linux: bubblewrap with read-only fs + writable tmp + no net
+ * - `firejail` — Linux fallback when bwrap isn't installed
+ * - `sandbox-exec` — macOS: built-in `sandbox-exec` profile
+ *
+ * Carried on every `RunResult` (and in `RunnerCapabilities`) so callers
+ * can show "ran in bwrap sandbox" without parsing logs.
+ */
+export type SandboxKind = "none" | "bwrap" | "firejail" | "sandbox-exec";
+
+/** Arguments for one `LocalRunner.run` invocation. */
+export interface RunInput {
+    /**
+     * LeetCode problem slug. The tool layer uses it to find the active
+     * session and update `lastLocalRunPassed`; the runner ignores it.
+     */
+    titleSlug: string;
+    /** Language to run as. */
+    language: RunnerLanguage;
+    /**
+     * Source code to execute, exactly as the runner should receive it.
+     * The harness layer is responsible for any wrapping, scaffolding, or
+     * test-driver injection before this string is built.
+     */
+    code: string;
+    /**
+     * Wall-clock budget in milliseconds. Defaults to 5_000 if omitted.
+     * The runner kills the subprocess when this elapses and returns
+     * `timedOut: true` with whatever partial output was captured.
+     */
+    timeoutMs?: number;
+}
+
+/** Outcome envelope of one `LocalRunner.run` invocation. */
+export interface RunResult {
+    /** Convenience flag: `exitCode === 0 && !timedOut`. */
+    passed: boolean;
+    /** Subprocess exit code; `null` when it was killed or never started. */
+    exitCode: number | null;
+    /** Captured stdout, truncated to ~1 MB. */
+    stdout: string;
+    /** Captured stderr, truncated to ~1 MB. */
+    stderr: string;
+    /** Whether the wall-clock budget was hit. */
+    timedOut: boolean;
+    /** Wall-clock time the subprocess ran for, in milliseconds. */
+    durationMs: number;
+    /** Which sandbox (if any) was used. See {@link SandboxKind}. */
+    sandbox: SandboxKind;
+    /**
+     * Human-readable caveat — e.g. "no OS sandbox available on this
+     * host; ran without isolation". Omitted on the happy path.
+     */
+    warning?: string;
+}
+
+/** Capability snapshot the `doctor` subcommand renders to the user. */
+export interface RunnerCapabilities {
+    /** One entry per runner language; `available` is its probe result. */
+    languages: Array<{
+        language: RunnerLanguage;
+        available: boolean;
+        version?: string;
+        path?: string;
+    }>;
+    /** The one sandbox detected on this host (`kind: "none"` means unavailable). */
+    sandbox: {
+        kind: SandboxKind;
+        available: boolean;
+    };
+}

From 383667e16a4388cbecb5261e85778fd775c4b624 Mon Sep 17 00:00:00 2001
From: Owl <32782746+SPerekrestova@users.noreply.github.com>
Date: Fri, 8 May 2026 08:45:08 +0000
Subject: [PATCH 2/6] Phase 4a: SubprocessRunner + sandbox auto-detection

Subprocess-based local runner with PATH probe-and-cache:
- LocalRunner interface, SUPPORTED_LANGUAGES, IMPLEMENTED_LANGUAGES (Phase 4a ships
  python3; go and java land in 4b/4c)
- SubprocessRunner: lazy probe, clean env (PATH/HOME/LANG only), temp cwd,
  5s default wall-clock timeout with SIGTERM-then-SIGKILL escalation,
  1 MB output ceiling per stream
- Sandbox detection: bwrap > firejail > sandbox-exec > none, applied
  transparently. RunResult.warning surfaces when no sandbox is found.
- Test helpers __resetProbeCacheForTest / __resetSandboxCacheForTest

No Docker, no extra runtime deps.
---
 src/runner/runner.ts            |  45 +++++
 src/runner/sandbox.ts           | 174 +++++++++++++++++
 src/runner/subprocess-runner.ts | 334 ++++++++++++++++++++++++++++++++
 3 files changed, 553 insertions(+)
 create mode 100644 src/runner/runner.ts
 create mode 100644 src/runner/sandbox.ts
 create mode 100644 src/runner/subprocess-runner.ts

diff --git a/src/runner/runner.ts b/src/runner/runner.ts
new file mode 100644
index 0000000..addb32d
--- /dev/null
+++ b/src/runner/runner.ts
@@ -0,0 +1,45 @@
+/**
+ * The local runner contract — implemented by `SubprocessRunner` for
+ * production and easily faked in tests.
+ *
+ * Tools should depend on this interface, never on the concrete
+ * implementation. Phase 4d will add an alternative implementation that
+ * delegates to a stronger sandbox; Phase 5 will compose this with the
+ * workspace abstraction.
+ */
+import type {
+    RunInput,
+    RunResult,
+    RunnerCapabilities,
+    RunnerLanguage
+} from "../types/index.js";
+
+export interface LocalRunner {
+    /** Runs the user's code; user-code failures are reported in the envelope, runner-level ones (e.g. missing runtime) may throw. */
+    run(input: RunInput): Promise<RunResult>;
+    /** Snapshot of what the runner detected on this host — drives the `doctor` command. */
+    capabilities(): Promise<RunnerCapabilities>;
+}
+
+/**
+ * Every language tag the runner recognises, implemented or not.
+ * Intended for early validation in the tool layer, before any spawn.
+ */
+export const SUPPORTED_LANGUAGES: ReadonlyArray<RunnerLanguage> = [
+    "python3",
+    "go",
+    "java"
+];
+
+/**
+ * Subset of `SUPPORTED_LANGUAGES` this build can actually execute:
+ * only `python3` in Phase 4a, with Phase 4b/4c extending the list.
+ *
+ * It stays separate from `SUPPORTED_LANGUAGES` so that the wire-level
+ * `RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE` error has one source of truth:
+ * a language listed there but absent here is a "coming soon"
+ * language.
+ */
+export const IMPLEMENTED_LANGUAGES: ReadonlyArray<RunnerLanguage> = [
+    "python3"
+];
diff --git a/src/runner/sandbox.ts b/src/runner/sandbox.ts
new file mode 100644
index 0000000..0e9049b
--- /dev/null
+++ b/src/runner/sandbox.ts
@@ -0,0 +1,174 @@
+/**
+ * Detect the strongest OS-level sandbox available on this host and turn
+ * a plain command into a sandbox-wrapped command.
+ *
+ * We deliberately ship no JS-level sandbox; the threat model is
+ * "user-running-their-own-code", not "untrusted multi-tenant input". The
+ * sandbox reduces blast radius of accidental rm-rf or runaway loops, not
+ * malicious code escapes.
+ *
+ * Priority:
+ *   - Linux: bwrap > firejail > none
+ *   - macOS: sandbox-exec > none
+ *   - Windows: none (native AppContainer wrappers are too platform-
+ *              specific to ship in v1)
+ *
+ * If nothing is detected the runner falls back to a plain subprocess and
+ * surfaces a `warning` in the `RunResult`. Users who want to refuse to
+ * run without a sandbox can set `LEETCODE_MCP_REQUIRE_SANDBOX=1`; the
+ * tool layer enforces this — the runner only reports.
+ */
+import { exec as execCb } from "node:child_process";
+import { promisify } from "node:util";
+
+import type { SandboxKind } from "../types/index.js";
+
+const execFile = promisify(execCb);
+
+interface DetectedSandbox {
+    kind: SandboxKind;
+    /** Absolute path of the wrapper binary when detection knows it;
+     *  undefined for `none` and for PATH-resolved `bwrap`/`firejail`. */
+    path?: string;
+}
+
+let cached: DetectedSandbox | undefined;
+
+/**
+ * True when `<cmd> <args…>`, run via the shell, exits 0 within 1.5 s.
+ */
+async function probe(
+    cmd: string,
+    args: string[] = ["--version"]
+): Promise<boolean> {
+    let succeeded = true;
+    try {
+        await execFile(`${cmd} ${args.join(" ")}`, { timeout: 1500 });
+    } catch {
+        succeeded = false;
+    }
+    return succeeded;
+}
+
+/**
+ * Probe the host once per server lifetime. Subsequent calls return the
+ * cached result; tests can use `__resetSandboxCacheForTest` to force
+ * re-detection.
+ */
+export async function detectSandbox(): Promise<DetectedSandbox> {
+    if (cached) {
+        return cached;
+    }
+
+    const platform = process.platform;
+    if (platform === "darwin") {
+        // sandbox-exec has no `--version`; `-help` prints usage and may exit
+        // non-zero. Only shell 126/127 or a probe timeout means "absent".
+        let present = true;
+        try {
+            await execFile("/usr/bin/sandbox-exec -help", { timeout: 1500 });
+        } catch (error) {
+            const { code, killed } = error as { code?: unknown; killed?: boolean };
+            present = !killed && code !== 126 && code !== 127;
+        }
+        if (present) {
+            cached = { kind: "sandbox-exec", path: "/usr/bin/sandbox-exec" };
+            return cached;
+        }
+    } else if (platform === "linux") {
+        for (const kind of ["bwrap", "firejail"] as const) {
+            if (await probe(kind)) {
+                cached = { kind };
+                return cached;
+            }
+        }
+    }
+    cached = { kind: "none" };
+    return cached;
+}
+
+/**
+ * Wrap an existing command with the detected sandbox. Returns the new
+ * `[cmd, args]` pair plus the kind that was applied. When no sandbox is
+ * available, returns the input pair untouched and `kind: "none"`.
+ *
+ * `cwdAllowed` is the temp directory user code may write to: bwrap
+ * binds it read-write over a read-only `/`, firejail whitelists it, and
+ * the macOS profile allows reads everywhere but writes only beneath it.
+ */
+export async function wrapWithSandbox(
+    cmd: string,
+    args: string[],
+    cwdAllowed: string
+): Promise<{ cmd: string; args: string[]; kind: SandboxKind }> {
+    const detected = await detectSandbox();
+    if (detected.kind === "bwrap") {
+        return {
+            cmd: "bwrap",
+            args: [
+                "--ro-bind",
+                "/",
+                "/",
+                "--tmpfs",
+                "/tmp",
+                "--bind",
+                cwdAllowed,
+                cwdAllowed,
+                "--proc",
+                "/proc",
+                "--dev",
+                "/dev",
+                "--unshare-all",
+                "--die-with-parent",
+                "--",
+                cmd,
+                ...args
+            ],
+            kind: "bwrap"
+        };
+    }
+    if (detected.kind === "firejail") {
+        return {
+            cmd: "firejail",
+            args: [
+                "--quiet",
+                "--noprofile",
+                "--net=none",
+                "--private-tmp",
+                `--whitelist=${cwdAllowed}`,
+                "--",
+                cmd,
+                ...args
+            ],
+            kind: "firejail"
+        };
+    }
+    if (detected.kind === "sandbox-exec") {
+        // Deny-by-default profile: process primitives and reads are allowed,
+        // writes only under cwdAllowed (only `"` is escaped in that path).
+        const profile = `(version 1)
+(deny default)
+(allow process-fork)
+(allow process-exec)
+(allow file-read*)
+(allow file-write* (subpath "${cwdAllowed.replace(/"/g, '\\"')}"))
+(allow file-write* (regex #"^/dev/null$"))
+(allow file-write* (regex #"^/dev/dtracehelper$"))
+(allow sysctl-read)
+(allow mach-lookup)
+(allow signal (target self))
+(allow ipc-posix-shm)
+(deny network*)`;
+        return {
+            cmd: "/usr/bin/sandbox-exec",
+            args: ["-p", profile, cmd, ...args],
+            kind: "sandbox-exec"
+        };
+    }
+    return { cmd, args, kind: "none" };
+}
+
+/** Test helper — drops the cached detection so the next `detectSandbox()` call probes again. */
+export function __resetSandboxCacheForTest(): void {
+    cached = undefined;
+}
diff --git a/src/runner/subprocess-runner.ts b/src/runner/subprocess-runner.ts
new file mode 100644
index 0000000..08685f9
--- /dev/null
+++ b/src/runner/subprocess-runner.ts
@@ -0,0 +1,334 @@
+/**
+ * Plain-subprocess `LocalRunner` implementation.
+ *
+ * Per-language registry (currently `python3`) describes how to:
+ *   - probe whether the runtime is available on PATH
+ *   - spawn the runtime against a source file written to the run's
+ *     temp dir
+ *
+ * Probes run lazily on the first `run()` for the language and the
+ * results are cached for the lifetime of the process.
+ *
+ * Safety nets every run gets, even with no OS sandbox:
+ *   - per-process wall-clock timeout (default 5_000 ms; configurable
+ *     per `RunInput`)
+ *   - clean env (only PATH and LANG forwarded, HOME set to the run's
+ *     temp dir — secrets in the user's shell never leak in)
+ *   - cwd is a freshly-mkdtemp'd directory under the OS tmp; it is
+ *     removed after the run regardless of outcome
+ *   - stdout/stderr captured with a 1 MB ceiling; runaway output gets
+ *     truncated with a marker rather than blowing memory
+ */
+import { exec as execCb, spawn } from "node:child_process";
+import { mkdtemp, rm, writeFile } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { performance } from "node:perf_hooks";
+import { promisify } from "node:util";
+
+import type {
+    RunInput,
+    RunResult,
+    RunnerCapabilities,
+    RunnerLanguage,
+    SandboxKind
+} from "../types/index.js";
+import { ErrorCode, LeetCodeError } from "../types/index.js";
+import logger from "../utils/logger.js";
+import type { LocalRunner } from "./runner.js";
+import { IMPLEMENTED_LANGUAGES, SUPPORTED_LANGUAGES } from "./runner.js";
+import { wrapWithSandbox } from "./sandbox.js";
+
+const execFile = promisify(execCb);
+
+const MAX_OUTPUT_BYTES = 1_000_000; // 1 MB per stream
+const DEFAULT_TIMEOUT_MS = 5_000;
+const TRUNCATION_MARKER = "\n[...output truncated at 1 MB...]";
+
+interface LanguageSpec {
+    /** File extension (without dot) used for the temp source file. */
+    extension: string;
+    /** Command + args to probe — exit code 0 means available. */
+    probe: { cmd: string; args: string[] };
+    /**
+     * Build the spawn command for the source file written for this
+     * run; not-yet-implemented entries throw instead. Compiled languages
+     * (Go, Java) will hook in extra compile steps via subclassing later.
+     */
+    buildArgs(sourcePath: string): { cmd: string; args: string[] };
+}
+
+const LANGUAGES: Record<RunnerLanguage, LanguageSpec> = {
+    python3: {
+        extension: "py",
+        probe: { cmd: "python3", args: ["--version"] },
+        buildArgs: (sourcePath) => ({
+            cmd: "python3",
+            args: [sourcePath]
+        })
+    },
+    // Phase 4b/4c stubs — present in the registry so the type system
+    // requires they stay in sync with `RunnerLanguage`. `run()` rejects
+    // them up front; `buildArgs` throwing is the second line of defence.
+    go: {
+        extension: "go",
+        probe: { cmd: "go", args: ["version"] },
+        buildArgs: () => {
+            throw new LeetCodeError(
+                ErrorCode.RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE,
+                "Go runner ships in Phase 4b"
+            );
+        }
+    },
+    java: {
+        extension: "java",
+        probe: { cmd: "java", args: ["-version"] },
+        buildArgs: () => {
+            throw new LeetCodeError(
+                ErrorCode.RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE,
+                "Java runner ships in Phase 4c"
+            );
+        }
+    }
+};
+
+interface ProbeResult {
+    available: boolean; // the probe command exited 0
+    version?: string; // first line of the probe's stdout (or stderr)
+    path?: string; // output of `which <cmd>`, when that succeeded
+}
+
+const probeCache = new Map<RunnerLanguage, ProbeResult>();
+
+/** Probes the runtime for `language` once and caches the outcome. */
+async function probeLanguage(language: RunnerLanguage): Promise<ProbeResult> {
+    const known = probeCache.get(language);
+    if (known) {
+        return known;
+    }
+    const spec = LANGUAGES[language];
+    let outcome: ProbeResult;
+    try {
+        const { cmd, args } = spec.probe;
+        const probed = await execFile(`${cmd} ${args.join(" ")}`, {
+            timeout: 2000
+        });
+        // `python3 --version` and `go version` print to stdout; `java
+        // -version` historically prints to stderr — take whichever is set.
+        const banner = probed.stdout || probed.stderr || "";
+        outcome = {
+            available: true,
+            version: banner.split("\n")[0]?.trim() || undefined
+        };
+        try {
+            const located = await execFile(`which ${cmd}`, {
+                timeout: 1000
+            });
+            outcome.path = located.stdout.trim() || undefined;
+        } catch {
+            /* `which` may not exist (Windows); leave `path` undefined */
+        }
+    } catch (error) {
+        // Non-zero exit, timeout, or a missing binary all land here.
+        outcome = { available: false };
+        probeCache.set(language, outcome);
+        logger.debug(
+            { language, error: (error as Error)?.message },
+            "Language probe failed"
+        );
+        return outcome;
+    }
+    probeCache.set(language, outcome);
+    return outcome;
+}
+
+/** Test helper — empties the probe cache so the next probe of each language re-runs. */
+export function __resetProbeCacheForTest(): void {
+    probeCache.clear();
+}
+
+/** UTF-8 decode capped at MAX_OUTPUT_BYTES; the cut never splits a character. */
+function clampOutput(buf: Buffer): string {
+    let end = Math.min(buf.length, MAX_OUTPUT_BYTES);
+    while (end > 0 && end < buf.length && (buf.readUInt8(end) & 0xc0) === 0x80) {
+        end -= 1; // truncating: step back over UTF-8 continuation bytes
+    }
+    return buf.subarray(0, end).toString("utf-8") + (end < buf.length ? TRUNCATION_MARKER : "");
+}
+
+export class SubprocessRunner implements LocalRunner {
+    async capabilities(): Promise<RunnerCapabilities> {
+        const languages = await Promise.all(
+            SUPPORTED_LANGUAGES.map(async (language) => {
+                const probe = await probeLanguage(language);
+                return {
+                    language,
+                    available: probe.available,
+                    version: probe.version,
+                    path: probe.path
+                };
+            })
+        );
+        // NOTE(review): this file already imports `wrapWithSandbox` from
+        // `./sandbox.js` statically, so this lazy import avoids no cycle.
+        const { detectSandbox } = await import("./sandbox.js");
+        const detected = await detectSandbox();
+        return {
+            languages,
+            sandbox: {
+                kind: detected.kind,
+                available: detected.kind !== "none"
+            }
+        };
+    }
+
+    async run(input: RunInput): Promise<RunResult> {
+        if (!IMPLEMENTED_LANGUAGES.includes(input.language)) {
+            throw new LeetCodeError(
+                ErrorCode.RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE,
+                `Local runner has no harness for ${input.language} yet`
+            );
+        }
+
+        const probe = await probeLanguage(input.language);
+        if (!probe.available) {
+            throw new LeetCodeError(
+                ErrorCode.LANGUAGE_RUNTIME_NOT_FOUND,
+                `Required runtime for ${input.language} not found on PATH`
+            );
+        }
+
+        const spec = LANGUAGES[input.language];
+        const timeoutMs = input.timeoutMs ?? DEFAULT_TIMEOUT_MS;
+        const workDir = await mkdtemp(join(tmpdir(), "leetcode-mcp-run-"));
+        const sourcePath = join(workDir, `solution.${spec.extension}`);
+
+        try {
+            await writeFile(sourcePath, input.code, "utf-8");
+            const baseArgs = spec.buildArgs(sourcePath);
+            const wrapped = await wrapWithSandbox(
+                baseArgs.cmd,
+                baseArgs.args,
+                workDir
+            );
+
+            return await this.spawnAndCapture({
+                cmd: wrapped.cmd,
+                args: wrapped.args,
+                cwd: workDir,
+                timeoutMs,
+                sandbox: wrapped.kind
+            });
+        } finally {
+            await rm(workDir, { recursive: true, force: true }).catch(
+                (error) => {
+                    logger.debug(
+                        { error: (error as Error)?.message, workDir },
+                        "Failed to clean up runner workdir"
+                    );
+                }
+            );
+        }
+    }
+
+    private spawnAndCapture(options: {
+        cmd: string;
+        args: string[];
+        cwd: string;
+        timeoutMs: number;
+        sandbox: SandboxKind;
+    }): Promise<RunResult> {
+        return new Promise((resolve) => {
+            const start = performance.now();
+            const child = spawn(options.cmd, options.args, {
+                cwd: options.cwd,
+                env: {
+                    PATH: process.env.PATH ?? "",
+                    HOME: options.cwd, // the temp workdir, not the user's real home
+                    LANG: process.env.LANG ?? "C.UTF-8"
+                },
+                stdio: ["ignore", "pipe", "pipe"] // no stdin; stdout/stderr are captured
+            });
+
+            const stdout: Buffer[] = [];
+            const stderr: Buffer[] = [];
+            let stdoutBytes = 0;
+            let stderrBytes = 0;
+            let timedOut = false;
+            let killTimer: NodeJS.Timeout | undefined;
+
+            child.stdout?.on("data", (chunk: Buffer) => {
+                if (stdoutBytes < MAX_OUTPUT_BYTES) { // stop buffering once the cap is reached
+                    stdout.push(chunk);
+                    stdoutBytes += chunk.length;
+                }
+            });
+            child.stderr?.on("data", (chunk: Buffer) => {
+                if (stderrBytes < MAX_OUTPUT_BYTES) { // same cap, tracked per stream
+                    stderr.push(chunk);
+                    stderrBytes += chunk.length;
+                }
+            });
+
+            const timer = setTimeout(() => {
+                timedOut = true;
+                // SIGTERM first; if the child has not closed 500 ms later,
+                // follow up with SIGKILL. Belt + braces for runaway loops.
+                child.kill("SIGTERM");
+                killTimer = setTimeout(() => child.kill("SIGKILL"), 500);
+            }, options.timeoutMs);
+
+            const finalize = (exitCode: number | null): void => {
+                clearTimeout(timer);
+                if (killTimer) {
+                    clearTimeout(killTimer);
+                }
+                const durationMs = Math.round(performance.now() - start);
+                const passed = !timedOut && exitCode === 0;
+                resolve({
+                    passed,
+                    exitCode,
+                    stdout: clampOutput(Buffer.concat(stdout)),
+                    stderr: clampOutput(Buffer.concat(stderr)),
+                    timedOut,
+                    durationMs,
+                    sandbox: options.sandbox,
+                    warning:
+                        options.sandbox === "none"
+                            ? "No OS sandbox available on this host; ran without isolation."
+                            : undefined
+                });
+            };
+
+            child.on("close", (code, signal) => {
+                if (signal && code === null) { // terminated by a signal: no exit code
+                    finalize(null);
+                } else {
+                    finalize(code);
+                }
+            });
+            child.on("error", (error) => { // e.g. the command could not be spawned
+                logger.warn(
+                    { error: error.message, cmd: options.cmd },
+                    "Runner subprocess errored before exit"
+                );
+                clearTimeout(timer);
+                if (killTimer) {
+                    clearTimeout(killTimer);
+                }
+                resolve({
+                    passed: false,
+                    exitCode: null,
+                    stdout: clampOutput(Buffer.concat(stdout)),
+                    stderr:
+                        clampOutput(Buffer.concat(stderr)) +
+                        `\n[runner error: ${error.message}]`,
+                    timedOut: false,
+                    durationMs: Math.round(performance.now() - start),
+                    sandbox: options.sandbox,
+                    warning: undefined
+                });
+            });
+        });
+    }
+}

From 9e067ef521d83a52e355e00d12035fab7e99161d Mon Sep 17 00:00:00 2001
From: Owl <32782746+SPerekrestova@users.noreply.github.com>
Date: Fri, 8 May 2026 08:45:20 +0000
Subject: [PATCH 3/6] Phase 4a: SessionService gains requireSession +
 recordLocalRun

- requireSession(slug) is now public so the runner-tools layer can
  reuse the same SESSION_NOT_FOUND envelope as the rest of the
  pedagogy gate.
- recordLocalRun(slug, passed) bumps attempts, sets lastLocalRunPassed,
  and promotes status from 'started' to 'attempting' on first run.
---
 src/domain/session-service.ts | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/src/domain/session-service.ts b/src/domain/session-service.ts
index 7c37477..037120f 100644
--- a/src/domain/session-service.ts
+++ b/src/domain/session-service.ts
@@ -132,7 +132,13 @@ export class SessionService {
         );
     }
 
-    private async requireSession(slug: string): Promise<SessionState> {
+    /**
+     * Public variant of the session lookup — throws `SESSION_NOT_FOUND`
+     * when the user never opened the slug. Used by the runner-tools
+     * layer to keep `run_local_tests` aligned with the pedagogy state
+     * machine (no orphaned runs).
+     */
+    async requireSession(slug: string): Promise<SessionState> {
         const session = await this.store.load(slug);
         if (!session) {
             throw new LeetCodeError(
@@ -142,4 +148,24 @@ export class SessionService {
         }
         return session;
     }
+
+    /**
+     * Records the outcome of a `run_local_tests` invocation on the
+     * session for `slug`: `attempts` goes up by one, `lastLocalRunPassed`
+     * takes `passed`, and a "started" session becomes "attempting"
+     * (any other status is kept). Persists and returns the new state.
+     */
+    async recordLocalRun(slug: string, passed: boolean): Promise<SessionState> {
+        const current = await this.requireSession(slug);
+        const promoted = current.status === "started";
+        const updated: SessionState = {
+            ...current,
+            attempts: current.attempts + 1,
+            lastLocalRunPassed: passed,
+            status: promoted ? "attempting" : current.status,
+            updatedAt: new Date().toISOString()
+        };
+        await this.store.save(updated);
+        return updated;
+    }
 }

From 9ac5bfa7e7bdb37e60789c7e237dc24f5837221e Mon Sep 17 00:00:00 2001
From: Owl <32782746+SPerekrestova@users.noreply.github.com>
Date: Fri, 8 May 2026 08:45:32 +0000
Subject: [PATCH 4/6] Phase 4a: run_local_tests + runner_doctor tools,
 strict-mode submission gate

- New tool 'run_local_tests' (titleSlug, language, code, timeoutMs?):
  requires an active session, delegates to LocalRunner, records
  attempts + lastLocalRunPassed on the session.
- New tool 'runner_doctor' (no args): reports detected runtime
  versions and sandbox kind. Drives the README troubleshooting flow
  for LANGUAGE_RUNTIME_NOT_FOUND.
- LEETCODE_MCP_STRICT_MODE=1 gates submit_solution: when set, refuses
  to submit unless the session's lastLocalRunPassed === true. Off by
  default; non-disruptive when no session exists for the slug.
- index.ts wires SubprocessRunner into the server and threads the
  session service through the submission registry.
---
 src/index.ts                      |  10 +-
 src/mcp/tools/runner-tools.ts     | 188 ++++++++++++++++++++++++++++++
 src/mcp/tools/submission-tools.ts |  65 ++++++++---
 3 files changed, 246 insertions(+), 17 deletions(-)
 create mode 100644 src/mcp/tools/runner-tools.ts

diff --git a/src/index.ts b/src/index.ts
index b9c316d..4ba550b 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -19,10 +19,12 @@ import { registerAuthTools } from "./mcp/tools/auth-tools.js";
 import { registerContestTools } from "./mcp/tools/contest-tools.js";
 import { registerOnboardingTools } from "./mcp/tools/onboarding-tools.js";
 import { registerProblemTools } from "./mcp/tools/problem-tools.js";
+import { registerRunnerTools } from "./mcp/tools/runner-tools.js";
 import { registerSessionTools } from "./mcp/tools/session-tools.js";
 import { registerSolutionTools } from "./mcp/tools/solution-tools.js";
 import { registerSubmissionTools } from "./mcp/tools/submission-tools.js";
 import { registerUserTools } from "./mcp/tools/user-tools.js";
+import { SubprocessRunner } from "./runner/subprocess-runner.js";
 import logger from "./utils/logger.js";
 
 /**
@@ -145,6 +147,11 @@ async function main() {
     // returning content.
     const sessions = new SessionService();
 
+    // Local subprocess runner: probes python3 / go / java on first use,
+    // wraps with bwrap / firejail / sandbox-exec where available, and
+    // backs the `run_local_tests` tool. Phase 4a ships python3 only.
+    const runner = new SubprocessRunner();
+
     // Register MCP prompts for learning mode and workspace guidance
     registerLearningPrompts(server, leetcodeService);
 
@@ -158,8 +165,9 @@ async function main() {
     registerContestTools(server, leetcodeService);
     registerSessionTools(server, leetcodeService, sessions);
     registerSolutionTools(server, leetcodeService, sessions);
+    registerRunnerTools(server, leetcodeService, sessions, runner);
     registerAuthTools(server, leetcodeService);
-    registerSubmissionTools(server, leetcodeService);
+    registerSubmissionTools(server, leetcodeService, sessions);
 
     registerProblemResources(server, leetcodeService);
     registerSolutionResources(server, leetcodeService);
diff --git a/src/mcp/tools/runner-tools.ts b/src/mcp/tools/runner-tools.ts
new file mode 100644
index 0000000..7c7fa33
--- /dev/null
+++ b/src/mcp/tools/runner-tools.ts
@@ -0,0 +1,188 @@
+import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+import { z } from "zod";
+import type { SessionService } from "../../domain/session-service.js";
+import { LeetcodeServiceInterface } from "../../leetcode/leetcode-service-interface.js";
+import {
+    IMPLEMENTED_LANGUAGES,
+    SUPPORTED_LANGUAGES,
+    type LocalRunner
+} from "../../runner/runner.js";
+import type { RunnerLanguage } from "../../types/index.js";
+import { ErrorCode, LeetCodeError } from "../../types/index.js";
+import { errorEnvelope } from "./session-tools.js";
+import { ToolRegistry } from "./tool-registry.js";
+
+/**
+ * Local-runner tools introduced in Phase 4.
+ *
+ * `run_local_tests` is the inner-loop primitive: agent passes code,
+ * runner spawns a sandboxed subprocess, captures stdout/stderr/exit
+ * code, and reports back. The session's `lastLocalRunPassed` flag is
+ * updated as a side effect so `submit_solution`'s strict-mode gate
+ * (`LEETCODE_MCP_STRICT_MODE`) and any future analytics have a stable hook.
+ *
+ * v1 deliberately does *not* parse `exampleTestcases` server-side or
+ * synthesize a per-problem harness. The agent — which already has the
+ * problem in context after `start_problem` — is responsible for adding
+ * test invocations to the code it submits to the runner. That keeps
+ * the wire surface tiny, language-agnostic, and free of LeetCode-
+ * specific signature parsing.
+ */
+export class RunnerToolRegistry extends ToolRegistry {
+    constructor(
+        server: McpServer,
+        leetcodeService: LeetcodeServiceInterface,
+        private readonly sessions: SessionService,
+        private readonly runner: LocalRunner
+    ) {
+        super(server, leetcodeService);
+    }
+
+    protected registerPublic(): void {
+        this.registerRunLocalTests();
+        this.registerDoctor();
+    }
+
+    private registerRunLocalTests(): void {
+        const supportedLiteral = z.enum(
+            SUPPORTED_LANGUAGES as unknown as [string, ...string[]]
+        );
+        this.server.registerTool(
+            "run_local_tests",
+            {
+                description:
+                    "Runs the user's code locally in an isolated subprocess, captures stdout / stderr / exit code, and updates the session's lastLocalRunPassed flag. Use this in the inner loop instead of submit_solution — it costs no LeetCode submission and turns around in seconds. The agent is responsible for including test invocations (e.g. `print(Solution().twoSum([2,7,11,15], 9))`) in the code passed in. Phase 4a ships python3; go and java land in Phase 4b/4c.",
+                inputSchema: {
+                    titleSlug: z
+                        .string()
+                        .min(1)
+                        .describe(
+                            "The URL slug of the problem (must match an active session opened with start_problem)."
+                        ),
+                    language: supportedLiteral.describe(
+                        `Language to execute as. Currently runnable: ${IMPLEMENTED_LANGUAGES.join(
+                            ", "
+                        )}. Other LeetCode languages remain valid for submit_solution.`
+                    ),
+                    code: z
+                        .string()
+                        .min(1)
+                        .describe(
+                            "Complete source code to execute. Should include test invocations that print results / raise on failure."
+                        ),
+                    timeoutMs: z
+                        .number()
+                        .int()
+                        .min(100)
+                        .max(60_000)
+                        .optional()
+                        .describe(
+                            "Optional wall-clock budget in milliseconds. Defaults to 5000."
+                        )
+                }
+            },
+            async ({ titleSlug, language, code, timeoutMs }) => {
+                try {
+                    // Require a session — keeps the runner aligned with
+                    // the pedagogy state machine (and gives us a sane
+                    // place to record `attempts` / `lastLocalRunPassed`).
+                    await this.sessions.requireSession(titleSlug);
+
+                    const result = await this.runner.run({
+                        titleSlug,
+                        language: language as RunnerLanguage,
+                        code,
+                        timeoutMs
+                    });
+
+                    await this.sessions.recordLocalRun(
+                        titleSlug,
+                        result.passed
+                    );
+
+                    return {
+                        content: [
+                            {
+                                type: "text" as const,
+                                text: JSON.stringify({
+                                    titleSlug,
+                                    language,
+                                    result
+                                })
+                            }
+                        ]
+                    };
+                } catch (error) {
+                    return errorEnvelope(
+                        "Failed to run local tests",
+                        wrapTimeout(error)
+                    );
+                }
+            }
+        );
+    }
+
+    private registerDoctor(): void {
+        this.server.registerTool(
+            "runner_doctor",
+            {
+                description:
+                    "Reports which language runtimes (python3, go, java) and OS sandbox tools (bwrap, firejail, sandbox-exec) are detected on this host. Useful for diagnosing 'LANGUAGE_RUNTIME_NOT_FOUND' errors and confirming whether run_local_tests will be sandboxed.",
+                inputSchema: {}
+            },
+            async () => {
+                try {
+                    const capabilities = await this.runner.capabilities();
+                    return {
+                        content: [
+                            {
+                                type: "text" as const,
+                                text: JSON.stringify(capabilities)
+                            }
+                        ]
+                    };
+                } catch (error) {
+                    return errorEnvelope(
+                        "Failed to inspect runner capabilities",
+                        error
+                    );
+                }
+            }
+        );
+    }
+}
+
+/**
+ * Timeouts normally surface as a `RunResult` with `timedOut: true`; a stray
+ * `Error` whose message matches /timed out/i is wrapped as `RUNNER_TIMEOUT`.
+ * `LeetCodeError`s from `run` (runtime-not-found, language-not-implemented)
+ * pass through; the rest is normalised to `UPSTREAM_ERROR` by the envelope.
+ */
+function wrapTimeout(error: unknown): unknown {
+    if (error instanceof LeetCodeError) {
+        return error;
+    }
+    if (error instanceof Error && /timed out/i.test(error.message)) {
+        return new LeetCodeError(
+            ErrorCode.RUNNER_TIMEOUT,
+            error.message,
+            error
+        );
+    }
+    return error;
+}
+
+export function registerRunnerTools(
+    server: McpServer,
+    leetcodeService: LeetcodeServiceInterface,
+    sessions: SessionService,
+    runner: LocalRunner
+): void {
+    const registry = new RunnerToolRegistry(
+        server,
+        leetcodeService,
+        sessions,
+        runner
+    );
+    registry.register();
+}
diff --git a/src/mcp/tools/submission-tools.ts b/src/mcp/tools/submission-tools.ts
index 04ea8cf..a5b8635 100644
--- a/src/mcp/tools/submission-tools.ts
+++ b/src/mcp/tools/submission-tools.ts
@@ -1,19 +1,41 @@
 import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
 import { z } from "zod";
+import type { SessionService } from "../../domain/session-service.js";
 import { LeetcodeServiceInterface } from "../../leetcode/leetcode-service-interface.js";
+import { ErrorCode, LeetCodeError } from "../../types/index.js";
+import { errorEnvelope } from "./session-tools.js";
 import { ToolRegistry } from "./tool-registry.js";
 
 /**
  * Submission tool registry class that handles registration of LeetCode submission tools.
+ *
+ * Phase 4 wires the strict-mode gate (`LEETCODE_MCP_STRICT_MODE=1`):
+ * when enabled, `submit_solution` refuses to spend a real LeetCode
+ * submission unless the active session's `lastLocalRunPassed === true`.
+ * Default is *off* (preserves current behaviour); session is optional
+ * so existing flows without `start_problem` aren't broken.
  */
 export class SubmissionToolRegistry extends ToolRegistry {
+    constructor(
+        server: McpServer,
+        leetcodeService: LeetcodeServiceInterface,
+        private readonly sessions?: SessionService
+    ) {
+        super(server, leetcodeService);
+    }
+
+    private isStrictMode(): boolean {
+        const value = process.env.LEETCODE_MCP_STRICT_MODE;
+        return value === "1" || value === "true";
+    }
+
     protected registerPublic(): void {
         // Submission tool
         this.server.registerTool(
             "submit_solution",
             {
                 description:
-                    "Submit a solution to a LeetCode problem and get results. Returns acceptance status, runtime/memory stats, or failed test case details.",
+                    "Submit a solution to a LeetCode problem and get results. Returns acceptance status, runtime/memory stats, or failed test case details. When LEETCODE_MCP_STRICT_MODE=1 is set, requires `run_local_tests` to have last passed for the problem first — saves real LeetCode submissions for solutions that pass examples locally.",
                 inputSchema: {
                     problemSlug: z
                         .string()
@@ -51,6 +73,21 @@ export class SubmissionToolRegistry extends ToolRegistry {
             },
             async ({ problemSlug, code, language }) => {
                 try {
+                    if (this.isStrictMode() && this.sessions) {
+                        // The strict gate only fires when the user has
+                        // actually opened a session for this slug. If
+                        // they never called `start_problem`, the
+                        // pre-strict-mode behaviour is preserved (so
+                        // strict mode is non-disruptive for ad-hoc
+                        // calls outside the tutoring flow).
+                        const session = await this.sessions.get(problemSlug);
+                        if (session && session.lastLocalRunPassed !== true) {
+                            throw new LeetCodeError(
+                                ErrorCode.LOCAL_TESTS_NOT_PASSED,
+                                "Strict mode is enabled and the most recent run_local_tests for this problem did not pass. Run it again and submit only when locals are green."
+                            );
+                        }
+                    }
                     const result = await this.leetcodeService.submitSolution(
                         problemSlug,
                         code,
@@ -59,23 +96,13 @@ export class SubmissionToolRegistry extends ToolRegistry {
                     return {
                         content: [
                             {
-                                type: "text",
+                                type: "text" as const,
                                 text: JSON.stringify(result, null, 2)
                             }
                         ]
                     };
-                } catch (error: any) {
-                    return {
-                        content: [
-                            {
-                                type: "text",
-                                text: JSON.stringify({
-                                    error: "Failed to submit solution",
-                                    message: error.message
-                                })
-                            }
-                        ]
-                    };
+                } catch (error) {
+                    return errorEnvelope("Failed to submit solution", error);
                 }
             }
         );
@@ -87,11 +114,17 @@ export class SubmissionToolRegistry extends ToolRegistry {
  *
  * @param server - The MCP server instance to register tools with
  * @param leetcodeService - The LeetCode service implementation to use for API calls
+ * @param sessions - Optional session service used for the strict-mode gate
  */
 export function registerSubmissionTools(
     server: McpServer,
-    leetcodeService: LeetcodeServiceInterface
+    leetcodeService: LeetcodeServiceInterface,
+    sessions?: SessionService
 ): void {
-    const registry = new SubmissionToolRegistry(server, leetcodeService);
+    const registry = new SubmissionToolRegistry(
+        server,
+        leetcodeService,
+        sessions
+    );
     registry.register();
 }

From 4cb974b2c2a30ae01114fe156dc9c170f6ceb1b7 Mon Sep 17 00:00:00 2001
From: Owl <32782746+SPerekrestova@users.noreply.github.com>
Date: Fri, 8 May 2026 08:45:49 +0000
Subject: [PATCH 5/6] Phase 4a: tests for runner, session-service,
 runner-tools, strict mode

- tests/runner/subprocess-runner.test.ts (9): happy path, exit codes,
  stderr capture, timeout (SIGTERM+SIGKILL), language-not-implemented,
  clean env (no parent secrets leak)
- tests/domain/session-service.test.ts (5): requireSession +
  recordLocalRun unit coverage
- tests/integration/runner-tools-integration.test.ts (5): SESSION_NOT_FOUND
  pre-run, happy delegate, attempts bump on failure, error surface for
  RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE, doctor capability snapshot
- tests/integration/submission-tools-integration.test.ts (+4):
  strict-mode gate (blocks/permits/no-session-passthrough/default-off)
- tests/e2e/runner.test.ts (7): doctor over the wire, no-session,
  pass, fail, timeout, unimplemented language, end-to-end strict-mode
  gate; skipped automatically on hosts without python3
- tests/e2e/lifecycle.test.ts: extend tool list to include run_local_tests
  and runner_doctor (22 -> 24)

Totals: 178 -> 201 unit/integration; 9 -> 16 e2e.
---
 tests/domain/session-service.test.ts          |  82 ++++++
 tests/e2e/lifecycle.test.ts                   |   2 +
 tests/e2e/runner.test.ts                      | 258 ++++++++++++++++++
 .../runner-tools-integration.test.ts          | 251 +++++++++++++++++
 .../submission-tools-integration.test.ts      | 112 +++++++-
 tests/runner/subprocess-runner.test.ts        | 161 +++++++++++
 6 files changed, 865 insertions(+), 1 deletion(-)
 create mode 100644 tests/domain/session-service.test.ts
 create mode 100644 tests/e2e/runner.test.ts
 create mode 100644 tests/integration/runner-tools-integration.test.ts
 create mode 100644 tests/runner/subprocess-runner.test.ts

diff --git a/tests/domain/session-service.test.ts b/tests/domain/session-service.test.ts
new file mode 100644
index 0000000..bea77eb
--- /dev/null
+++ b/tests/domain/session-service.test.ts
@@ -0,0 +1,82 @@
+/**
+ * Unit tests for SessionService methods that don't already have
+ * coverage via the e2e/integration suites — primarily the Phase 4
+ * additions (`requireSession`, `recordLocalRun`).
+ */
+import { mkdtemp, rm } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { afterEach, beforeEach, describe, expect, it } from "vitest";
+import { SessionService } from "../../src/domain/session-service.js";
+import { FileSessionStore } from "../../src/domain/session-store.js";
+import { ErrorCode, isLeetCodeError } from "../../src/types/index.js";
+
+describe("SessionService — Phase 4 additions", () => {
+    let dir: string;
+    let service: SessionService;
+
+    beforeEach(async () => {
+        dir = await mkdtemp(join(tmpdir(), "leetcode-mcp-svc-"));
+        service = new SessionService(new FileSessionStore({ dir }));
+    });
+
+    afterEach(async () => {
+        await rm(dir, { recursive: true, force: true });
+    });
+
+    describe("requireSession", () => {
+        it("returns the session when present", async () => {
+            const session = await service.startOrResume({ slug: "two-sum" });
+            const fetched = await service.requireSession("two-sum");
+            expect(fetched.slug).toBe(session.slug);
+        });
+
+        it("throws SESSION_NOT_FOUND when no session exists", async () => {
+            await expect(async () => {
+                await service.requireSession("never-opened");
+            }).rejects.toSatisfy(
+                (error: unknown) =>
+                    isLeetCodeError(error) &&
+                    error.code === ErrorCode.SESSION_NOT_FOUND
+            );
+        });
+    });
+
+    describe("recordLocalRun", () => {
+        it("increments attempts and stores lastLocalRunPassed", async () => {
+            await service.startOrResume({ slug: "two-sum" });
+
+            const after1 = await service.recordLocalRun("two-sum", false);
+            expect(after1.attempts).toBe(1);
+            expect(after1.lastLocalRunPassed).toBe(false);
+            expect(after1.status).toBe("attempting");
+
+            const after2 = await service.recordLocalRun("two-sum", true);
+            expect(after2.attempts).toBe(2);
+            expect(after2.lastLocalRunPassed).toBe(true);
+            // Status should not regress from "attempting".
+            expect(after2.status).toBe("attempting");
+        });
+
+        it("persists across service instances", async () => {
+            await service.startOrResume({ slug: "two-sum" });
+            await service.recordLocalRun("two-sum", true);
+
+            // Reload from disk via a fresh service.
+            const reloaded = new SessionService(new FileSessionStore({ dir }));
+            const session = await reloaded.requireSession("two-sum");
+            expect(session.attempts).toBe(1);
+            expect(session.lastLocalRunPassed).toBe(true);
+        });
+
+        it("throws SESSION_NOT_FOUND when no session exists", async () => {
+            await expect(async () => {
+                await service.recordLocalRun("never-opened", true);
+            }).rejects.toSatisfy(
+                (error: unknown) =>
+                    isLeetCodeError(error) &&
+                    error.code === ErrorCode.SESSION_NOT_FOUND
+            );
+        });
+    });
+});
diff --git a/tests/e2e/lifecycle.test.ts b/tests/e2e/lifecycle.test.ts
index a7851d3..aac2cd7 100644
--- a/tests/e2e/lifecycle.test.ts
+++ b/tests/e2e/lifecycle.test.ts
@@ -51,6 +51,8 @@ describe("e2e: server lifecycle", () => {
             "list_problem_solutions",
             "request_hint",
             "reset_session",
+            "run_local_tests",
+            "runner_doctor",
             "save_leetcode_credentials",
             "search_problems",
             "start_leetcode_auth",
diff --git a/tests/e2e/runner.test.ts b/tests/e2e/runner.test.ts
new file mode 100644
index 0000000..2c2bb39
--- /dev/null
+++ b/tests/e2e/runner.test.ts
@@ -0,0 +1,258 @@
+/**
+ * Local-runner e2e: spawn the real `build/index.js`, drive
+ * `runner_doctor` and `run_local_tests` over the wire, and assert the
+ * runner actually executes Python on the host.
+ *
+ * Skipped automatically on hosts without `python3` so the suite stays
+ * portable; the project's CI image has it.
+ */
+import { execFileSync } from "node:child_process";
+import { afterEach, describe, expect, it } from "vitest";
+import { spawnServer, type SpawnedServer } from "./harness/spawn-server.js";
+
+interface ToolTextResult {
+    content: Array<{ type: string; text: string }>;
+}
+
+const TWO_SUM_PROBLEM = {
+    questionId: "1",
+    questionFrontendId: "1",
+    title: "Two Sum",
+    titleSlug: "two-sum",
+    difficulty: "Easy",
+    isPaidOnly: false,
+    content: "<p>Two Sum problem</p>",
+    topicTags: [{ name: "Array", slug: "array" }],
+    codeSnippets: [
+        {
+            lang: "Python3",
+            langSlug: "python3",
+            code: "class Solution:\n    def twoSum(self, nums, target):\n        pass\n"
+        }
+    ],
+    similarQuestions: "[]",
+    exampleTestcases: "[2,7,11,15]\n9",
+    hints: [],
+    stats: '{"totalAccepted":"10M","totalSubmission":"20M","acRate":"50.0%"}'
+};
+
+const FIXTURE = {
+    graphql: [
+        {
+            operationContains: "question(titleSlug:",
+            response: { data: { question: TWO_SUM_PROBLEM } }
+        }
+    ]
+};
+
+function pythonAvailable(): boolean {
+    try {
+        execFileSync("python3", ["--version"], { stdio: "ignore" });
+        return true;
+    } catch {
+        return false;
+    }
+}
+
+const PYTHON_PRESENT = pythonAvailable();
+
+describe.skipIf(!PYTHON_PRESENT)("e2e: local runner (python3)", () => {
+    let spawned: SpawnedServer | undefined;
+
+    afterEach(async () => {
+        if (spawned) {
+            await spawned.cleanup();
+            spawned = undefined;
+        }
+    });
+
+    it("runner_doctor reports python3 availability", async () => {
+        spawned = await spawnServer({ fixture: FIXTURE });
+
+        const doctor = (await spawned.client.callTool({
+            name: "runner_doctor",
+            arguments: {}
+        })) as ToolTextResult;
+
+        const payload = JSON.parse(doctor.content[0].text);
+        expect(payload.languages).toBeDefined();
+        const py = payload.languages.find(
+            (l: { language: string }) => l.language === "python3"
+        );
+        expect(py?.available).toBe(true);
+        expect(payload.sandbox).toBeDefined();
+    });
+
+    it("rejects run_local_tests when no session is open", async () => {
+        spawned = await spawnServer({ fixture: FIXTURE });
+
+        const result = (await spawned.client.callTool({
+            name: "run_local_tests",
+            arguments: {
+                titleSlug: "two-sum",
+                language: "python3",
+                code: "print('ok')"
+            }
+        })) as ToolTextResult;
+
+        const payload = JSON.parse(result.content[0].text);
+        expect(payload.code).toBe("SESSION_NOT_FOUND");
+    });
+
+    it("executes a passing python script and updates the session", async () => {
+        spawned = await spawnServer({ fixture: FIXTURE });
+
+        await spawned.client.callTool({
+            name: "start_problem",
+            arguments: { titleSlug: "two-sum", language: "python3" }
+        });
+
+        const run = (await spawned.client.callTool({
+            name: "run_local_tests",
+            arguments: {
+                titleSlug: "two-sum",
+                language: "python3",
+                code: 'print("hi")\nassert 1 + 1 == 2'
+            }
+        })) as ToolTextResult;
+
+        const payload = JSON.parse(run.content[0].text);
+        expect(payload.titleSlug).toBe("two-sum");
+        expect(payload.result.passed).toBe(true);
+        expect(payload.result.exitCode).toBe(0);
+        expect(payload.result.timedOut).toBe(false);
+        expect(payload.result.stdout).toContain("hi");
+
+        // Session state is observable via get_session_state.
+        const state = (await spawned.client.callTool({
+            name: "get_session_state",
+            arguments: { titleSlug: "two-sum" }
+        })) as ToolTextResult;
+        const sessionPayload = JSON.parse(state.content[0].text);
+        expect(sessionPayload.session.lastLocalRunPassed).toBe(true);
+        expect(sessionPayload.session.attempts).toBe(1);
+    });
+
+    it("captures non-zero exit code without throwing", async () => {
+        spawned = await spawnServer({ fixture: FIXTURE });
+
+        await spawned.client.callTool({
+            name: "start_problem",
+            arguments: { titleSlug: "two-sum", language: "python3" }
+        });
+
+        const run = (await spawned.client.callTool({
+            name: "run_local_tests",
+            arguments: {
+                titleSlug: "two-sum",
+                language: "python3",
+                code: "raise SystemExit(2)"
+            }
+        })) as ToolTextResult;
+
+        const payload = JSON.parse(run.content[0].text);
+        expect(payload.result.passed).toBe(false);
+        expect(payload.result.exitCode).toBe(2);
+
+        const state = (await spawned.client.callTool({
+            name: "get_session_state",
+            arguments: { titleSlug: "two-sum" }
+        })) as ToolTextResult;
+        const sessionPayload = JSON.parse(state.content[0].text);
+        expect(sessionPayload.session.lastLocalRunPassed).toBe(false);
+    });
+
+    it("kills runaway processes after the timeout budget", async () => {
+        spawned = await spawnServer({ fixture: FIXTURE });
+
+        await spawned.client.callTool({
+            name: "start_problem",
+            arguments: { titleSlug: "two-sum", language: "python3" }
+        });
+
+        const run = (await spawned.client.callTool({
+            name: "run_local_tests",
+            arguments: {
+                titleSlug: "two-sum",
+                language: "python3",
+                code: "while True: pass",
+                timeoutMs: 500
+            }
+        })) as ToolTextResult;
+
+        const payload = JSON.parse(run.content[0].text);
+        expect(payload.result.timedOut).toBe(true);
+        expect(payload.result.passed).toBe(false);
+    });
+
+    it("rejects unimplemented languages with RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE", async () => {
+        spawned = await spawnServer({ fixture: FIXTURE });
+
+        await spawned.client.callTool({
+            name: "start_problem",
+            arguments: { titleSlug: "two-sum", language: "go" }
+        });
+
+        const run = (await spawned.client.callTool({
+            name: "run_local_tests",
+            arguments: {
+                titleSlug: "two-sum",
+                language: "go",
+                code: "package main\nfunc main() {}"
+            }
+        })) as ToolTextResult;
+
+        const payload = JSON.parse(run.content[0].text);
+        expect(payload.code).toBe("RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE");
+    });
+
+    it("blocks submit_solution under strict mode until run_local_tests passes", async () => {
+        spawned = await spawnServer({
+            fixture: FIXTURE,
+            env: { LEETCODE_MCP_STRICT_MODE: "1" }
+        });
+
+        await spawned.client.callTool({
+            name: "start_problem",
+            arguments: { titleSlug: "two-sum", language: "python3" }
+        });
+
+        // First submit attempt: no run_local_tests yet → rejected.
+        const blocked = (await spawned.client.callTool({
+            name: "submit_solution",
+            arguments: {
+                problemSlug: "two-sum",
+                code: "def twoSum(nums, target): pass",
+                language: "python3"
+            }
+        })) as ToolTextResult;
+        const blockedPayload = JSON.parse(blocked.content[0].text);
+        expect(blockedPayload.code).toBe("LOCAL_TESTS_NOT_PASSED");
+
+        // Run locals successfully.
+        const run = (await spawned.client.callTool({
+            name: "run_local_tests",
+            arguments: {
+                titleSlug: "two-sum",
+                language: "python3",
+                code: 'print("ok")'
+            }
+        })) as ToolTextResult;
+        const runPayload = JSON.parse(run.content[0].text);
+        expect(runPayload.result.passed).toBe(true);
+
+        // Submit again: strict mode now permits it (the upstream
+        // request itself will fail via nock — we don't care; the gate
+        // is what we're locking down here).
+        const allowed = (await spawned.client.callTool({
+            name: "submit_solution",
+            arguments: {
+                problemSlug: "two-sum",
+                code: "def twoSum(nums, target): pass",
+                language: "python3"
+            }
+        })) as ToolTextResult;
+        const allowedPayload = JSON.parse(allowed.content[0].text);
+        expect(allowedPayload.code).not.toBe("LOCAL_TESTS_NOT_PASSED");
+    });
+});
diff --git a/tests/integration/runner-tools-integration.test.ts b/tests/integration/runner-tools-integration.test.ts
new file mode 100644
index 0000000..b563913
--- /dev/null
+++ b/tests/integration/runner-tools-integration.test.ts
@@ -0,0 +1,251 @@
+/**
+ * Runner Tools Integration Tests
+ *
+ * Drives `run_local_tests` and `runner_doctor` through the MCP wire,
+ * with a fake `LocalRunner` that records what it was called with so we
+ * can assert the tool layer's behaviour without depending on `python3`
+ * being installed where these tests run.
+ */
+import { mkdtemp, rm } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import { SessionService } from "../../src/domain/session-service.js";
+import { FileSessionStore } from "../../src/domain/session-store.js";
+import { registerRunnerTools } from "../../src/mcp/tools/runner-tools.js";
+import type { LocalRunner } from "../../src/runner/runner.js";
+import {
+    ErrorCode,
+    LeetCodeError,
+    type RunInput,
+    type RunResult,
+    type RunnerCapabilities
+} from "../../src/types/index.js";
+import { createMockLeetCodeService } from "../helpers/mock-leetcode.js";
+import type { TestClientPair } from "../helpers/test-client.js";
+import { createTestClient } from "../helpers/test-client.js";
+import { INTEGRATION_TEST_TIMEOUT, assertions } from "./setup.js";
+
+const HAPPY_RESULT: RunResult = {
+    passed: true,
+    exitCode: 0,
+    stdout: "ok\n",
+    stderr: "",
+    timedOut: false,
+    durationMs: 42,
+    sandbox: "none",
+    warning: "No OS sandbox available on this host; ran without isolation."
+};
+
+const FAKE_CAPS: RunnerCapabilities = {
+    languages: [
+        { language: "python3", available: true, version: "Python 3.12.0" },
+        { language: "go", available: false },
+        { language: "java", available: false }
+    ],
+    sandbox: { kind: "none", available: false }
+};
+
+interface FakeRunnerOptions {
+    nextResult?: RunResult;
+    runError?: unknown;
+}
+
+function createFakeRunner(options: FakeRunnerOptions = {}): LocalRunner & {
+    runs: RunInput[];
+} {
+    const runs: RunInput[] = [];
+    return {
+        runs,
+        async run(input: RunInput): Promise<RunResult> {
+            runs.push(input);
+            if (options.runError) {
+                throw options.runError;
+            }
+            return options.nextResult ?? HAPPY_RESULT;
+        },
+        async capabilities(): Promise<RunnerCapabilities> {
+            return FAKE_CAPS;
+        }
+    };
+}
+
+describe("Runner Tools Integration", () => {
+    let testClient: TestClientPair;
+    let mockService: ReturnType<typeof createMockLeetCodeService>;
+    let sessions: SessionService;
+    let sessionDir: string;
+    let runner: ReturnType<typeof createFakeRunner>;
+
+    beforeEach(async () => {
+        mockService = createMockLeetCodeService();
+        sessionDir = await mkdtemp(join(tmpdir(), "leetcode-mcp-runner-"));
+        sessions = new SessionService(
+            new FileSessionStore({ dir: sessionDir })
+        );
+        runner = createFakeRunner();
+
+        testClient = await createTestClient({}, (server) => {
+            registerRunnerTools(server, mockService as any, sessions, runner);
+        });
+    }, INTEGRATION_TEST_TIMEOUT);
+
+    afterEach(async () => {
+        if (testClient) {
+            await testClient.cleanup();
+        }
+        await rm(sessionDir, { recursive: true, force: true });
+        vi.restoreAllMocks();
+    });
+
+    describe("run_local_tests", () => {
+        it(
+            "rejects with SESSION_NOT_FOUND when no session has been opened",
+            async () => {
+                const result: any = await testClient.client.callTool({
+                    name: "run_local_tests",
+                    arguments: {
+                        titleSlug: "two-sum",
+                        language: "python3",
+                        code: "print('hi')"
+                    }
+                });
+
+                assertions.hasToolResultStructure(result);
+                const payload = JSON.parse(result.content[0].text);
+                expect(payload.code).toBe(ErrorCode.SESSION_NOT_FOUND);
+                expect(runner.runs).toHaveLength(0);
+            },
+            INTEGRATION_TEST_TIMEOUT
+        );
+
+        it(
+            "delegates to the runner and records lastLocalRunPassed",
+            async () => {
+                await sessions.startOrResume({ slug: "two-sum" });
+
+                const result: any = await testClient.client.callTool({
+                    name: "run_local_tests",
+                    arguments: {
+                        titleSlug: "two-sum",
+                        language: "python3",
+                        code: 'print("hi")'
+                    }
+                });
+
+                assertions.hasToolResultStructure(result);
+                const payload = JSON.parse(result.content[0].text);
+                expect(payload.titleSlug).toBe("two-sum");
+                expect(payload.result.passed).toBe(true);
+                expect(runner.runs).toHaveLength(1);
+                expect(runner.runs[0].language).toBe("python3");
+                expect(runner.runs[0].code).toBe('print("hi")');
+
+                const session = await sessions.requireSession("two-sum");
+                expect(session.lastLocalRunPassed).toBe(true);
+                expect(session.attempts).toBe(1);
+            },
+            INTEGRATION_TEST_TIMEOUT
+        );
+
+        it(
+            "records lastLocalRunPassed=false on a failing run",
+            async () => {
+                await sessions.startOrResume({ slug: "two-sum" });
+                const failing = createFakeRunner({
+                    nextResult: { ...HAPPY_RESULT, passed: false, exitCode: 1 }
+                });
+                // Re-build the test client with the failing runner.
+                await testClient.cleanup();
+                testClient = await createTestClient({}, (server) => {
+                    registerRunnerTools(
+                        server,
+                        mockService as any,
+                        sessions,
+                        failing
+                    );
+                });
+
+                await testClient.client.callTool({
+                    name: "run_local_tests",
+                    arguments: {
+                        titleSlug: "two-sum",
+                        language: "python3",
+                        code: "raise SystemExit(1)"
+                    }
+                });
+
+                const session = await sessions.requireSession("two-sum");
+                expect(session.lastLocalRunPassed).toBe(false);
+                expect(session.attempts).toBe(1);
+            },
+            INTEGRATION_TEST_TIMEOUT
+        );
+
+        it(
+            "surfaces RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE thrown from the runner",
+            async () => {
+                await sessions.startOrResume({ slug: "two-sum" });
+                const broken = createFakeRunner({
+                    runError: new LeetCodeError(
+                        ErrorCode.RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE,
+                        "Go runner ships in Phase 4b"
+                    )
+                });
+                await testClient.cleanup();
+                testClient = await createTestClient({}, (server) => {
+                    registerRunnerTools(
+                        server,
+                        mockService as any,
+                        sessions,
+                        broken
+                    );
+                });
+
+                const result: any = await testClient.client.callTool({
+                    name: "run_local_tests",
+                    arguments: {
+                        titleSlug: "two-sum",
+                        language: "go",
+                        code: "package main"
+                    }
+                });
+
+                assertions.hasToolResultStructure(result);
+                const payload = JSON.parse(result.content[0].text);
+                expect(payload.code).toBe(
+                    ErrorCode.RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE
+                );
+
+                // The session attempt counter should NOT bump on a
+                // pre-run rejection.
+                const session = await sessions.requireSession("two-sum");
+                expect(session.attempts).toBe(0);
+            },
+            INTEGRATION_TEST_TIMEOUT
+        );
+    });
+
+    describe("runner_doctor", () => {
+        it(
+            "returns the capabilities snapshot",
+            async () => {
+                const result: any = await testClient.client.callTool({
+                    name: "runner_doctor",
+                    arguments: {}
+                });
+
+                assertions.hasToolResultStructure(result);
+                const payload = JSON.parse(result.content[0].text);
+                expect(payload.languages).toBeDefined();
+                expect(payload.sandbox).toBeDefined();
+                expect(
+                    payload.languages.find(
+                        (l: { language: string }) => l.language === "python3"
+                    )?.available
+                ).toBe(true);
+            },
+            INTEGRATION_TEST_TIMEOUT
+        );
+    });
+});
diff --git a/tests/integration/submission-tools-integration.test.ts b/tests/integration/submission-tools-integration.test.ts
index aa60787..720ebf4 100644
--- a/tests/integration/submission-tools-integration.test.ts
+++ b/tests/integration/submission-tools-integration.test.ts
@@ -2,8 +2,14 @@
  * Submission Tools Integration Tests
  * Tests all submission-related tools through MCP protocol
  */
+import { mkdtemp, rm } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
 import { afterEach, beforeEach, describe, expect, it } from "vitest";
+import { SessionService } from "../../src/domain/session-service.js";
+import { FileSessionStore } from "../../src/domain/session-store.js";
 import { registerSubmissionTools } from "../../src/mcp/tools/submission-tools.js";
+import { ErrorCode } from "../../src/types/index.js";
 import { createMockAuthenticatedService } from "../helpers/mock-leetcode.js";
 import type { TestClientPair } from "../helpers/test-client.js";
 import { createTestClient } from "../helpers/test-client.js";
@@ -12,13 +18,19 @@ import { INTEGRATION_TEST_TIMEOUT, assertions } from "./setup.js";
 describe("Submission Tools Integration", () => {
     let testClient: TestClientPair;
     let mockService: ReturnType<typeof createMockAuthenticatedService>;
+    let sessions: SessionService;
+    let sessionDir: string;
 
     beforeEach(async () => {
         // Use authenticated service since submission requires authentication
         mockService = createMockAuthenticatedService();
+        sessionDir = await mkdtemp(join(tmpdir(), "leetcode-mcp-sub-"));
+        sessions = new SessionService(
+            new FileSessionStore({ dir: sessionDir })
+        );
 
         testClient = await createTestClient({}, (server) => {
-            registerSubmissionTools(server, mockService as any);
+            registerSubmissionTools(server, mockService as any, sessions);
         });
     }, INTEGRATION_TEST_TIMEOUT);
 
@@ -26,6 +38,8 @@ describe("Submission Tools Integration", () => {
         if (testClient) {
             await testClient.cleanup();
         }
+        await rm(sessionDir, { recursive: true, force: true });
+        delete process.env.LEETCODE_MCP_STRICT_MODE;
     });
 
     describe("submit_solution", () => {
@@ -98,4 +112,100 @@ describe("Submission Tools Integration", () => {
             INTEGRATION_TEST_TIMEOUT
         );
     });
+
+    describe("submit_solution — strict mode", () => {
+        it(
+            "blocks submission when LEETCODE_MCP_STRICT_MODE=1 and session has not passed locals",
+            async () => {
+                process.env.LEETCODE_MCP_STRICT_MODE = "1";
+                await sessions.startOrResume({ slug: "two-sum" });
+                // No recordLocalRun call → lastLocalRunPassed is null.
+
+                const result: any = await testClient.client.callTool({
+                    name: "submit_solution",
+                    arguments: {
+                        problemSlug: "two-sum",
+                        code: "def twoSum(nums, target): pass",
+                        language: "python3"
+                    }
+                });
+
+                assertions.hasToolResultStructure(result);
+                const payload = JSON.parse(result.content[0].text as string);
+                expect(payload.code).toBe(ErrorCode.LOCAL_TESTS_NOT_PASSED);
+            },
+            INTEGRATION_TEST_TIMEOUT
+        );
+
+        it(
+            "permits submission when strict mode is on and locals have passed",
+            async () => {
+                process.env.LEETCODE_MCP_STRICT_MODE = "1";
+                await sessions.startOrResume({ slug: "two-sum" });
+                await sessions.recordLocalRun("two-sum", true);
+
+                const result: any = await testClient.client.callTool({
+                    name: "submit_solution",
+                    arguments: {
+                        problemSlug: "two-sum",
+                        code: "def twoSum(nums, target): pass",
+                        language: "python3"
+                    }
+                });
+
+                assertions.hasToolResultStructure(result);
+                const payload = JSON.parse(result.content[0].text as string);
+                // Mock service returns a normal submission envelope —
+                // we just need to confirm we didn't get the error code.
+                expect(payload.code).not.toBe(ErrorCode.LOCAL_TESTS_NOT_PASSED);
+            },
+            INTEGRATION_TEST_TIMEOUT
+        );
+
+        it(
+            "permits submission when strict mode is on but no session was opened",
+            async () => {
+                process.env.LEETCODE_MCP_STRICT_MODE = "1";
+                // Deliberately no startOrResume — strict mode should
+                // not block ad-hoc submissions outside the tutoring
+                // flow.
+
+                const result: any = await testClient.client.callTool({
+                    name: "submit_solution",
+                    arguments: {
+                        problemSlug: "two-sum",
+                        code: "def twoSum(nums, target): pass",
+                        language: "python3"
+                    }
+                });
+
+                assertions.hasToolResultStructure(result);
+                const payload = JSON.parse(result.content[0].text as string);
+                expect(payload.code).not.toBe(ErrorCode.LOCAL_TESTS_NOT_PASSED);
+            },
+            INTEGRATION_TEST_TIMEOUT
+        );
+
+        it(
+            "does not block by default (LEETCODE_MCP_STRICT_MODE unset)",
+            async () => {
+                // No env var; session exists with lastLocalRunPassed === null.
+                await sessions.startOrResume({ slug: "two-sum" });
+
+                const result: any = await testClient.client.callTool({
+                    name: "submit_solution",
+                    arguments: {
+                        problemSlug: "two-sum",
+                        code: "def twoSum(nums, target): pass",
+                        language: "python3"
+                    }
+                });
+
+                assertions.hasToolResultStructure(result);
+                const payload = JSON.parse(result.content[0].text as string);
+                expect(payload.code).not.toBe(ErrorCode.LOCAL_TESTS_NOT_PASSED);
+            },
+            INTEGRATION_TEST_TIMEOUT
+        );
+    });
 });
diff --git a/tests/runner/subprocess-runner.test.ts b/tests/runner/subprocess-runner.test.ts
new file mode 100644
index 0000000..27969dc
--- /dev/null
+++ b/tests/runner/subprocess-runner.test.ts
@@ -0,0 +1,161 @@
+/**
+ * Unit tests for the subprocess runner.
+ *
+ * The `run` cases assume `python3` is available on PATH (the project's
+ * own CI image already has it); if it is missing the runner is expected
+ * to raise `LANGUAGE_RUNTIME_NOT_FOUND` and those cases fail. The
+ * `capabilities` cases deliberately do not assert availability.
+ */
+import { afterEach, beforeEach, describe, expect, it } from "vitest";
+import { __resetSandboxCacheForTest } from "../../src/runner/sandbox.js";
+import {
+    SubprocessRunner,
+    __resetProbeCacheForTest
+} from "../../src/runner/subprocess-runner.js";
+import {
+    ErrorCode,
+    isLeetCodeError,
+    type RunnerLanguage
+} from "../../src/types/index.js";
+
+describe("SubprocessRunner", () => {
+    let runner: SubprocessRunner;
+
+    beforeEach(() => {
+        // Force re-probing per test so mutations to PATH (none here, but
+        // future tests may) don't leak between cases.
+        __resetProbeCacheForTest();
+        __resetSandboxCacheForTest();
+        runner = new SubprocessRunner();
+    });
+
+    afterEach(() => {
+        __resetProbeCacheForTest();
+        __resetSandboxCacheForTest();
+    });
+
+    describe("capabilities", () => {
+        it("reports python3 as a supported language", async () => {
+            const caps = await runner.capabilities();
+            const py = caps.languages.find((l) => l.language === "python3");
+            expect(py).toBeDefined();
+            // Don't assert availability — environments without python3
+            // should still produce a coherent envelope.
+            expect(typeof py?.available).toBe("boolean");
+        });
+
+        it("reports go and java as supported languages even before they are implemented", async () => {
+            const caps = await runner.capabilities();
+            const langs = caps.languages.map((l) => l.language).sort();
+            expect(langs).toEqual(["go", "java", "python3"]);
+        });
+
+        it("includes a sandbox descriptor", async () => {
+            const caps = await runner.capabilities();
+            expect(caps.sandbox).toBeDefined();
+            expect(["none", "bwrap", "firejail", "sandbox-exec"]).toContain(
+                caps.sandbox.kind
+            );
+        });
+    });
+
+    describe("run", () => {
+        it("executes a happy-path python script", async () => {
+            const result = await runner.run({
+                titleSlug: "two-sum",
+                language: "python3",
+                code: 'print("hello"); assert 1 + 1 == 2'
+            });
+
+            expect(result.passed).toBe(true);
+            expect(result.exitCode).toBe(0);
+            expect(result.timedOut).toBe(false);
+            expect(result.stdout).toContain("hello");
+            expect(result.stderr).toBe("");
+            expect(result.durationMs).toBeGreaterThanOrEqual(0);
+        });
+
+        it("captures non-zero exit code without throwing", async () => {
+            const result = await runner.run({
+                titleSlug: "two-sum",
+                language: "python3",
+                code: "raise SystemExit(7)"
+            });
+
+            expect(result.passed).toBe(false);
+            expect(result.exitCode).toBe(7);
+            expect(result.timedOut).toBe(false);
+        });
+
+        it("captures stderr from raised exceptions", async () => {
+            const result = await runner.run({
+                titleSlug: "two-sum",
+                language: "python3",
+                code: 'raise ValueError("boom")'
+            });
+
+            expect(result.passed).toBe(false);
+            expect(result.exitCode).not.toBe(0);
+            expect(result.stderr).toContain("ValueError");
+            expect(result.stderr).toContain("boom");
+        });
+
+        it("kills runaway processes after the timeout budget", async () => {
+            const start = Date.now();
+            const result = await runner.run({
+                titleSlug: "two-sum",
+                language: "python3",
+                code: "while True: pass",
+                timeoutMs: 400
+            });
+            const elapsed = Date.now() - start;
+
+            expect(result.timedOut).toBe(true);
+            expect(result.passed).toBe(false);
+            // Tolerate slow CI: budget + the 500 ms SIGTERM-then-SIGKILL
+            // grace + scheduler jitter. Should not run for full 5s.
+            expect(elapsed).toBeLessThan(2_500);
+        });
+
+        it("rejects unsupported languages with RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE", async () => {
+            await expect(async () => {
+                await runner.run({
+                    titleSlug: "two-sum",
+                    language: "go" as RunnerLanguage,
+                    code: 'package main\nfunc main() { println("hi") }'
+                });
+            }).rejects.toSatisfy((error: unknown) => {
+                if (!isLeetCodeError(error)) {
+                    return false;
+                }
+                return (
+                    error.code === ErrorCode.RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE
+                );
+            });
+        });
+
+        it("forwards a clean env (no leaking secrets)", async () => {
+            // Ask the child to print one of its env vars. We never set
+            // SECRET_ON_PARENT in the child env, so it should print the
+            // "MISSING" fallback even though it is defined on the parent.
+            const before = process.env.SECRET_ON_PARENT;
+            process.env.SECRET_ON_PARENT = "leak-me";
+            try {
+                const result = await runner.run({
+                    titleSlug: "two-sum",
+                    language: "python3",
+                    code: 'import os; print(os.environ.get("SECRET_ON_PARENT", "MISSING"))'
+                });
+
+                expect(result.passed).toBe(true);
+                expect(result.stdout.trim()).toBe("MISSING");
+            } finally {
+                if (before === undefined) {
+                    delete process.env.SECRET_ON_PARENT;
+                } else {
+                    process.env.SECRET_ON_PARENT = before;
+                }
+            }
+        });
+    });
+});

From dbabfbea6153efb8b6553ba25cf09a8cf0cd6c69 Mon Sep 17 00:00:00 2001
From: Owl <32782746+SPerekrestova@users.noreply.github.com>
Date: Fri, 8 May 2026 08:53:47 +0000
Subject: [PATCH 6/6] Phase 4a review: fix sandbox-exec detection, drop
 shell-exec probes, tighten output guard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three issues raised on PR #39:

1. sandbox-exec detection on macOS was effectively broken. The probe
   spawned 'sandbox-exec -help' and trusted any non-zero exit to mean
   'not installed', but '-help' is undocumented and exits non-zero on
   some macOS versions. The catch swallowed it and we fell through to
   kind: 'none', misleading users that no sandbox was available.
   Now probe by file existence + executable bit (fs.access X_OK on
   /usr/bin/sandbox-exec), which is what the comment always said the
   intent was.

2. The probes in sandbox.ts and subprocess-runner.ts aliased
   promisify(exec) as 'execFile'. The name was misleading: exec routes
   through /bin/sh -c and is a shell-expansion foot-gun if anyone ever
   interpolates a dynamic value. Switched to the actual node
   execFile (no shell) for every probe — same call sites, just the
   no-shell variant. Static inputs only today; hardened for future.

3. The per-chunk capture in subprocess-runner allowed the buffer to
   grow up to MAX_OUTPUT_BYTES + chunk_size before stopping (the guard
   only checked 'is bytes already at the cap?'). Now slice the
   overflowing chunk to the exact remaining headroom and drop the
   rest, so the buffered total never exceeds MAX_OUTPUT_BYTES.
   clampOutput stays as a final cap.

No behaviour change for the happy path. 201/201 unit/integration,
16/16 e2e.
---
 src/runner/sandbox.ts           | 27 +++++++++++-------
 src/runner/subprocess-runner.ts | 49 +++++++++++++++++++++++----------
 2 files changed, 51 insertions(+), 25 deletions(-)

diff --git a/src/runner/sandbox.ts b/src/runner/sandbox.ts
index 0e9049b..5fc866c 100644
--- a/src/runner/sandbox.ts
+++ b/src/runner/sandbox.ts
@@ -18,12 +18,13 @@
  * run without a sandbox can set `LEETCODE_MCP_REQUIRE_SANDBOX=1`; the
  * tool layer enforces this — the runner only reports.
  */
-import { exec as execCb } from "node:child_process";
+import { execFile as execFileCb } from "node:child_process";
+import { access, constants as fsConstants } from "node:fs/promises";
 import { promisify } from "node:util";
 
 import type { SandboxKind } from "../types/index.js";
 
-const execFile = promisify(execCb);
+const execFile = promisify(execFileCb);
 
 interface DetectedSandbox {
     kind: SandboxKind;
@@ -35,15 +36,18 @@ interface DetectedSandbox {
 let cached: DetectedSandbox | undefined;
 
 /**
- * Returns whether `<bin> --version` (or equivalent) succeeds. We do a
- * shell-out rather than `which` so the answer is uniform across platforms.
+ * Returns whether running `<bin> <args>` (default `--version`) succeeds.
+ * Uses the no-shell `execFile` so the probe never re-interprets
+ * `bin`/`args` through `/bin/sh -c` — important because future callers
+ * might be tempted to pass dynamic values, and the default
+ * `child_process.exec` is a shell-expansion foot-gun.
  */
 async function probe(
-    cmd: string,
+    bin: string,
     args: string[] = ["--version"]
 ): Promise<boolean> {
     try {
-        await execFile(`${cmd} ${args.join(" ")}`, { timeout: 1500 });
+        await execFile(bin, args, { timeout: 1500 });
         return true;
     } catch {
         return false;
@@ -62,11 +66,14 @@ export async function detectSandbox(): Promise<DetectedSandbox> {
 
     const platform = process.platform;
     if (platform === "darwin") {
-        // sandbox-exec is /usr/bin/sandbox-exec on every macOS we care
-        // about. It accepts no `--version`; probe with `-help` (any
-        // exit code is fine — it always prints to stderr).
+        // sandbox-exec lives at /usr/bin/sandbox-exec on every macOS
+        // version we care about. Detect by file existence + executable
+        // bit rather than spawning the binary — its `-help` flag is
+        // undocumented and exits non-zero on some macOS versions, which
+        // would silently fall through to `kind: "none"` and lie to
+        // users that no sandbox is available.
         try {
-            await execFile("/usr/bin/sandbox-exec -help", { timeout: 1500 });
+            await access("/usr/bin/sandbox-exec", fsConstants.X_OK);
             cached = { kind: "sandbox-exec", path: "/usr/bin/sandbox-exec" };
             return cached;
         } catch {
diff --git a/src/runner/subprocess-runner.ts b/src/runner/subprocess-runner.ts
index 08685f9..8249131 100644
--- a/src/runner/subprocess-runner.ts
+++ b/src/runner/subprocess-runner.ts
@@ -19,7 +19,7 @@
  *   - stdout/stderr captured with a 1 MB ceiling; runaway output gets
  *     truncated with a marker rather than blowing memory
  */
-import { exec as execCb, spawn } from "node:child_process";
+import { execFile as execFileCb, spawn } from "node:child_process";
 import { mkdtemp, rm, writeFile } from "node:fs/promises";
 import { tmpdir } from "node:os";
 import { join } from "node:path";
@@ -39,7 +39,10 @@ import type { LocalRunner } from "./runner.js";
 import { IMPLEMENTED_LANGUAGES, SUPPORTED_LANGUAGES } from "./runner.js";
 import { wrapWithSandbox } from "./sandbox.js";
 
-const execFile = promisify(execCb);
+// `execFile` (no shell) — never `promisify(exec)`, which routes through
+// `/bin/sh -c` and is a shell-expansion foot-gun if anyone interpolates
+// a dynamic value into a probe in the future.
+const execFile = promisify(execFileCb);
 
 const MAX_OUTPUT_BYTES = 1_000_000; // 1 MB per stream
 const DEFAULT_TIMEOUT_MS = 5_000;
@@ -108,7 +111,8 @@ async function probeLanguage(language: RunnerLanguage): Promise<ProbeResult> {
     const spec = LANGUAGES[language];
     try {
         const { stdout, stderr } = await execFile(
-            `${spec.probe.cmd} ${spec.probe.args.join(" ")}`,
+            spec.probe.cmd,
+            spec.probe.args,
             { timeout: 2000 }
         );
         // `python3 --version` and `go version` write to stdout; `java
@@ -120,10 +124,9 @@ async function probeLanguage(language: RunnerLanguage): Promise<ProbeResult> {
         };
         try {
             const { stdout: which } = await execFile(
-                `which ${spec.probe.cmd}`,
-                {
-                    timeout: 1000
-                }
+                "which",
+                [spec.probe.cmd],
+                { timeout: 1000 }
             );
             result.path = which.trim() || undefined;
         } catch {
@@ -257,17 +260,33 @@ export class SubprocessRunner implements LocalRunner {
             let timedOut = false;
             let killTimer: NodeJS.Timeout | undefined;
 
-            child.stdout?.on("data", (chunk: Buffer) => {
-                if (stdoutBytes < MAX_OUTPUT_BYTES) {
-                    stdout.push(chunk);
-                    stdoutBytes += chunk.length;
+            // Tight guard: never let the buffered total exceed
+            // `MAX_OUTPUT_BYTES` even by a chunk. We slice the
+            // overflowing chunk to the exact remaining headroom and
+            // drop the rest. `clampOutput` still runs at finalize as a
+            // belt-and-braces final cap.
+            const captureChunk = (
+                buffers: Buffer[],
+                bytes: number,
+                chunk: Buffer
+            ): number => {
+                const remaining = MAX_OUTPUT_BYTES - bytes;
+                if (remaining <= 0) {
+                    return bytes;
                 }
+                if (chunk.length <= remaining) {
+                    buffers.push(chunk);
+                    return bytes + chunk.length;
+                }
+                buffers.push(chunk.subarray(0, remaining));
+                return bytes + remaining;
+            };
+
+            child.stdout?.on("data", (chunk: Buffer) => {
+                stdoutBytes = captureChunk(stdout, stdoutBytes, chunk);
             });
             child.stderr?.on("data", (chunk: Buffer) => {
-                if (stderrBytes < MAX_OUTPUT_BYTES) {
-                    stderr.push(chunk);
-                    stderrBytes += chunk.length;
-                }
+                stderrBytes = captureChunk(stderr, stderrBytes, chunk);
             });
 
             const timer = setTimeout(() => {