diff --git a/src/domain/session-service.ts b/src/domain/session-service.ts
index 7c37477..037120f 100644
--- a/src/domain/session-service.ts
+++ b/src/domain/session-service.ts
@@ -132,7 +132,13 @@ export class SessionService {
         );
     }
 
-    private async requireSession(slug: string): Promise<SessionState> {
+    /**
+     * Public variant of the session lookup — throws `SESSION_NOT_FOUND`
+     * when the user never opened the slug. Used by the runner-tools
+     * layer to keep `run_local_tests` aligned with the pedagogy state
+     * machine (no orphaned runs).
+     */
+    async requireSession(slug: string): Promise<SessionState> {
         const session = await this.store.load(slug);
         if (!session) {
             throw new LeetCodeError(
@@ -142,4 +148,24 @@ export class SessionService {
         }
         return session;
     }
+
+    /**
+     * Persists the outcome of one `run_local_tests` call: adds one to
+     * `attempts`, stores `passed` as `lastLocalRunPassed`, promotes a
+     * "started" session to "attempting" (any other status is kept) and
+     * refreshes `updatedAt`. Throws when the slug has no session.
+     */
+    async recordLocalRun(slug: string, passed: boolean): Promise<SessionState> {
+        const current = await this.requireSession(slug);
+        const isFirstRun = current.status === "started";
+        const updated: SessionState = {
+            ...current,
+            attempts: current.attempts + 1,
+            lastLocalRunPassed: passed,
+            status: isFirstRun ? "attempting" : current.status,
+            updatedAt: new Date().toISOString()
+        };
+        await this.store.save(updated);
+        return updated;
+    }
 }
diff --git a/src/index.ts b/src/index.ts
index b9c316d..4ba550b 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -19,10 +19,12 @@ import { registerAuthTools } from "./mcp/tools/auth-tools.js";
 import { registerContestTools } from "./mcp/tools/contest-tools.js";
 import { registerOnboardingTools } from "./mcp/tools/onboarding-tools.js";
 import { registerProblemTools } from "./mcp/tools/problem-tools.js";
+import { registerRunnerTools } from "./mcp/tools/runner-tools.js";
 import { registerSessionTools } from "./mcp/tools/session-tools.js";
 import { registerSolutionTools } from "./mcp/tools/solution-tools.js";
 import { registerSubmissionTools } from "./mcp/tools/submission-tools.js";
 import { registerUserTools } from "./mcp/tools/user-tools.js";
+import { SubprocessRunner } from "./runner/subprocess-runner.js";
 import logger from "./utils/logger.js";
 
 /**
@@ -145,6 +147,11 @@ async function main() {
     // returning content.
     const sessions = new SessionService();
 
+    // Local subprocess runner: probes python3 / go / java on first use,
+    // wraps with bwrap / firejail / sandbox-exec where available, and
+    // backs the `run_local_tests` tool. Phase 4a ships python3 only.
+    const runner = new SubprocessRunner();
+
     // Register MCP prompts for learning mode and workspace guidance
     registerLearningPrompts(server, leetcodeService);
 
@@ -158,8 +165,9 @@ async function main() {
     registerContestTools(server, leetcodeService);
     registerSessionTools(server, leetcodeService, sessions);
     registerSolutionTools(server, leetcodeService, sessions);
+    registerRunnerTools(server, leetcodeService, sessions, runner);
     registerAuthTools(server, leetcodeService);
-    registerSubmissionTools(server, leetcodeService);
+    registerSubmissionTools(server, leetcodeService, sessions);
 
     registerProblemResources(server, leetcodeService);
     registerSolutionResources(server, leetcodeService);
diff --git a/src/mcp/tools/runner-tools.ts b/src/mcp/tools/runner-tools.ts
new file mode 100644
index 0000000..7c7fa33
--- /dev/null
+++ b/src/mcp/tools/runner-tools.ts
@@ -0,0 +1,188 @@
+import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+import { z } from "zod";
+import type { SessionService } from "../../domain/session-service.js";
+import { LeetcodeServiceInterface } from "../../leetcode/leetcode-service-interface.js";
+import {
+    IMPLEMENTED_LANGUAGES,
+    SUPPORTED_LANGUAGES,
+    type LocalRunner
+} from "../../runner/runner.js";
+import type { RunnerLanguage } from "../../types/index.js";
+import { ErrorCode, LeetCodeError } from "../../types/index.js";
+import { errorEnvelope } from "./session-tools.js";
+import { ToolRegistry } from "./tool-registry.js";
+
+/**
+ * Local-runner tools introduced in Phase 4.
+ *
+ * `run_local_tests` is the inner-loop primitive: agent passes code,
+ * runner spawns a sandboxed subprocess, captures stdout/stderr/exit
+ * code, and reports back. The session's `lastLocalRunPassed` flag is
+ * updated as a side effect so `submit_solution`'s strict-mode gate
+ * (`LEETCODE_MCP_STRICT_MODE`) and future analytics have a stable hook.
+ *
+ * v1 deliberately does *not* parse `exampleTestcases` server-side or
+ * synthesize a per-problem harness. The agent — which already has the
+ * problem in context after `start_problem` — is responsible for adding
+ * test invocations to the code it submits to the runner. That keeps
+ * the wire surface tiny, language-agnostic, and free of LeetCode-
+ * specific signature parsing.
+ */
+export class RunnerToolRegistry extends ToolRegistry {
+    constructor(
+        server: McpServer,
+        leetcodeService: LeetcodeServiceInterface,
+        private readonly sessions: SessionService,
+        private readonly runner: LocalRunner
+    ) {
+        super(server, leetcodeService);
+    }
+
+    protected registerPublic(): void {
+        this.registerRunLocalTests();
+        this.registerDoctor();
+    }
+
+    private registerRunLocalTests(): void {
+        const supportedLiteral = z.enum(
+            SUPPORTED_LANGUAGES as [RunnerLanguage, ...RunnerLanguage[]]
+        );
+        this.server.registerTool(
+            "run_local_tests",
+            {
+                description:
+                    "Runs the user's code locally in an isolated subprocess, captures stdout / stderr / exit code, and updates the session's lastLocalRunPassed flag. Use this in the inner loop instead of submit_solution — it costs no LeetCode submission and turns around in seconds. The agent is responsible for including test invocations (e.g. `print(Solution().twoSum([2,7,11,15], 9))`) in the code passed in. Phase 4a ships python3; go and java land in Phase 4b/4c.",
+                inputSchema: {
+                    titleSlug: z
+                        .string()
+                        .min(1)
+                        .describe(
+                            "The URL slug of the problem (must match an active session opened with start_problem)."
+                        ),
+                    language: supportedLiteral.describe(
+                        `Language to execute as. Currently runnable: ${IMPLEMENTED_LANGUAGES.join(
+                            ", "
+                        )}. Other LeetCode languages remain valid for submit_solution.`
+                    ),
+                    code: z
+                        .string()
+                        .min(1)
+                        .describe(
+                            "Complete source code to execute. Should include test invocations that print results / raise on failure."
+                        ),
+                    timeoutMs: z
+                        .number()
+                        .int()
+                        .min(100)
+                        .max(60_000)
+                        .optional()
+                        .describe(
+                            "Optional wall-clock budget in milliseconds. Defaults to 5000."
+                        )
+                }
+            },
+            async ({ titleSlug, language, code, timeoutMs }) => {
+                try {
+                    // Require a session — keeps the runner aligned with
+                    // the pedagogy state machine (and gives us a sane
+                    // place to record `attempts` / `lastLocalRunPassed`).
+                    await this.sessions.requireSession(titleSlug);
+
+                    const result = await this.runner.run({
+                        titleSlug,
+                        language,
+                        code,
+                        timeoutMs
+                    });
+
+                    await this.sessions.recordLocalRun(
+                        titleSlug,
+                        result.passed
+                    );
+
+                    return {
+                        content: [
+                            {
+                                type: "text" as const,
+                                text: JSON.stringify({
+                                    titleSlug,
+                                    language,
+                                    result
+                                })
+                            }
+                        ]
+                    };
+                } catch (error) {
+                    return errorEnvelope(
+                        "Failed to run local tests",
+                        wrapTimeout(error)
+                    );
+                }
+            }
+        );
+    }
+
+    private registerDoctor(): void {
+        this.server.registerTool(
+            "runner_doctor",
+            {
+                description:
+                    "Reports which language runtimes (python3, go, java) and OS sandbox tools (bwrap, firejail, sandbox-exec) are detected on this host. Useful for diagnosing 'LANGUAGE_RUNTIME_NOT_FOUND' errors and confirming whether run_local_tests will be sandboxed.",
+                inputSchema: {}
+            },
+            async () => {
+                try {
+                    const capabilities = await this.runner.capabilities();
+                    return {
+                        content: [
+                            {
+                                type: "text" as const,
+                                text: JSON.stringify(capabilities)
+                            }
+                        ]
+                    };
+                } catch (error) {
+                    return errorEnvelope(
+                        "Failed to inspect runner capabilities",
+                        error
+                    );
+                }
+            }
+        );
+    }
+}
+
+/**
+ * Normalises what the `run_local_tests` handler caught before it goes
+ * to the shared envelope:
+ *   - a `LeetCodeError`, or any non-`Error` value, passes through as-is;
+ *   - an `Error` whose message matches /timed out/i becomes `RUNNER_TIMEOUT`;
+ *   - every other `Error` is returned unchanged.
+ */
+function wrapTimeout(error: unknown): unknown {
+    // Already typed, or not an Error at all: nothing to translate.
+    if (error instanceof LeetCodeError || !(error instanceof Error)) {
+        return error;
+    }
+    const { message } = error;
+    if (!/timed out/i.test(message)) {
+        return error;
+    }
+    // The original error rides along as the third constructor argument.
+    return new LeetCodeError(ErrorCode.RUNNER_TIMEOUT, message, error);
+}
+
+export function registerRunnerTools(
+    server: McpServer,
+    leetcodeService: LeetcodeServiceInterface,
+    sessions: SessionService,
+    runner: LocalRunner
+): void {
+    // Build the registry and invoke its `register()` entry point at once.
+    new RunnerToolRegistry(
+        server,
+        leetcodeService,
+        sessions,
+        runner
+    ).register();
+}
diff --git a/src/mcp/tools/submission-tools.ts b/src/mcp/tools/submission-tools.ts
index 04ea8cf..a5b8635 100644
--- a/src/mcp/tools/submission-tools.ts
+++ b/src/mcp/tools/submission-tools.ts
@@ -1,19 +1,41 @@
 import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
 import { z } from "zod";
+import type { SessionService } from "../../domain/session-service.js";
 import { LeetcodeServiceInterface } from "../../leetcode/leetcode-service-interface.js";
+import { ErrorCode, LeetCodeError } from "../../types/index.js";
+import { errorEnvelope } from "./session-tools.js";
 import { ToolRegistry } from "./tool-registry.js";
 
 /**
  * Submission tool registry class that handles registration of LeetCode submission tools.
+ *
+ * Phase 4 wires the strict-mode gate (`LEETCODE_MCP_STRICT_MODE=1`):
+ * when enabled, `submit_solution` refuses to spend a real LeetCode
+ * submission unless the active session's `lastLocalRunPassed === true`.
+ * Default is *off* (preserves current behaviour); session is optional
+ * so existing flows without `start_problem` aren't broken.
  */
 export class SubmissionToolRegistry extends ToolRegistry {
+    constructor(
+        server: McpServer,
+        leetcodeService: LeetcodeServiceInterface,
+        private readonly sessions?: SessionService // omitted → the strict-mode gate never runs
+    ) {
+        super(server, leetcodeService);
+    }
+
+    private isStrictMode(): boolean {
+        const flag = process.env.LEETCODE_MCP_STRICT_MODE ?? "";
+        return ["1", "true"].includes(flag); // exact matches only
+    }
+
     protected registerPublic(): void {
         // Submission tool
         this.server.registerTool(
             "submit_solution",
             {
                 description:
-                    "Submit a solution to a LeetCode problem and get results. Returns acceptance status, runtime/memory stats, or failed test case details.",
+                    "Submit a solution to a LeetCode problem and get results. Returns acceptance status, runtime/memory stats, or failed test case details. When LEETCODE_MCP_STRICT_MODE=1 is set, requires `run_local_tests` to have last passed for the problem first — saves real LeetCode submissions for solutions that pass examples locally.",
                 inputSchema: {
                     problemSlug: z
                         .string()
@@ -51,6 +73,21 @@ export class SubmissionToolRegistry extends ToolRegistry {
             },
             async ({ problemSlug, code, language }) => {
                 try {
+                    if (this.isStrictMode() && this.sessions) {
+                        // The strict gate only fires when the user has
+                        // actually opened a session for this slug. If
+                        // they never called `start_problem`, the
+                        // pre-strict-mode behaviour is preserved (so
+                        // strict mode is non-disruptive for ad-hoc
+                        // calls outside the tutoring flow).
+                        const session = await this.sessions.get(problemSlug);
+                        if (session && session.lastLocalRunPassed !== true) {
+                            throw new LeetCodeError(
+                                ErrorCode.LOCAL_TESTS_NOT_PASSED,
+                                "Strict mode is enabled and the most recent run_local_tests for this problem did not pass. Run it again and submit only when locals are green."
+                            );
+                        }
+                    }
                     const result = await this.leetcodeService.submitSolution(
                         problemSlug,
                         code,
@@ -59,23 +96,13 @@ export class SubmissionToolRegistry extends ToolRegistry {
                     return {
                         content: [
                             {
-                                type: "text",
+                                type: "text" as const,
                                 text: JSON.stringify(result, null, 2)
                             }
                         ]
                     };
-                } catch (error: any) {
-                    return {
-                        content: [
-                            {
-                                type: "text",
-                                text: JSON.stringify({
-                                    error: "Failed to submit solution",
-                                    message: error.message
-                                })
-                            }
-                        ]
-                    };
+                } catch (error) {
+                    return errorEnvelope("Failed to submit solution", error);
                 }
             }
         );
@@ -87,11 +114,17 @@ export class SubmissionToolRegistry extends ToolRegistry {
  *
  * @param server - The MCP server instance to register tools with
  * @param leetcodeService - The LeetCode service implementation to use for API calls
+ * @param sessions - Optional session service used for the strict-mode gate
  */
 export function registerSubmissionTools(
     server: McpServer,
-    leetcodeService: LeetcodeServiceInterface
+    leetcodeService: LeetcodeServiceInterface,
+    sessions?: SessionService
 ): void {
-    const registry = new SubmissionToolRegistry(server, leetcodeService);
+    const registry = new SubmissionToolRegistry(
+        server,
+        leetcodeService,
+        sessions
+    );
     registry.register();
 }
diff --git a/src/runner/runner.ts b/src/runner/runner.ts
new file mode 100644
index 0000000..addb32d
--- /dev/null
+++ b/src/runner/runner.ts
@@ -0,0 +1,45 @@
+/**
+ * The local runner contract — implemented by `SubprocessRunner` for
+ * production and easily faked in tests.
+ *
+ * Tools should depend on this interface, never on the concrete
+ * implementation. Phase 4d will add an alternative implementation that
+ * delegates to a stronger sandbox; Phase 5 will compose this with the
+ * workspace abstraction.
+ */
+import type {
+    RunInput,
+    RunResult,
+    RunnerCapabilities,
+    RunnerLanguage
+} from "../types/index.js";
+
+export interface LocalRunner {
+    /** Runs user code. User-code failures come back in the `RunResult`; setup failures (e.g. a missing runtime) may throw. */
+    run(input: RunInput): Promise<RunResult>;
+    /** Snapshot of what the runner detected on this host — backs the `runner_doctor` tool. */
+    capabilities(): Promise<RunnerCapabilities>;
+}
+
+/**
+ * Languages the runner knows about. The tool layer builds the
+ * `run_local_tests` `language` enum from it, so others fail validation.
+ */
+export const SUPPORTED_LANGUAGES: readonly RunnerLanguage[] = [
+    "python3",
+    "go",
+    "java"
+] as const;
+
+/**
+ * The languages this build of the runner has *implemented*. Phase 4a
+ * ships `python3` only. Phase 4b/4c grow this list.
+ *
+ * Kept distinct from `SUPPORTED_LANGUAGES` so the wire-level
+ * `RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE` error has a single source of
+ * truth: `SubprocessRunner.run` throws it for any language that is in
+ * `SUPPORTED_LANGUAGES` but not in this list ("coming soon").
+ */
+export const IMPLEMENTED_LANGUAGES: readonly RunnerLanguage[] = [
+    "python3"
+] as const;
diff --git a/src/runner/sandbox.ts b/src/runner/sandbox.ts
new file mode 100644
index 0000000..5fc866c
--- /dev/null
+++ b/src/runner/sandbox.ts
@@ -0,0 +1,181 @@
+/**
+ * Detect the strongest OS-level sandbox available on this host and turn
+ * a plain command into a sandbox-wrapped command.
+ *
+ * We deliberately ship no JS-level sandbox; the threat model is
+ * "user-running-their-own-code", not "untrusted multi-tenant input". The
+ * sandbox reduces blast radius of accidental rm-rf or runaway loops, not
+ * malicious code escapes.
+ *
+ * Priority:
+ *   - Linux: bwrap > firejail > none
+ *   - macOS: sandbox-exec > none
+ *   - Windows: none (native AppContainer wrappers are too platform-
+ *              specific to ship in v1)
+ *
+ * If nothing is detected the runner falls back to a plain subprocess and
+ * surfaces a `warning` in the `RunResult`. `LEETCODE_MCP_REQUIRE_SANDBOX=1`
+ * is meant to refuse unsandboxed runs; enforcement belongs to the tool
+ * layer (TODO: not wired into `run_local_tests` yet) — the runner only reports.
+ */
+import { execFile as execFileCb } from "node:child_process";
+import { access, constants as fsConstants, realpath } from "node:fs/promises";
+import { promisify } from "node:util";
+
+import type { SandboxKind } from "../types/index.js";
+
+const execFile = promisify(execFileCb); // promise-returning, no-shell variant used by `probe`
+
+interface DetectedSandbox {
+    kind: SandboxKind;
+    /** Absolute path of the wrapping binary. `detectSandbox` only sets it
+     *  for `sandbox-exec`; it is undefined for every other kind. */
+    path?: string;
+}
+
+let cached: DetectedSandbox | undefined; // memoised `detectSandbox()` result; undefined until the first probe
+
+/**
+ * Reports whether `bin` can be executed with `args` (default
+ * `--version`) and exits successfully within 1.5 s. Uses the no-shell
+ * `execFile`, so neither value is ever interpreted by `/bin/sh -c`.
+ */
+async function probe(
+    bin: string,
+    args: string[] = ["--version"]
+): Promise<boolean> {
+    let succeeded = false;
+    try {
+        await execFile(bin, args, { timeout: 1500 });
+        succeeded = true;
+    } catch {
+        // any failure (missing binary, non-zero exit, timeout) → false
+    }
+    return succeeded;
+}
+
+/**
+ * Detects the host sandbox once and memoises the answer in `cached`
+ * for later calls; tests can call `__resetSandboxCacheForTest` to
+ * force re-detection.
+ */
+export async function detectSandbox(): Promise<DetectedSandbox> {
+    if (cached !== undefined) {
+        return cached;
+    }
+
+    let found: DetectedSandbox = { kind: "none" };
+    switch (process.platform) {
+        case "darwin": {
+            // Check that /usr/bin/sandbox-exec exists and is executable
+            // instead of spawning it: its `-help` flag is undocumented
+            // and exits non-zero on some macOS versions, which would
+            // wrongly report that no sandbox is available.
+            const macPath = "/usr/bin/sandbox-exec";
+            try {
+                await access(macPath, fsConstants.X_OK);
+                found = { kind: "sandbox-exec", path: macPath };
+            } catch {
+                /* missing or not executable: keep "none" */
+            }
+            break;
+        }
+        case "linux":
+            // Prefer bwrap; firejail is only probed when bwrap is absent.
+            if (await probe("bwrap")) {
+                found = { kind: "bwrap" };
+            } else if (await probe("firejail")) {
+                found = { kind: "firejail" };
+            }
+            break;
+    }
+
+    cached = found;
+    return cached;
+}
+
+/**
+ * Wraps `cmd`/`args` with the detected sandbox and returns the new pair
+ * plus the kind applied; with no sandbox the input is returned untouched
+ * as `kind: "none"`. `cwdAllowed` is the run's writable temp directory:
+ * bwrap binds everything else read-only, and the sandbox-exec profile
+ * allows reads everywhere but writes only beneath `cwdAllowed`.
+ */
+export async function wrapWithSandbox(
+    cmd: string,
+    args: string[],
+    cwdAllowed: string
+): Promise<{ cmd: string; args: string[]; kind: SandboxKind }> {
+    const detected = await detectSandbox();
+    if (detected.kind === "bwrap") {
+        return {
+            cmd: "bwrap",
+            args: [
+                "--ro-bind",
+                "/",
+                "/",
+                "--tmpfs",
+                "/tmp",
+                "--bind",
+                cwdAllowed,
+                cwdAllowed,
+                "--proc",
+                "/proc",
+                "--dev",
+                "/dev",
+                "--unshare-all",
+                "--die-with-parent",
+                "--",
+                cmd,
+                ...args
+            ],
+            kind: "bwrap"
+        };
+    }
+    if (detected.kind === "firejail") {
+        return {
+            cmd: "firejail",
+            args: [
+                "--quiet",
+                "--noprofile",
+                "--net=none",
+                "--private-tmp",
+                `--whitelist=${cwdAllowed}`,
+                "--",
+                cmd,
+                ...args
+            ],
+            kind: "firejail"
+        };
+    }
+    if (detected.kind === "sandbox-exec") {
+        // Seatbelt matches resolved paths (macOS tmp sits behind /var ->
+        // /private/var), so canonicalise; escape `\` and `"` for the profile.
+        const resolved = await realpath(cwdAllowed).catch(() => cwdAllowed);
+        const writable = resolved.replace(/[\\"]/g, "\\$&");
+        const profile = `(version 1)
+(deny default)
+(allow process-fork)
+(allow process-exec)
+(allow file-read*)
+(allow file-write* (subpath "${writable}"))
+(allow file-write* (regex #"^/dev/null$"))
+(allow file-write* (regex #"^/dev/dtracehelper$"))
+(allow sysctl-read)
+(allow mach-lookup)
+(allow signal (target self))
+(allow ipc-posix-shm)
+(deny network*)`;
+        return {
+            cmd: "/usr/bin/sandbox-exec",
+            args: ["-p", profile, cmd, ...args],
+            kind: "sandbox-exec"
+        };
+    }
+    return { cmd, args, kind: "none" };
+}
+
+/** Test helper — resets `cached` so the next `detectSandbox()` re-probes. */
+export function __resetSandboxCacheForTest(): void {
+    cached = undefined;
+}
diff --git a/src/runner/subprocess-runner.ts b/src/runner/subprocess-runner.ts
new file mode 100644
index 0000000..8249131
--- /dev/null
+++ b/src/runner/subprocess-runner.ts
@@ -0,0 +1,353 @@
+/**
+ * Plain-subprocess `LocalRunner` implementation.
+ *
+ * Per-language registry (currently `python3`) describes how to:
+ *   - probe whether the runtime is available on PATH
+ *   - spawn the runtime against a source file written to the run's
+ *     temp dir
+ *
+ * Probes run lazily on the first `run()` for the language and the
+ * results are cached for the lifetime of the process.
+ *
+ * Safety nets every run gets, even with no OS sandbox:
+ *   - per-process wall-clock timeout (default 5_000 ms; configurable
+ *     per `RunInput`)
+ *   - clean env (only PATH / LANG forwarded, HOME set to the run's
+ *     temp dir — secrets in the user's shell never leak in)
+ *   - cwd is a freshly-mkdtemp'd directory under the OS tmp; it is
+ *     removed after the run regardless of outcome
+ *   - stdout/stderr captured with a 1 MB ceiling; runaway output gets
+ *     truncated with a marker rather than blowing memory
+ */
+import { execFile as execFileCb, spawn } from "node:child_process";
+import { mkdtemp, rm, writeFile } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { performance } from "node:perf_hooks";
+import { promisify } from "node:util";
+
+import type {
+    RunInput,
+    RunResult,
+    RunnerCapabilities,
+    RunnerLanguage,
+    SandboxKind
+} from "../types/index.js";
+import { ErrorCode, LeetCodeError } from "../types/index.js";
+import logger from "../utils/logger.js";
+import type { LocalRunner } from "./runner.js";
+import { IMPLEMENTED_LANGUAGES, SUPPORTED_LANGUAGES } from "./runner.js";
+import { wrapWithSandbox } from "./sandbox.js";
+
+// `execFile` (no shell) — never `promisify(exec)`, which routes through
+// `/bin/sh -c` and is a shell-expansion foot-gun if anyone interpolates
+// a dynamic value into a probe in the future.
+const execFile = promisify(execFileCb);
+
+const MAX_OUTPUT_BYTES = 1_000_000; // 1 MB per stream
+const DEFAULT_TIMEOUT_MS = 5_000; // used when `RunInput.timeoutMs` is omitted
+const TRUNCATION_MARKER = "\n[...output truncated at 1 MB...]"; // appended by `clampOutput`
+
+interface LanguageSpec {
+    /** File extension (without dot) used for the temp source file. */
+    extension: string;
+    /** `[binary, args]` to probe — a successful exit means available. */
+    probe: { cmd: string; args: string[] };
+    /**
+     * Build the spawn command for the source file written for this
+     * run. Compiled languages (Go, Java) are stubs that throw
+     * `RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE` until compile steps land.
+     */
+    buildArgs(sourcePath: string): { cmd: string; args: string[] };
+}
+
+/** Creates a `buildArgs` stub that always throws "not implemented". */
+const notYetImplemented = (message: string) => (): never => {
+    throw new LeetCodeError(
+        ErrorCode.RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE,
+        message
+    );
+};
+
+/**
+ * Per-language registry: temp-file extension, availability probe and
+ * spawn command. Typed as a `Record` over `RunnerLanguage`, so every
+ * language in that union must have an entry here.
+ */
+const LANGUAGES: Record<RunnerLanguage, LanguageSpec> = {
+    python3: {
+        extension: "py",
+        probe: { cmd: "python3", args: ["--version"] },
+        buildArgs: (sourcePath) => ({ cmd: "python3", args: [sourcePath] })
+    },
+    // Phase 4b/4c stubs — kept in the registry so the type system
+    // forces them to stay in sync with `RunnerLanguage`. The runner
+    // refuses to use these until real harnesses are wired in.
+    go: {
+        extension: "go",
+        probe: { cmd: "go", args: ["version"] },
+        buildArgs: notYetImplemented("Go runner ships in Phase 4b")
+    },
+    java: {
+        extension: "java",
+        probe: { cmd: "java", args: ["-version"] },
+        buildArgs: notYetImplemented("Java runner ships in Phase 4c")
+    }
+};
+
+interface ProbeResult {
+    available: boolean; // true when the probe command ran successfully
+    version?: string; // first line of the probe's stdout (or stderr)
+    path?: string; // trimmed output of `which <cmd>`, when `which` worked
+}
+
+const probeCache = new Map<RunnerLanguage, ProbeResult>(); // per-process memo of probe outcomes
+
+/**
+ * Runs the language's probe command once and memoises the outcome
+ * (success or failure) in `probeCache` for the rest of the process.
+ *
+ * On success the first output line is kept as `version` and `which`
+ * is consulted, best-effort, for `path`. Any probe failure is logged
+ * at debug level and reported as `{ available: false }` — this
+ * function does not throw for a missing runtime.
+ */
+async function probeLanguage(language: RunnerLanguage): Promise<ProbeResult> {
+    const memoised = probeCache.get(language);
+    if (memoised !== undefined) {
+        return memoised;
+    }
+    const { cmd, args } = LANGUAGES[language].probe;
+    let outcome: ProbeResult;
+    try {
+        const probeOutput = await execFile(cmd, args, { timeout: 2000 });
+        // `python3 --version` and `go version` print to stdout, while
+        // `java -version` historically prints to stderr — take either.
+        const banner = probeOutput.stdout || probeOutput.stderr || "";
+        const firstLine = banner.split("\n")[0]?.trim();
+        outcome = { available: true, version: firstLine || undefined };
+        // Best-effort lookup of where the binary lives.
+        try {
+            const located = await execFile("which", [cmd], { timeout: 1000 });
+            outcome.path = located.stdout.trim() || undefined;
+        } catch {
+            /* `which` may not exist (Windows); leave `path` undefined */
+        }
+        probeCache.set(language, outcome);
+    } catch (error) {
+        outcome = { available: false };
+        probeCache.set(language, outcome);
+        logger.debug(
+            { language, error: (error as Error)?.message },
+            "Language probe failed"
+        );
+    }
+    return outcome;
+}
+
+/** Test helper — empties `probeCache` so the next probe re-runs the command. */
+export function __resetProbeCacheForTest(): void {
+    probeCache.clear();
+}
+
+function clampOutput(buf: Buffer): string {
+    // Within the ceiling: decode as-is. Over it: keep the first
+    // MAX_OUTPUT_BYTES bytes and append the truncation marker.
+    const overflow = buf.length > MAX_OUTPUT_BYTES;
+    const kept = overflow ? buf.subarray(0, MAX_OUTPUT_BYTES) : buf;
+    const text = kept.toString("utf-8");
+    return overflow ? text + TRUNCATION_MARKER : text;
+}
+
+export class SubprocessRunner implements LocalRunner {
+    /**
+     * Reports, for each supported language, whether its runtime was
+     * detected, plus which OS sandbox (if any) was found on this host.
+     */
+    async capabilities(): Promise<RunnerCapabilities> {
+        // Probe every supported language concurrently (results are
+        // cached by `probeLanguage`) and flatten each into a report entry.
+        const describeLanguage = async (language: RunnerLanguage) => {
+            const { available, version, path } = await probeLanguage(language);
+            return { language, available, version, path };
+        };
+        const languages = await Promise.all(
+            SUPPORTED_LANGUAGES.map(describeLanguage)
+        );
+        // NOTE(review): `./sandbox.js` is also imported statically by this
+        // module (for `wrapWithSandbox`), so this lazy import presumably
+        // does not break any import cycle — confirm before relying on it.
+        const sandboxModule = await import("./sandbox.js");
+        const { kind } = await sandboxModule.detectSandbox();
+        return {
+            languages,
+            sandbox: { kind, available: kind !== "none" }
+        };
+    }
+
+    /**
+     * Runs `input.code` in a fresh temp dir that is always removed after.
+     * Throws `LeetCodeError` for unimplemented languages / missing runtimes.
+     */
+    async run(input: RunInput): Promise<RunResult> {
+        const { language, code } = input;
+        if (!IMPLEMENTED_LANGUAGES.includes(language)) {
+            throw new LeetCodeError(
+                ErrorCode.RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE,
+                `Local runner has no harness for ${language} yet`
+            );
+        }
+        const { available } = await probeLanguage(language);
+        if (!available) {
+            throw new LeetCodeError(
+                ErrorCode.LANGUAGE_RUNTIME_NOT_FOUND,
+                `Required runtime for ${language} not found on PATH`
+            );
+        }
+        const spec = LANGUAGES[language];
+        const workDir = await mkdtemp(join(tmpdir(), "leetcode-mcp-run-"));
+        const sourcePath = join(workDir, `solution.${spec.extension}`);
+        // Best-effort cleanup: a failed removal is only logged at debug level.
+        const removeWorkDir = async (): Promise<void> => {
+            try {
+                await rm(workDir, { recursive: true, force: true });
+            } catch (error) {
+                logger.debug(
+                    { error: (error as Error)?.message, workDir },
+                    "Failed to clean up runner workdir"
+                );
+            }
+        };
+        try {
+            await writeFile(sourcePath, code, "utf-8");
+            const { cmd, args } = spec.buildArgs(sourcePath);
+            const wrapped = await wrapWithSandbox(cmd, args, workDir);
+            return await this.spawnAndCapture({
+                cmd: wrapped.cmd,
+                args: wrapped.args,
+                cwd: workDir,
+                timeoutMs: input.timeoutMs ?? DEFAULT_TIMEOUT_MS,
+                sandbox: wrapped.kind
+            });
+        } finally {
+            await removeWorkDir();
+        }
+    }
+
+    private spawnAndCapture(options: {
+        cmd: string;
+        args: string[];
+        cwd: string;
+        timeoutMs: number;
+        sandbox: SandboxKind;
+    }): Promise<RunResult> {
+        return new Promise((resolve) => {
+            const start = performance.now();
+            const child = spawn(options.cmd, options.args, {
+                cwd: options.cwd,
+                env: {
+                    PATH: process.env.PATH ?? "",
+                    HOME: options.cwd,
+                    LANG: process.env.LANG ?? "C.UTF-8"
+                },
+                stdio: ["ignore", "pipe", "pipe"]
+            });
+
+            const stdout: Buffer[] = [];
+            const stderr: Buffer[] = [];
+            let stdoutBytes = 0;
+            let stderrBytes = 0;
+            let timedOut = false;
+            let killTimer: NodeJS.Timeout | undefined;
+
+            // Tight guard: never let the buffered total exceed
+            // `MAX_OUTPUT_BYTES` even by a chunk. We slice the
+            // overflowing chunk to the exact remaining headroom and
+            // drop the rest. `clampOutput` still runs at finalize as a
+            // belt-and-braces final cap.
+            const captureChunk = (
+                buffers: Buffer[],
+                bytes: number,
+                chunk: Buffer
+            ): number => {
+                const remaining = MAX_OUTPUT_BYTES - bytes;
+                if (remaining <= 0) {
+                    return bytes;
+                }
+                if (chunk.length <= remaining) {
+                    buffers.push(chunk);
+                    return bytes + chunk.length;
+                }
+                buffers.push(chunk.subarray(0, remaining));
+                return bytes + remaining;
+            };
+
+            child.stdout?.on("data", (chunk: Buffer) => {
+                stdoutBytes = captureChunk(stdout, stdoutBytes, chunk);
+            });
+            child.stderr?.on("data", (chunk: Buffer) => {
+                stderrBytes = captureChunk(stderr, stderrBytes, chunk);
+            });
+
+            const timer = setTimeout(() => {
+                timedOut = true;
+                // SIGTERM first; if the child ignores it, hard SIGKILL
+                // 500 ms later. Belt + braces for runaway loops.
+                child.kill("SIGTERM");
+                killTimer = setTimeout(() => child.kill("SIGKILL"), 500);
+            }, options.timeoutMs);
+
+            const finalize = (exitCode: number | null): void => {
+                clearTimeout(timer);
+                if (killTimer) {
+                    clearTimeout(killTimer);
+                }
+                const durationMs = Math.round(performance.now() - start);
+                const passed = !timedOut && exitCode === 0;
+                resolve({
+                    passed,
+                    exitCode,
+                    stdout: clampOutput(Buffer.concat(stdout)),
+                    stderr: clampOutput(Buffer.concat(stderr)),
+                    timedOut,
+                    durationMs,
+                    sandbox: options.sandbox,
+                    warning:
+                        options.sandbox === "none"
+                            ? "No OS sandbox available on this host; ran without isolation."
+                            : undefined
+                });
+            };
+
+            child.on("close", (code, signal) => {
+                if (signal && code === null) {
+                    finalize(null);
+                } else {
+                    finalize(code);
+                }
+            });
+            child.on("error", (error) => {
+                logger.warn(
+                    { error: error.message, cmd: options.cmd },
+                    "Runner subprocess errored before exit"
+                );
+                clearTimeout(timer);
+                if (killTimer) {
+                    clearTimeout(killTimer);
+                }
+                resolve({
+                    passed: false,
+                    exitCode: null,
+                    stdout: clampOutput(Buffer.concat(stdout)),
+                    stderr:
+                        clampOutput(Buffer.concat(stderr)) +
+                        `\n[runner error: ${error.message}]`,
+                    timedOut: false,
+                    durationMs: Math.round(performance.now() - start),
+                    sandbox: options.sandbox,
+                    warning: undefined
+                });
+            });
+        });
+    }
+}
diff --git a/src/types/errors.ts b/src/types/errors.ts
index 971b640..b3ffad1 100644
--- a/src/types/errors.ts
+++ b/src/types/errors.ts
@@ -38,7 +38,37 @@ export const ErrorCode = {
      * particular problem slug, but no `start_problem` has been called for
      * it (or the session was reset).
      */
-    SESSION_NOT_FOUND: "SESSION_NOT_FOUND"
+    SESSION_NOT_FOUND: "SESSION_NOT_FOUND",
+    /**
+     * `run_local_tests` was asked for a language the local runner has no
+     * harness for. `submit_solution` keeps working for these languages —
+     * the runner is purely additive.
+     */
+    RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE: "RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE",
+    /**
+     * The language is supported in principle but the required runtime
+     * binary (e.g. `python3`, `go`, `java`) was not found on PATH. The
+     * `doctor` subcommand reports which runtimes are detected.
+     */
+    LANGUAGE_RUNTIME_NOT_FOUND: "LANGUAGE_RUNTIME_NOT_FOUND",
+    /**
+     * The user's code exceeded the per-run wall-clock budget. The runner
+     * killed the process; partial output (if any) is included in the
+     * result envelope.
+     */
+    RUNNER_TIMEOUT: "RUNNER_TIMEOUT",
+    /**
+     * `LEETCODE_MCP_REQUIRE_SANDBOX=1` is set but no OS sandbox tool was
+     * found on this host. The runner refuses to fall back to the unsandboxed
+     * subprocess path.
+     */
+    SANDBOX_REQUIRED: "SANDBOX_REQUIRED",
+    /**
+     * Strict mode is enabled (`LEETCODE_MCP_STRICT_MODE=1`) and
+     * `submit_solution` was called before `run_local_tests` last passed.
+     * Drives the recommended local-first practice loop.
+     */
+    LOCAL_TESTS_NOT_PASSED: "LOCAL_TESTS_NOT_PASSED"
 } as const;
 
 export type ErrorCodeValue = (typeof ErrorCode)[keyof typeof ErrorCode];
diff --git a/src/types/index.ts b/src/types/index.ts
index 7e56d5f..e9f0d85 100644
--- a/src/types/index.ts
+++ b/src/types/index.ts
@@ -7,6 +7,7 @@
 export * from "./credentials.js";
 export * from "./errors.js";
 export * from "./problem.js";
+export * from "./runner.js";
 export * from "./session.js";
 export * from "./solution.js";
 export * from "./submission.js";
diff --git a/src/types/runner.ts b/src/types/runner.ts
new file mode 100644
index 0000000..8f3078c
--- /dev/null
+++ b/src/types/runner.ts
@@ -0,0 +1,94 @@
+/**
+ * Wire types for the local code runner introduced in Phase 4.
+ *
+ * The runner is intentionally simple: callers hand it a string of code
+ * plus a language tag, and get back a result envelope describing what the
+ * subprocess did. There is no per-problem harness logic at this layer —
+ * harnesses live one floor up, in `src/runner/harnesses/*`, and inject
+ * test scaffolding into the source before it reaches the runner.
+ */
+
+/**
+ * Language tags accepted as `RunInput.language`.
+ *
+ * Phase 4a ships `python3` only; Phase 4b/4c add `go` and `java`. Until
+ * a harness lands, `run_local_tests` rejects a tag with
+ * `RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE`; `submit_solution` is
+ * unaffected (the runner is purely additive).
+ */
+export type RunnerLanguage = "python3" | "go" | "java";
+
+/**
+ * What the runner detected when it tried to spawn an isolated subprocess.
+ *
+ * - `none`     — plain subprocess, no OS-level sandbox (fallback path)
+ * - `bwrap`    — Linux: bubblewrap with read-only fs + writable tmp + no net
+ * - `firejail` — Linux fallback when bwrap isn't installed
+ * - `sandbox-exec` — macOS: built-in `sandbox-exec` profile
+ *
+ * Reported alongside every `RunResult` so callers can show "ran in
+ * bwrap sandbox" without parsing logs; completed `none` runs carry a `warning`.
+ */
+export type SandboxKind = "none" | "bwrap" | "firejail" | "sandbox-exec";
+
+export interface RunInput {
+    /**
+     * LeetCode problem slug. Used by the tool layer to look up the
+     * active session and update `lastLocalRunPassed`. Not consumed by
+     * the runner itself.
+     */
+    titleSlug: string;
+    /** Language to run as; selects the runtime and source-file extension. */
+    language: RunnerLanguage;
+    /**
+     * Source code to execute, exactly as the runner should receive it.
+     * The harness layer is responsible for any wrapping, scaffolding, or
+     * test-driver injection before this string is built.
+     */
+    code: string;
+    /**
+     * Wall-clock budget in milliseconds. Defaults to 5_000 if omitted.
+     * On expiry the runner sends SIGTERM (SIGKILL 500 ms later) and
+     * returns `timedOut: true` with whatever output was captured.
+     */
+    timeoutMs?: number;
+}
+
+export interface RunResult {
+    /** Convenience flag: `exitCode === 0 && !timedOut`. */
+    passed: boolean;
+    /** Exit code, or `null` if ended by a signal or never spawned. */
+    exitCode: number | null;
+    /** Captured stdout, truncated to ~1 MB. */
+    stdout: string;
+    /** Captured stderr (~1 MB cap); runner errors are appended to it. */
+    stderr: string;
+    /** Whether the wall-clock budget was hit. */
+    timedOut: boolean;
+    /** Wall-clock duration of the run, rounded to whole milliseconds. */
+    durationMs: number;
+    /** Which sandbox (if any) was used. See {@link SandboxKind}. */
+    sandbox: SandboxKind;
+    /**
+     * Human-readable note when something interesting happened that the
+     * caller should know about — e.g. "no OS sandbox available on this
+     * host; ran without isolation". Omitted when there is nothing to report.
+     */
+    warning?: string;
+}
+
+/** Capability snapshot the `doctor` subcommand renders to the user. */
+export interface RunnerCapabilities {
+    /** One entry per supported language; `available` marks a detected runtime. */
+    languages: Array<{
+        language: RunnerLanguage;
+        available: boolean;
+        version?: string;
+        path?: string;
+    }>;
+    /** The sandbox detected on this host; `available` is false for `none`. */
+    sandbox: {
+        kind: SandboxKind;
+        available: boolean;
+    };
+}
diff --git a/tests/domain/session-service.test.ts b/tests/domain/session-service.test.ts
new file mode 100644
index 0000000..bea77eb
--- /dev/null
+++ b/tests/domain/session-service.test.ts
@@ -0,0 +1,82 @@
+/**
+ * Unit tests for SessionService methods that don't already have
+ * coverage via the e2e/integration suites — primarily the Phase 4
+ * additions (`requireSession`, `recordLocalRun`).
+ */
+import { mkdtemp, rm } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { afterEach, beforeEach, describe, expect, it } from "vitest";
+import { SessionService } from "../../src/domain/session-service.js";
+import { FileSessionStore } from "../../src/domain/session-store.js";
+import { ErrorCode, isLeetCodeError } from "../../src/types/index.js";
+
+describe("SessionService — Phase 4 additions", () => {
+    let dir: string;
+    let service: SessionService;
+
+    beforeEach(async () => {
+        dir = await mkdtemp(join(tmpdir(), "leetcode-mcp-svc-"));
+        service = new SessionService(new FileSessionStore({ dir }));
+    });
+
+    afterEach(async () => {
+        await rm(dir, { recursive: true, force: true });
+    });
+
+    describe("requireSession", () => {
+        it("returns the session when present", async () => {
+            const session = await service.startOrResume({ slug: "two-sum" });
+            const fetched = await service.requireSession("two-sum");
+            expect(fetched.slug).toBe(session.slug);
+        });
+
+        it("throws SESSION_NOT_FOUND when no session exists", async () => {
+            await expect(async () => {
+                await service.requireSession("never-opened");
+            }).rejects.toSatisfy(
+                (error: unknown) =>
+                    isLeetCodeError(error) &&
+                    error.code === ErrorCode.SESSION_NOT_FOUND
+            );
+        });
+    });
+
+    describe("recordLocalRun", () => {
+        it("increments attempts and stores lastLocalRunPassed", async () => {
+            await service.startOrResume({ slug: "two-sum" });
+
+            const after1 = await service.recordLocalRun("two-sum", false);
+            expect(after1.attempts).toBe(1);
+            expect(after1.lastLocalRunPassed).toBe(false);
+            expect(after1.status).toBe("attempting");
+
+            const after2 = await service.recordLocalRun("two-sum", true);
+            expect(after2.attempts).toBe(2);
+            expect(after2.lastLocalRunPassed).toBe(true);
+            // Status should not regress from "attempting".
+            expect(after2.status).toBe("attempting");
+        });
+
+        it("persists across service instances", async () => {
+            await service.startOrResume({ slug: "two-sum" });
+            await service.recordLocalRun("two-sum", true);
+
+            // Reload from disk via a fresh service.
+            const reloaded = new SessionService(new FileSessionStore({ dir }));
+            const session = await reloaded.requireSession("two-sum");
+            expect(session.attempts).toBe(1);
+            expect(session.lastLocalRunPassed).toBe(true);
+        });
+
+        it("throws SESSION_NOT_FOUND when no session exists", async () => {
+            await expect(async () => {
+                await service.recordLocalRun("never-opened", true);
+            }).rejects.toSatisfy(
+                (error: unknown) =>
+                    isLeetCodeError(error) &&
+                    error.code === ErrorCode.SESSION_NOT_FOUND
+            );
+        });
+    });
+});
diff --git a/tests/e2e/lifecycle.test.ts b/tests/e2e/lifecycle.test.ts
index a7851d3..aac2cd7 100644
--- a/tests/e2e/lifecycle.test.ts
+++ b/tests/e2e/lifecycle.test.ts
@@ -51,6 +51,8 @@ describe("e2e: server lifecycle", () => {
             "list_problem_solutions",
             "request_hint",
             "reset_session",
+            "run_local_tests",
+            "runner_doctor",
             "save_leetcode_credentials",
             "search_problems",
             "start_leetcode_auth",
diff --git a/tests/e2e/runner.test.ts b/tests/e2e/runner.test.ts
new file mode 100644
index 0000000..2c2bb39
--- /dev/null
+++ b/tests/e2e/runner.test.ts
@@ -0,0 +1,258 @@
+/**
+ * Local-runner e2e: spawn the real `build/index.js`, drive
+ * `runner_doctor` and `run_local_tests` over the wire, and assert the
+ * runner actually executes Python on the host.
+ *
+ * Skipped automatically on hosts without `python3` so the suite stays
+ * portable; the project's CI image has it.
+ */
+import { execFileSync } from "node:child_process";
+import { afterEach, describe, expect, it } from "vitest";
+import { spawnServer, type SpawnedServer } from "./harness/spawn-server.js";
+
+interface ToolTextResult {
+    content: Array<{ type: string; text: string }>;
+}
+
+const TWO_SUM_PROBLEM = {
+    questionId: "1",
+    questionFrontendId: "1",
+    title: "Two Sum",
+    titleSlug: "two-sum",
+    difficulty: "Easy",
+    isPaidOnly: false,
+    content: "<p>Two Sum problem</p>",
+    topicTags: [{ name: "Array", slug: "array" }],
+    codeSnippets: [
+        {
+            lang: "Python3",
+            langSlug: "python3",
+            code: "class Solution:\n    def twoSum(self, nums, target):\n        pass\n"
+        }
+    ],
+    similarQuestions: "[]",
+    exampleTestcases: "[2,7,11,15]\n9",
+    hints: [],
+    stats: '{"totalAccepted":"10M","totalSubmission":"20M","acRate":"50.0%"}'
+};
+
+const FIXTURE = {
+    graphql: [
+        {
+            operationContains: "question(titleSlug:",
+            response: { data: { question: TWO_SUM_PROBLEM } }
+        }
+    ]
+};
+
+function pythonAvailable(): boolean {
+    let found = false;
+    try {
+        execFileSync("python3", ["--version"], { stdio: "ignore" });
+        found = true;
+    } catch { /* not runnable here: leave `found` false */ }
+    return found;
+}
+
+const PYTHON_PRESENT = pythonAvailable();
+
+describe.skipIf(!PYTHON_PRESENT)("e2e: local runner (python3)", () => {
+    let spawned: SpawnedServer | undefined;
+
+    afterEach(async () => {
+        if (spawned) {
+            await spawned.cleanup();
+            spawned = undefined;
+        }
+    });
+
+    it("runner_doctor reports python3 availability", async () => {
+        spawned = await spawnServer({ fixture: FIXTURE });
+
+        const doctor = (await spawned.client.callTool({
+            name: "runner_doctor",
+            arguments: {}
+        })) as ToolTextResult;
+
+        const payload = JSON.parse(doctor.content[0].text);
+        expect(payload.languages).toBeDefined();
+        const py = payload.languages.find(
+            (l: { language: string }) => l.language === "python3"
+        );
+        expect(py?.available).toBe(true);
+        expect(payload.sandbox).toBeDefined();
+    });
+
+    it("rejects run_local_tests when no session is open", async () => {
+        spawned = await spawnServer({ fixture: FIXTURE });
+
+        const result = (await spawned.client.callTool({
+            name: "run_local_tests",
+            arguments: {
+                titleSlug: "two-sum",
+                language: "python3",
+                code: "print('ok')"
+            }
+        })) as ToolTextResult;
+
+        const payload = JSON.parse(result.content[0].text);
+        expect(payload.code).toBe("SESSION_NOT_FOUND");
+    });
+
+    it("executes a passing python script and updates the session", async () => {
+        spawned = await spawnServer({ fixture: FIXTURE });
+
+        await spawned.client.callTool({
+            name: "start_problem",
+            arguments: { titleSlug: "two-sum", language: "python3" }
+        });
+
+        const run = (await spawned.client.callTool({
+            name: "run_local_tests",
+            arguments: {
+                titleSlug: "two-sum",
+                language: "python3",
+                code: 'print("hi")\nassert 1 + 1 == 2'
+            }
+        })) as ToolTextResult;
+
+        const payload = JSON.parse(run.content[0].text);
+        expect(payload.titleSlug).toBe("two-sum");
+        expect(payload.result.passed).toBe(true);
+        expect(payload.result.exitCode).toBe(0);
+        expect(payload.result.timedOut).toBe(false);
+        expect(payload.result.stdout).toContain("hi");
+
+        // Session state is observable via get_session_state.
+        const state = (await spawned.client.callTool({
+            name: "get_session_state",
+            arguments: { titleSlug: "two-sum" }
+        })) as ToolTextResult;
+        const sessionPayload = JSON.parse(state.content[0].text);
+        expect(sessionPayload.session.lastLocalRunPassed).toBe(true);
+        expect(sessionPayload.session.attempts).toBe(1);
+    });
+
+    it("captures non-zero exit code without throwing", async () => {
+        spawned = await spawnServer({ fixture: FIXTURE });
+
+        await spawned.client.callTool({
+            name: "start_problem",
+            arguments: { titleSlug: "two-sum", language: "python3" }
+        });
+
+        const run = (await spawned.client.callTool({
+            name: "run_local_tests",
+            arguments: {
+                titleSlug: "two-sum",
+                language: "python3",
+                code: "raise SystemExit(2)"
+            }
+        })) as ToolTextResult;
+
+        const payload = JSON.parse(run.content[0].text);
+        expect(payload.result.passed).toBe(false);
+        expect(payload.result.exitCode).toBe(2);
+
+        const state = (await spawned.client.callTool({
+            name: "get_session_state",
+            arguments: { titleSlug: "two-sum" }
+        })) as ToolTextResult;
+        const sessionPayload = JSON.parse(state.content[0].text);
+        expect(sessionPayload.session.lastLocalRunPassed).toBe(false);
+    });
+
+    it("kills runaway processes after the timeout budget", async () => {
+        spawned = await spawnServer({ fixture: FIXTURE });
+
+        await spawned.client.callTool({
+            name: "start_problem",
+            arguments: { titleSlug: "two-sum", language: "python3" }
+        });
+
+        const run = (await spawned.client.callTool({
+            name: "run_local_tests",
+            arguments: {
+                titleSlug: "two-sum",
+                language: "python3",
+                code: "while True: pass",
+                timeoutMs: 500
+            }
+        })) as ToolTextResult;
+
+        const payload = JSON.parse(run.content[0].text);
+        expect(payload.result.timedOut).toBe(true);
+        expect(payload.result.passed).toBe(false);
+    });
+
+    it("rejects unimplemented languages with RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE", async () => {
+        spawned = await spawnServer({ fixture: FIXTURE });
+
+        await spawned.client.callTool({
+            name: "start_problem",
+            arguments: { titleSlug: "two-sum", language: "go" }
+        });
+
+        const run = (await spawned.client.callTool({
+            name: "run_local_tests",
+            arguments: {
+                titleSlug: "two-sum",
+                language: "go",
+                code: "package main\nfunc main() {}"
+            }
+        })) as ToolTextResult;
+
+        const payload = JSON.parse(run.content[0].text);
+        expect(payload.code).toBe("RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE");
+    });
+
+    it("blocks submit_solution under strict mode until run_local_tests passes", async () => {
+        spawned = await spawnServer({
+            fixture: FIXTURE,
+            env: { LEETCODE_MCP_STRICT_MODE: "1" }
+        });
+
+        await spawned.client.callTool({
+            name: "start_problem",
+            arguments: { titleSlug: "two-sum", language: "python3" }
+        });
+
+        // First submit attempt: no run_local_tests yet → rejected.
+        const blocked = (await spawned.client.callTool({
+            name: "submit_solution",
+            arguments: {
+                problemSlug: "two-sum",
+                code: "def twoSum(nums, target): pass",
+                language: "python3"
+            }
+        })) as ToolTextResult;
+        const blockedPayload = JSON.parse(blocked.content[0].text);
+        expect(blockedPayload.code).toBe("LOCAL_TESTS_NOT_PASSED");
+
+        // Run locals successfully.
+        const run = (await spawned.client.callTool({
+            name: "run_local_tests",
+            arguments: {
+                titleSlug: "two-sum",
+                language: "python3",
+                code: 'print("ok")'
+            }
+        })) as ToolTextResult;
+        const runPayload = JSON.parse(run.content[0].text);
+        expect(runPayload.result.passed).toBe(true);
+
+        // Submit again: strict mode now permits it (the upstream
+        // request itself will fail via nock — we don't care; the gate
+        // is what we're locking down here).
+        const allowed = (await spawned.client.callTool({
+            name: "submit_solution",
+            arguments: {
+                problemSlug: "two-sum",
+                code: "def twoSum(nums, target): pass",
+                language: "python3"
+            }
+        })) as ToolTextResult;
+        const allowedPayload = JSON.parse(allowed.content[0].text);
+        expect(allowedPayload.code).not.toBe("LOCAL_TESTS_NOT_PASSED");
+    });
+});
diff --git a/tests/integration/runner-tools-integration.test.ts b/tests/integration/runner-tools-integration.test.ts
new file mode 100644
index 0000000..b563913
--- /dev/null
+++ b/tests/integration/runner-tools-integration.test.ts
@@ -0,0 +1,251 @@
+/**
+ * Runner Tools Integration Tests
+ *
+ * Drives `run_local_tests` and `runner_doctor` through the MCP wire,
+ * with a fake `LocalRunner` that records what it was called with so we
+ * can assert the tool layer's behaviour without depending on `python3`
+ * being installed where these tests run.
+ */
+import { mkdtemp, rm } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
+import { SessionService } from "../../src/domain/session-service.js";
+import { FileSessionStore } from "../../src/domain/session-store.js";
+import { registerRunnerTools } from "../../src/mcp/tools/runner-tools.js";
+import type { LocalRunner } from "../../src/runner/runner.js";
+import {
+    ErrorCode,
+    LeetCodeError,
+    type RunInput,
+    type RunResult,
+    type RunnerCapabilities
+} from "../../src/types/index.js";
+import { createMockLeetCodeService } from "../helpers/mock-leetcode.js";
+import type { TestClientPair } from "../helpers/test-client.js";
+import { createTestClient } from "../helpers/test-client.js";
+import { INTEGRATION_TEST_TIMEOUT, assertions } from "./setup.js";
+
+const HAPPY_RESULT: RunResult = {
+    passed: true,
+    exitCode: 0,
+    stdout: "ok\n",
+    stderr: "",
+    timedOut: false,
+    durationMs: 42,
+    sandbox: "none",
+    warning: "No OS sandbox available on this host; ran without isolation."
+};
+
+const FAKE_CAPS: RunnerCapabilities = {
+    languages: [
+        { language: "python3", available: true, version: "Python 3.12.0" },
+        { language: "go", available: false },
+        { language: "java", available: false }
+    ],
+    sandbox: { kind: "none", available: false }
+};
+
+interface FakeRunnerOptions {
+    nextResult?: RunResult;
+    runError?: unknown;
+}
+
+/** Recording `LocalRunner` stub: `runs` collects each `run` input. */
+function createFakeRunner(options: FakeRunnerOptions = {}): LocalRunner & {
+    runs: RunInput[];
+} {
+    // Every input handed to `run` is appended here for later assertions.
+    const recorded: RunInput[] = [];
+    const run = async (input: RunInput): Promise<RunResult> => {
+        recorded.push(input);
+        const { runError, nextResult } = options;
+        if (runError) {
+            throw runError;
+        }
+        return nextResult ?? HAPPY_RESULT;
+    };
+    // Capabilities are canned: always the module-level `FAKE_CAPS`.
+    const capabilities = async (): Promise<RunnerCapabilities> => FAKE_CAPS;
+    return { runs: recorded, run, capabilities };
+}
+
+describe("Runner Tools Integration", () => {
+    let testClient: TestClientPair;
+    let mockService: ReturnType<typeof createMockLeetCodeService>;
+    let sessions: SessionService;
+    let sessionDir: string;
+    let runner: ReturnType<typeof createFakeRunner>;
+
+    beforeEach(async () => {
+        mockService = createMockLeetCodeService();
+        sessionDir = await mkdtemp(join(tmpdir(), "leetcode-mcp-runner-"));
+        sessions = new SessionService(
+            new FileSessionStore({ dir: sessionDir })
+        );
+        runner = createFakeRunner();
+
+        testClient = await createTestClient({}, (server) => {
+            registerRunnerTools(server, mockService as any, sessions, runner);
+        });
+    }, INTEGRATION_TEST_TIMEOUT);
+
+    afterEach(async () => {
+        if (testClient) {
+            await testClient.cleanup();
+        }
+        await rm(sessionDir, { recursive: true, force: true });
+        vi.restoreAllMocks();
+    });
+
+    describe("run_local_tests", () => {
+        it(
+            "rejects with SESSION_NOT_FOUND when no session has been opened",
+            async () => {
+                const result: any = await testClient.client.callTool({
+                    name: "run_local_tests",
+                    arguments: {
+                        titleSlug: "two-sum",
+                        language: "python3",
+                        code: "print('hi')"
+                    }
+                });
+
+                assertions.hasToolResultStructure(result);
+                const payload = JSON.parse(result.content[0].text);
+                expect(payload.code).toBe(ErrorCode.SESSION_NOT_FOUND);
+                expect(runner.runs).toHaveLength(0);
+            },
+            INTEGRATION_TEST_TIMEOUT
+        );
+
+        it(
+            "delegates to the runner and records lastLocalRunPassed",
+            async () => {
+                await sessions.startOrResume({ slug: "two-sum" });
+
+                const result: any = await testClient.client.callTool({
+                    name: "run_local_tests",
+                    arguments: {
+                        titleSlug: "two-sum",
+                        language: "python3",
+                        code: 'print("hi")'
+                    }
+                });
+
+                assertions.hasToolResultStructure(result);
+                const payload = JSON.parse(result.content[0].text);
+                expect(payload.titleSlug).toBe("two-sum");
+                expect(payload.result.passed).toBe(true);
+                expect(runner.runs).toHaveLength(1);
+                expect(runner.runs[0].language).toBe("python3");
+                expect(runner.runs[0].code).toBe('print("hi")');
+
+                const session = await sessions.requireSession("two-sum");
+                expect(session.lastLocalRunPassed).toBe(true);
+                expect(session.attempts).toBe(1);
+            },
+            INTEGRATION_TEST_TIMEOUT
+        );
+
+        it(
+            "records lastLocalRunPassed=false on a failing run",
+            async () => {
+                await sessions.startOrResume({ slug: "two-sum" });
+                const failing = createFakeRunner({
+                    nextResult: { ...HAPPY_RESULT, passed: false, exitCode: 1 }
+                });
+                // Re-build the test client with the failing runner.
+                await testClient.cleanup();
+                testClient = await createTestClient({}, (server) => {
+                    registerRunnerTools(
+                        server,
+                        mockService as any,
+                        sessions,
+                        failing
+                    );
+                });
+
+                await testClient.client.callTool({
+                    name: "run_local_tests",
+                    arguments: {
+                        titleSlug: "two-sum",
+                        language: "python3",
+                        code: "raise SystemExit(1)"
+                    }
+                });
+
+                const session = await sessions.requireSession("two-sum");
+                expect(session.lastLocalRunPassed).toBe(false);
+                expect(session.attempts).toBe(1);
+            },
+            INTEGRATION_TEST_TIMEOUT
+        );
+
+        it(
+            "surfaces RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE thrown from the runner",
+            async () => {
+                await sessions.startOrResume({ slug: "two-sum" });
+                const broken = createFakeRunner({
+                    runError: new LeetCodeError(
+                        ErrorCode.RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE,
+                        "Go runner ships in Phase 4b"
+                    )
+                });
+                await testClient.cleanup();
+                testClient = await createTestClient({}, (server) => {
+                    registerRunnerTools(
+                        server,
+                        mockService as any,
+                        sessions,
+                        broken
+                    );
+                });
+
+                const result: any = await testClient.client.callTool({
+                    name: "run_local_tests",
+                    arguments: {
+                        titleSlug: "two-sum",
+                        language: "go",
+                        code: "package main"
+                    }
+                });
+
+                assertions.hasToolResultStructure(result);
+                const payload = JSON.parse(result.content[0].text);
+                expect(payload.code).toBe(
+                    ErrorCode.RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE
+                );
+
+                // The session attempt counter should NOT bump on a
+                // pre-run rejection.
+                const session = await sessions.requireSession("two-sum");
+                expect(session.attempts).toBe(0);
+            },
+            INTEGRATION_TEST_TIMEOUT
+        );
+    });
+
+    describe("runner_doctor", () => {
+        it(
+            "returns the capabilities snapshot",
+            async () => {
+                const result: any = await testClient.client.callTool({
+                    name: "runner_doctor",
+                    arguments: {}
+                });
+
+                assertions.hasToolResultStructure(result);
+                const payload = JSON.parse(result.content[0].text);
+                expect(payload.languages).toBeDefined();
+                expect(payload.sandbox).toBeDefined();
+                expect(
+                    payload.languages.find(
+                        (l: { language: string }) => l.language === "python3"
+                    )?.available
+                ).toBe(true);
+            },
+            INTEGRATION_TEST_TIMEOUT
+        );
+    });
+});
diff --git a/tests/integration/submission-tools-integration.test.ts b/tests/integration/submission-tools-integration.test.ts
index aa60787..720ebf4 100644
--- a/tests/integration/submission-tools-integration.test.ts
+++ b/tests/integration/submission-tools-integration.test.ts
@@ -2,8 +2,14 @@
  * Submission Tools Integration Tests
  * Tests all submission-related tools through MCP protocol
  */
+import { mkdtemp, rm } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
 import { afterEach, beforeEach, describe, expect, it } from "vitest";
+import { SessionService } from "../../src/domain/session-service.js";
+import { FileSessionStore } from "../../src/domain/session-store.js";
 import { registerSubmissionTools } from "../../src/mcp/tools/submission-tools.js";
+import { ErrorCode } from "../../src/types/index.js";
 import { createMockAuthenticatedService } from "../helpers/mock-leetcode.js";
 import type { TestClientPair } from "../helpers/test-client.js";
 import { createTestClient } from "../helpers/test-client.js";
@@ -12,13 +18,19 @@ import { INTEGRATION_TEST_TIMEOUT, assertions } from "./setup.js";
 describe("Submission Tools Integration", () => {
     let testClient: TestClientPair;
     let mockService: ReturnType<typeof createMockAuthenticatedService>;
+    let sessions: SessionService;
+    let sessionDir: string;
 
     beforeEach(async () => {
         // Use authenticated service since submission requires authentication
         mockService = createMockAuthenticatedService();
+        sessionDir = await mkdtemp(join(tmpdir(), "leetcode-mcp-sub-"));
+        sessions = new SessionService(
+            new FileSessionStore({ dir: sessionDir })
+        );
 
         testClient = await createTestClient({}, (server) => {
-            registerSubmissionTools(server, mockService as any);
+            registerSubmissionTools(server, mockService as any, sessions);
         });
     }, INTEGRATION_TEST_TIMEOUT);
 
@@ -26,6 +38,8 @@ describe("Submission Tools Integration", () => {
         if (testClient) {
             await testClient.cleanup();
         }
+        await rm(sessionDir, { recursive: true, force: true });
+        delete process.env.LEETCODE_MCP_STRICT_MODE;
     });
 
     describe("submit_solution", () => {
@@ -98,4 +112,100 @@ describe("Submission Tools Integration", () => {
             INTEGRATION_TEST_TIMEOUT
         );
     });
+
+    describe("submit_solution — strict mode", () => {
+        it(
+            "blocks submission when LEETCODE_MCP_STRICT_MODE=1 and session has not passed locals",
+            async () => {
+                process.env.LEETCODE_MCP_STRICT_MODE = "1";
+                await sessions.startOrResume({ slug: "two-sum" });
+                // No recordLocalRun call → lastLocalRunPassed is null.
+
+                const result: any = await testClient.client.callTool({
+                    name: "submit_solution",
+                    arguments: {
+                        problemSlug: "two-sum",
+                        code: "def twoSum(nums, target): pass",
+                        language: "python3"
+                    }
+                });
+
+                assertions.hasToolResultStructure(result);
+                const payload = JSON.parse(result.content[0].text as string);
+                expect(payload.code).toBe(ErrorCode.LOCAL_TESTS_NOT_PASSED);
+            },
+            INTEGRATION_TEST_TIMEOUT
+        );
+
+        it(
+            "permits submission when strict mode is on and locals have passed",
+            async () => {
+                process.env.LEETCODE_MCP_STRICT_MODE = "1";
+                await sessions.startOrResume({ slug: "two-sum" });
+                await sessions.recordLocalRun("two-sum", true);
+
+                const result: any = await testClient.client.callTool({
+                    name: "submit_solution",
+                    arguments: {
+                        problemSlug: "two-sum",
+                        code: "def twoSum(nums, target): pass",
+                        language: "python3"
+                    }
+                });
+
+                assertions.hasToolResultStructure(result);
+                const payload = JSON.parse(result.content[0].text as string);
+                // Mock service returns a normal submission envelope —
+                // we just need to confirm we didn't get the error code.
+                expect(payload.code).not.toBe(ErrorCode.LOCAL_TESTS_NOT_PASSED);
+            },
+            INTEGRATION_TEST_TIMEOUT
+        );
+
+        it(
+            "permits submission when strict mode is on but no session was opened",
+            async () => {
+                process.env.LEETCODE_MCP_STRICT_MODE = "1";
+                // Deliberately no startOrResume — strict mode should
+                // not block ad-hoc submissions outside the tutoring
+                // flow.
+
+                const result: any = await testClient.client.callTool({
+                    name: "submit_solution",
+                    arguments: {
+                        problemSlug: "two-sum",
+                        code: "def twoSum(nums, target): pass",
+                        language: "python3"
+                    }
+                });
+
+                assertions.hasToolResultStructure(result);
+                const payload = JSON.parse(result.content[0].text as string);
+                expect(payload.code).not.toBe(ErrorCode.LOCAL_TESTS_NOT_PASSED);
+            },
+            INTEGRATION_TEST_TIMEOUT
+        );
+
+        it(
+            "does not block by default (LEETCODE_MCP_STRICT_MODE unset)",
+            async () => {
+                // No env var; session exists with lastLocalRunPassed === null.
+                await sessions.startOrResume({ slug: "two-sum" });
+
+                const result: any = await testClient.client.callTool({
+                    name: "submit_solution",
+                    arguments: {
+                        problemSlug: "two-sum",
+                        code: "def twoSum(nums, target): pass",
+                        language: "python3"
+                    }
+                });
+
+                assertions.hasToolResultStructure(result);
+                const payload = JSON.parse(result.content[0].text as string);
+                expect(payload.code).not.toBe(ErrorCode.LOCAL_TESTS_NOT_PASSED);
+            },
+            INTEGRATION_TEST_TIMEOUT
+        );
+    });
 });
diff --git a/tests/runner/subprocess-runner.test.ts b/tests/runner/subprocess-runner.test.ts
new file mode 100644
index 0000000..27969dc
--- /dev/null
+++ b/tests/runner/subprocess-runner.test.ts
@@ -0,0 +1,161 @@
+/**
+ * Unit tests for the subprocess runner.
+ *
+ * These tests assume `python3` is available on PATH (the project's own
+ * CI image already has it). The runner's own probe gates each test on
+ * availability; a missing python3 produces a `LANGUAGE_RUNTIME_NOT_FOUND`
+ * which is its own first-class assertion.
+ */
+import { afterEach, beforeEach, describe, expect, it } from "vitest";
+import { __resetSandboxCacheForTest } from "../../src/runner/sandbox.js";
+import {
+    SubprocessRunner,
+    __resetProbeCacheForTest
+} from "../../src/runner/subprocess-runner.js";
+import {
+    ErrorCode,
+    isLeetCodeError,
+    type RunnerLanguage
+} from "../../src/types/index.js";
+
+describe("SubprocessRunner", () => {
+    let runner: SubprocessRunner;
+
+    beforeEach(() => {
+        // Force re-probing per test so mutations to PATH (none here, but
+        // future tests may) don't leak between cases.
+        __resetProbeCacheForTest();
+        __resetSandboxCacheForTest();
+        runner = new SubprocessRunner();
+    });
+
+    afterEach(() => {
+        __resetProbeCacheForTest();
+        __resetSandboxCacheForTest();
+    });
+
+    describe("capabilities", () => {
+        it("reports python3 as a supported language", async () => {
+            const caps = await runner.capabilities();
+            const py = caps.languages.find((l) => l.language === "python3");
+            expect(py).toBeDefined();
+            // Don't assert availability — environments without python3
+            // should still produce a coherent envelope.
+            expect(typeof py?.available).toBe("boolean");
+        });
+
+        it("reports go and java as supported languages even before they are implemented", async () => {
+            const caps = await runner.capabilities();
+            const langs = caps.languages.map((l) => l.language).sort();
+            expect(langs).toEqual(["go", "java", "python3"]);
+        });
+
+        it("includes a sandbox descriptor", async () => {
+            const caps = await runner.capabilities();
+            expect(caps.sandbox).toBeDefined();
+            expect(["none", "bwrap", "firejail", "sandbox-exec"]).toContain(
+                caps.sandbox.kind
+            );
+        });
+    });
+
+    describe("run", () => {
+        it("executes a happy-path python script", async () => {
+            const result = await runner.run({
+                titleSlug: "two-sum",
+                language: "python3",
+                code: 'print("hello"); assert 1 + 1 == 2'
+            });
+
+            expect(result.passed).toBe(true);
+            expect(result.exitCode).toBe(0);
+            expect(result.timedOut).toBe(false);
+            expect(result.stdout).toContain("hello");
+            expect(result.stderr).toBe("");
+            expect(result.durationMs).toBeGreaterThanOrEqual(0);
+        });
+
+        it("captures non-zero exit code without throwing", async () => {
+            const result = await runner.run({
+                titleSlug: "two-sum",
+                language: "python3",
+                code: "raise SystemExit(7)"
+            });
+
+            expect(result.passed).toBe(false);
+            expect(result.exitCode).toBe(7);
+            expect(result.timedOut).toBe(false);
+        });
+
+        it("captures stderr from raised exceptions", async () => {
+            const result = await runner.run({
+                titleSlug: "two-sum",
+                language: "python3",
+                code: 'raise ValueError("boom")'
+            });
+
+            expect(result.passed).toBe(false);
+            expect(result.exitCode).not.toBe(0);
+            expect(result.stderr).toContain("ValueError");
+            expect(result.stderr).toContain("boom");
+        });
+
+        it("kills runaway processes after the timeout budget", async () => {
+            const start = Date.now();
+            const result = await runner.run({
+                titleSlug: "two-sum",
+                language: "python3",
+                code: "while True: pass",
+                timeoutMs: 400
+            });
+            const elapsed = Date.now() - start;
+
+            expect(result.timedOut).toBe(true);
+            expect(result.passed).toBe(false);
+            // Tolerate slow CI: budget + the 500 ms SIGTERM-then-SIGKILL
+            // grace + scheduler jitter. Should not run for full 5s.
+            expect(elapsed).toBeLessThan(2_500);
+        });
+
+        it("rejects unsupported languages with RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE", async () => {
+            await expect(async () => {
+                await runner.run({
+                    titleSlug: "two-sum",
+                    language: "go" as RunnerLanguage,
+                    code: 'package main\nfunc main() { println("hi") }'
+                });
+            }).rejects.toSatisfy((error: unknown) => {
+                if (!isLeetCodeError(error)) {
+                    return false;
+                }
+                return (
+                    error.code === ErrorCode.RUNNER_NOT_IMPLEMENTED_FOR_LANGUAGE
+                );
+            });
+        });
+
+        it("forwards a clean env (no leaking secrets)", async () => {
+            // Ask the child to print one of its env vars. We never set
+            // SECRET_ON_PARENT in the child env, so it should print the
+            // "MISSING" fallback even if defined on the parent.
+            const before = process.env.SECRET_ON_PARENT;
+            process.env.SECRET_ON_PARENT = "leak-me";
+            try {
+                const result = await runner.run({
+                    titleSlug: "two-sum",
+                    language: "python3",
+                    code: 'import os; print(os.environ.get("SECRET_ON_PARENT", "MISSING"))'
+                });
+
+                expect(result.passed).toBe(true);
+                expect(result.stdout.trim()).toBe("MISSING");
+            } finally {
+                if (before === undefined) {
+                    delete process.env.SECRET_ON_PARENT;
+                } else {
+                    process.env.SECRET_ON_PARENT = before;
+                }
+            }
+        });
+    });
+});