diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 068404a..1facbb0 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: - node-version: [22.x, 24.x] + node-version: [24.x, 26.x] steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/ui-tests.yml b/.github/workflows/ui-tests.yml index 6841104..615b2de 100644 --- a/.github/workflows/ui-tests.yml +++ b/.github/workflows/ui-tests.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: - node-version: [22.x, 24.x] + node-version: [24.x, 26.x] services: postgres: diff --git a/.yarn/changelogs/service.9a1a533d.md b/.yarn/changelogs/service.9a1a533d.md new file mode 100644 index 0000000..7f7bcf0 --- /dev/null +++ b/.yarn/changelogs/service.9a1a533d.md @@ -0,0 +1,67 @@ + +# service + + + +## ๐Ÿ—‘๏ธ Deprecated + + +## โœจ Features + +### Supervised service processes survive abrupt parent termination + +Managed services launched by `ProcessRunner.spawnCommand()` now run inside a small Node supervisor (the `process-supervisor` module) instead of a bare shell. The supervisor watches the parent stack-craft process and, when the parent disappears without a graceful stop (`kill -9`, IDE force-stop, abrupt WSL exit, or a crash), tears down the entire child process tree. + +- Parent death is detected via EOF on the supervisor's stdin pipe (the parent holds the write end and never writes to it). This fires the instant the parent dies and is immune to PID reuse, unlike polling `process.kill(parentPid, 0)`. +- On POSIX, the supervisor is the process-group leader and escalates SIGTERM โ†’ SIGKILL across the group, mirroring `ServiceLifecycleManager.shutdownAll()`. +- On Windows, it uses `taskkill /T` (then `/F`) to walk and kill the descendant tree, since process groups are unavailable. +- SIGTERM/SIGINT/SIGHUP from the parent are forwarded into the same kill cascade, so `killProcessGroup()` keeps working unchanged. +- The grace period before the SIGKILL escalation is configurable via the `WATCHDOG_GRACE_MS` environment variable (default: 5000 ms). + +The supervisor is a first-class TypeScript module (type-checked and linted), resolved next to `process-runner` with the same extension โ€” `.js` under `dist/` in production and the source `.ts` in dev/test (Node strips types natively). + +This prevents orphaned service processes from lingering when stack-craft itself goes away unexpectedly. + +## ๐Ÿ› Bug Fixes + + +## ๐Ÿ“š Documentation + + +## โšก Performance + + +## โ™ป๏ธ Refactoring + + +## ๐Ÿงช Tests + +- Updated `process-runner` spec coverage to assert that user commands are wrapped in the supervisor module (resolved path + stdin/pipe wiring) on both POSIX and Windows. +- Added `process-supervisor` unit specs covering the POSIX and Windows kill-tree branches in-process. +- Added a POSIX integration spec that spawns a real supervised process tree, drops the parent (closes stdin), and asserts the descendant is reaped. + +## ๐Ÿ“ฆ Build + + +## ๐Ÿ‘ท CI + + +## โฌ†๏ธ Dependencies + + +## ๐Ÿ”ง Chores + diff --git a/.yarn/changelogs/stack-craft.9a1a533d.md b/.yarn/changelogs/stack-craft.9a1a533d.md new file mode 100644 index 0000000..a8f8d1b --- /dev/null +++ b/.yarn/changelogs/stack-craft.9a1a533d.md @@ -0,0 +1,51 @@ + +# stack-craft + + + +## โœจ Features + + +## ๐Ÿ› Bug Fixes + +- Managed services are now reliably terminated when stack-craft exits abruptly (`kill -9`, IDE force-stop, crash, or WSL shutdown), preventing orphaned service processes from lingering after the app is gone. + +## ๐Ÿ“š Documentation + + +## โšก Performance + + +## โ™ป๏ธ Refactoring + + +## ๐Ÿงช Tests + + +## ๐Ÿ“ฆ Build + +- Raise the minimum Node version to `>=24.0.0` so the process-supervisor module can be spawned directly (native TypeScript type-stripping in dev/test). + +## ๐Ÿ‘ท CI + +- Pin the CI Node matrix and Azure pipeline to Node 24.x to match the new engines floor. + +## โฌ†๏ธ Dependencies + + +## ๐Ÿ”ง Chores + diff --git a/.yarn/versions/9a1a533d.yml b/.yarn/versions/9a1a533d.yml new file mode 100644 index 0000000..7d4c5dc --- /dev/null +++ b/.yarn/versions/9a1a533d.yml @@ -0,0 +1,3 @@ +releases: + service: minor + stack-craft: patch diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3952404..8182674 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -17,7 +17,7 @@ pool: steps: - task: NodeTool@0 inputs: - versionSpec: '20.x' + versionSpec: '24.x' displayName: 'Install Node.js' - script: yarn install displayName: 'Yarn install' diff --git a/package.json b/package.json index a2e86c6..c697127 100644 --- a/package.json +++ b/package.json @@ -62,7 +62,7 @@ "format:check": "prettier --check ." }, "engines": { - "node": ">=22.0.0" + "node": ">=24.0.0" }, "packageManager": "yarn@4.15.0" } diff --git a/service/src/services/process-runner.spec.ts b/service/src/services/process-runner.spec.ts index 52d1626..2067ac7 100644 --- a/service/src/services/process-runner.spec.ts +++ b/service/src/services/process-runner.spec.ts @@ -94,6 +94,19 @@ describe('ProcessRunner', () => { process.env = original })) + it('should forward WATCHDOG_GRACE_MS so supervisor overrides take effect', () => + withRunnerContext(async () => { + const original = process.env + process.env = { WATCHDOG_GRACE_MS: '250', SECRET_KEY: 'nope' } + + const env = ProcessRunnerImpl.getSafeEnv() + + expect(env.WATCHDOG_GRACE_MS).toBe('250') + expect(env).not.toHaveProperty('SECRET_KEY') + + process.env = original + })) + it('should handle case-insensitive matching for safe keys', () => withRunnerContext(async () => { const original = process.env @@ -304,24 +317,28 @@ describe('ProcessRunner', () => { }) describe('spawnCommand', () => { - it('should call spawn with correct shell arguments', () => + it('should wrap the user command in the node supervisor on POSIX', () => withRunnerContext(async ({ runner }) => { const originalPlatform = process.platform Object.defineProperty(process, 'platform', { value: 'linux', writable: true }) const { spawn } = await import('child_process') + vi.mocked(spawn).mockClear() const mockChild = { pid: 1, stdout: null, stderr: null } as unknown as ChildProcess vi.mocked(spawn).mockReturnValue(mockChild) const result = runner.spawnCommand('echo hello', '/tmp') expect(result).toBe(mockChild) - expect(spawn).toHaveBeenCalledWith('/bin/sh', ['-c', 'echo hello'], { - cwd: '/tmp', - stdio: ['ignore', 'pipe', 'pipe'], - env: expect.objectContaining({}), - detached: true, - }) + expect(spawn).toHaveBeenCalledWith( + process.execPath, + [expect.stringMatching(/process-supervisor\.(ts|js)$/), '/bin/sh', '-c', 'echo hello'], + expect.objectContaining({ + cwd: '/tmp', + stdio: ['pipe', 'pipe', 'pipe'], + detached: true, + }), + ) Object.defineProperty(process, 'platform', { value: originalPlatform, writable: true }) })) @@ -332,14 +349,15 @@ describe('ProcessRunner', () => { Object.defineProperty(process, 'platform', { value: 'linux', writable: true }) const { spawn } = await import('child_process') + vi.mocked(spawn).mockClear() const mockChild = { pid: 1, stdout: null, stderr: null } as unknown as ChildProcess vi.mocked(spawn).mockReturnValue(mockChild) runner.spawnCommand('echo hello', '/tmp', { MY_VAR: 'test' }) expect(spawn).toHaveBeenCalledWith( - '/bin/sh', - ['-c', 'echo hello'], + process.execPath, + [expect.stringMatching(/process-supervisor\.(ts|js)$/), '/bin/sh', '-c', 'echo hello'], expect.objectContaining({ env: expect.objectContaining({ MY_VAR: 'test' }), }), @@ -348,20 +366,21 @@ describe('ProcessRunner', () => { Object.defineProperty(process, 'platform', { value: originalPlatform, writable: true }) })) - it('should use cmd.exe on Windows', () => + it('should hand the supervisor cmd.exe on Windows', () => withRunnerContext(async ({ runner }) => { const originalPlatform = process.platform Object.defineProperty(process, 'platform', { value: 'win32', writable: true }) const { spawn } = await import('child_process') + vi.mocked(spawn).mockClear() const mockChild = { pid: 1, stdout: null, stderr: null } as unknown as ChildProcess vi.mocked(spawn).mockReturnValue(mockChild) runner.spawnCommand('echo hello', 'C:\\temp') expect(spawn).toHaveBeenCalledWith( - 'cmd.exe', - ['/c', 'echo hello'], + process.execPath, + [expect.stringMatching(/process-supervisor\.(ts|js)$/), 'cmd.exe', '/c', 'echo hello'], expect.objectContaining({ cwd: 'C:\\temp' }), ) diff --git a/service/src/services/process-runner.ts b/service/src/services/process-runner.ts index d6b6fde..9f4e166 100644 --- a/service/src/services/process-runner.ts +++ b/service/src/services/process-runner.ts @@ -1,7 +1,10 @@ -import { defineService, type Token } from '@furystack/inject' import { type ChildProcess, spawn, spawnSync } from 'child_process' +import { dirname, extname, join } from 'node:path' +import { fileURLToPath } from 'node:url' +import { defineService, type Token } from '@furystack/inject' import { LogStorageService } from './log-storage-service.js' +import type { killProcessTree } from './process-supervisor.js' import type { ServiceLifecycleManager } from './service-lifecycle-manager.js' export type ManagedProcess = { @@ -46,6 +49,8 @@ const SAFE_ENV_KEYS = new Set([ 'TMPDIR', 'TMP', 'TEMP', + // Supervisor tuning โ€” forwarded so WATCHDOG_GRACE_MS overrides reach node -e. + 'WATCHDOG_GRACE_MS', // Windows-specific 'USERPROFILE', 'APPDATA', @@ -98,17 +103,39 @@ export class ProcessRunnerImpl { return env } + /** + * Spawns the user command inside the {@link killProcessTree process-supervisor} + * module, which watches the parent stack-craft process via its stdin pipe and + * tears the child tree down if stack-craft dies before it can issue a graceful + * stop โ€” handles `kill -9`, IDE force-stop, abrupt WSL exits, and any other + * path that bypasses {@link ServiceLifecycleManager.shutdownAll}. + * + * The supervisor sibling is resolved with the same extension as this module so + * it works both under `dist/` (`.js`) in production and under the source `.ts` + * in dev/test (Node strips types natively, hence the `engines.node >= 24`). + */ public spawnCommand(command: string, cwd: string, extraEnv?: Record): ChildProcess { const isWindows = process.platform === 'win32' const shell = isWindows ? 'cmd.exe' : '/bin/sh' const shellFlag = isWindows ? '/c' : '-c' - return spawn(shell, [shellFlag, command], { + const here = fileURLToPath(import.meta.url) + const supervisorPath = join(dirname(here), `process-supervisor${extname(here)}`) + + const child = spawn(process.execPath, [supervisorPath, shell, shellFlag, command], { cwd, - stdio: ['ignore', 'pipe', 'pipe'], + // stdin is a pipe the supervisor watches for EOF (parent-death signal); + // stdout/stderr carry the child's output back for log capture. + stdio: ['pipe', 'pipe', 'pipe'], env: { ...ProcessRunnerImpl.getSafeEnv(), ...extraEnv }, detached: true, }) + + // The supervisor watches this stdin pipe for EOF as its parent-death signal, + // so leave the write end open. Swallow EPIPE so it can't crash us at teardown. + child.stdin?.on('error', () => {}) + + return child } /** diff --git a/service/src/services/process-runner.watchdog.spec.ts b/service/src/services/process-runner.watchdog.spec.ts new file mode 100644 index 0000000..968d3dd --- /dev/null +++ b/service/src/services/process-runner.watchdog.spec.ts @@ -0,0 +1,74 @@ +import { usingAsync } from '@furystack/utils' +import { describe, expect, it } from 'vitest' + +import type { LogStorageService } from './log-storage-service.js' +import { ProcessRunnerImpl } from './process-runner.js' +import '../test-shims.js' + +const isAlive = (pid: number): boolean => { + try { + process.kill(pid, 0) + return true + } catch { + return false + } +} + +const waitFor = async (predicate: () => boolean, timeoutMs: number): Promise => { + const deadline = Date.now() + timeoutMs + while (Date.now() < deadline) { + if (predicate()) return true + await new Promise((resolve) => setTimeout(resolve, 25)) + } + return predicate() +} + +const noopLogStorage = { addEntry: async () => undefined } as unknown as LogStorageService + +// The supervisor is spawned as a `.ts` file under vitest, so the real-spawn path +// needs a Node that strips types natively (>= 24, the project's engines floor). +const hasNativeTypeScript = Boolean((process.features as { typescript?: unknown }).typescript) + +// POSIX-only: the watchdog reaps via process-group signals. The Windows path +// uses taskkill and cannot be exercised on the Linux CI runner. +describe.skipIf(process.platform === 'win32' || !hasNativeTypeScript)('ProcessRunner watchdog (integration)', () => { + it('reaps the child tree when the parent (stdin) goes away', async () => { + const original = process.env.WATCHDOG_GRACE_MS + process.env.WATCHDOG_GRACE_MS = '200' + + try { + await usingAsync(new ProcessRunnerImpl(noopLogStorage), async (runner) => { + // Background a long sleep and print its pid; the supervisor's group includes it. + const child = runner.spawnCommand('sleep 30 & echo "GRANDCHILD:$!"; wait', process.cwd()) + + let output = '' + child.stdout?.on('data', (data: Buffer) => { + output += data.toString() + }) + + const sawPid = await waitFor(() => /GRANDCHILD:\d+/.test(output), 3000) + expect(sawPid).toBe(true) + + const grandchildPid = Number.parseInt(/GRANDCHILD:(\d+)/.exec(output)?.[1] ?? '', 10) + expect(Number.isFinite(grandchildPid)).toBe(true) + expect(isAlive(grandchildPid)).toBe(true) + + // Simulate abrupt parent death: closing the stdin write end is exactly what + // the OS does to the supervisor's stdin when the real parent process dies. + child.stdin?.end() + + const grandchildReaped = await waitFor(() => !isAlive(grandchildPid), 3000) + const supervisorReaped = await waitFor(() => child.exitCode !== null || child.killed, 3000) + + expect(grandchildReaped).toBe(true) + expect(supervisorReaped).toBe(true) + }) + } finally { + if (original === undefined) { + delete process.env.WATCHDOG_GRACE_MS + } else { + process.env.WATCHDOG_GRACE_MS = original + } + } + }) +}) diff --git a/service/src/services/process-supervisor.spec.ts b/service/src/services/process-supervisor.spec.ts new file mode 100644 index 0000000..b9f812f --- /dev/null +++ b/service/src/services/process-supervisor.spec.ts @@ -0,0 +1,88 @@ +import { type ChildProcess, spawnSync } from 'node:child_process' +import { afterEach, describe, expect, it, vi } from 'vitest' + +import { killProcessTree } from './process-supervisor.js' + +vi.mock('node:child_process', () => ({ + spawn: vi.fn(), + spawnSync: vi.fn(), +})) + +const withPlatform = (platform: NodeJS.Platform, fn: () => void) => { + const original = process.platform + Object.defineProperty(process, 'platform', { value: platform, writable: true }) + try { + fn() + } finally { + Object.defineProperty(process, 'platform', { value: original, writable: true }) + } +} + +describe('process-supervisor killProcessTree', () => { + afterEach(() => { + vi.clearAllMocks() + }) + + it('signals the supervisor process group with SIGTERM on POSIX (graceful)', () => + withPlatform('linux', () => { + const killSpy = vi.spyOn(process, 'kill').mockImplementation(() => true) + const child = { pid: 4321, kill: vi.fn() } as unknown as ChildProcess + + killProcessTree(child, false) + + expect(killSpy).toHaveBeenCalledWith(-process.pid, 'SIGTERM') + killSpy.mockRestore() + })) + + it('escalates to SIGKILL on POSIX when forced', () => + withPlatform('linux', () => { + const killSpy = vi.spyOn(process, 'kill').mockImplementation(() => true) + const child = { pid: 4321, kill: vi.fn() } as unknown as ChildProcess + + killProcessTree(child, true) + + expect(killSpy).toHaveBeenCalledWith(-process.pid, 'SIGKILL') + killSpy.mockRestore() + })) + + it('falls back to child.kill when the group signal throws on POSIX', () => + withPlatform('linux', () => { + const killSpy = vi.spyOn(process, 'kill').mockImplementation(() => { + throw new Error('ESRCH') + }) + const childKill = vi.fn() + const child = { pid: 4321, kill: childKill } as unknown as ChildProcess + + killProcessTree(child, false) + + expect(childKill).toHaveBeenCalledWith('SIGTERM') + killSpy.mockRestore() + })) + + it('uses taskkill /T without /F for graceful termination on Windows', () => + withPlatform('win32', () => { + const child = { pid: 777, kill: vi.fn() } as unknown as ChildProcess + + killProcessTree(child, false) + + expect(spawnSync).toHaveBeenCalledWith('taskkill', ['/pid', '777', '/T'], { stdio: 'ignore' }) + })) + + it('uses taskkill /T /F when forced on Windows', () => + withPlatform('win32', () => { + const child = { pid: 777, kill: vi.fn() } as unknown as ChildProcess + + killProcessTree(child, true) + + expect(spawnSync).toHaveBeenCalledWith('taskkill', ['/pid', '777', '/T', '/F'], { stdio: 'ignore' }) + })) + + it('does nothing on Windows when the child has no pid', () => + withPlatform('win32', () => { + const child = { pid: undefined, kill: vi.fn() } as unknown as ChildProcess + + killProcessTree(child, false) + + expect(spawnSync).not.toHaveBeenCalled() + })) +}) diff --git a/service/src/services/process-supervisor.ts b/service/src/services/process-supervisor.ts new file mode 100644 index 0000000..24098c5 --- /dev/null +++ b/service/src/services/process-supervisor.ts @@ -0,0 +1,97 @@ +import { type ChildProcess, spawn, spawnSync } from 'node:child_process' +import { pathToFileURL } from 'node:url' + +const DEFAULT_WATCHDOG_GRACE_MS = 5000 + +/** + * Sends a termination signal to the whole descendant tree of `child`. + * + * On POSIX this signals the supervisor's own process group (`-process.pid`), + * which the shell and its descendants share because the supervisor was started + * detached as the group leader. The supervisor is therefore signalled too: + * SIGTERM is caught by {@link runSupervisor} and ignored once cleanup is in + * flight, so the supervisor survives long enough to mirror the child's exit; + * the uncatchable grace SIGKILL takes the whole group down if the child outlives + * SIGTERM. On Windows there are no process groups, so `taskkill /T` walks the + * descendant tree by pid and `/F` forces termination. + * + * @param child The shell process spawned by the supervisor. + * @param force `true` escalates to SIGKILL / `taskkill /F`; `false` is graceful. + */ +export const killProcessTree = (child: ChildProcess, force: boolean): void => { + if (process.platform === 'win32') { + if (child.pid == null) return + const args = force ? ['/pid', String(child.pid), '/T', '/F'] : ['/pid', String(child.pid), '/T'] + spawnSync('taskkill', args, { stdio: 'ignore' }) + return + } + try { + process.kill(-process.pid, force ? 'SIGKILL' : 'SIGTERM') + } catch { + if (child.pid != null) { + try { + child.kill(force ? 'SIGKILL' : 'SIGTERM') + } catch { + // Child already gone โ€” nothing left to signal. + } + } + } +} + +/** + * Entry point executed when this module is spawned as a child process by + * {@link ProcessRunnerImpl.spawnCommand}. Wraps the user command so abrupt + * termination of the parent (stack-craft crashing, IDE force-stop, `kill -9`, + * abrupt WSL exit) still tears down the service tree. + * + * Parent death is detected via EOF on stdin: the parent owns the write end and + * never writes to it, so the OS closes it the instant the parent dies. This is + * immune to PID reuse, unlike polling `process.kill(parentPid, 0)`. On detection + * (or a forwarded SIGTERM/SIGINT/SIGHUP) it escalates SIGTERM โ†’ SIGKILL across + * the tree after `WATCHDOG_GRACE_MS` (default {@link DEFAULT_WATCHDOG_GRACE_MS}). + * + * @see {@link ProcessRunnerImpl.spawnCommand} for the parent-side wiring. + */ +const runSupervisor = (): void => { + // node