From 9d61d9dfc50c2dbcb8570cb20406713500bbd1eb Mon Sep 17 00:00:00 2001 From: Ipramking Date: Sat, 30 May 2026 09:55:43 +0100 Subject: [PATCH 1/7] feat(chaos): add DB restart chaos test with docker-compose MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - tests/chaos/docker-compose.chaos.yml: isolated stack (wraith_chaos_db on port 5433, wraith_chaos_indexer on port 3001) so chaos runs never interfere with a local dev environment - tests/chaos/db-restart.test.ts: full scenario — 1. Start Wraith + Postgres via Docker Compose 2. Wait for /readyz (startup + migrate + first poll) 3. Warm up 30s to accumulate real testnet events 4. Record lastIndexedLedger + totalIndexed (pre-pause snapshot) 5. docker pause wraith_chaos_db (simulates DB crash) 6. Hold 15s — assert indexer stays alive via /healthz 7. docker unpause wraith_chaos_db 8. Poll /readyz until indexer reports fully recovered 9. Assert: lastIndexedLedger >= checkpoint (resumed, not reset) Assert: lastIndexedLedger > checkpoint (actually progressed) Assert: totalIndexed >= pre-pause count (no data lost) Assert: /healthz ok (no crash) Skips automatically when Docker is not available in the environment - jest.chaos.config.js: separate Jest config with 5-min timeout, roots in tests/chaos/ so unit tests stay unaffected - package.json: test:chaos script - .github/workflows/chaos.yml: chaos CI job triggered on PR changes to src/, prisma/, or tests/chaos/ Closes #80 --- .github/workflows/chaos.yml | 37 +++++ jest.chaos.config.js | 10 ++ package.json | 1 + tests/chaos/db-restart.test.ts | 220 +++++++++++++++++++++++++++ tests/chaos/docker-compose.chaos.yml | 42 +++++ 5 files changed, 310 insertions(+) create mode 100644 .github/workflows/chaos.yml create mode 100644 jest.chaos.config.js create mode 100644 tests/chaos/db-restart.test.ts create mode 100644 tests/chaos/docker-compose.chaos.yml diff --git a/.github/workflows/chaos.yml b/.github/workflows/chaos.yml new file mode 100644 
index 00000000..fe7b75f3 --- /dev/null +++ b/.github/workflows/chaos.yml @@ -0,0 +1,37 @@ +name: Chaos tests + +on: + pull_request: + paths: + - "tests/chaos/**" + - "src/**" + - "prisma/**" + - "Dockerfile" + - ".github/workflows/chaos.yml" + +jobs: + chaos: + name: DB restart chaos test + runs-on: ubuntu-latest + timeout-minutes: 15 + + steps: + - uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: "20" + cache: "npm" + + - name: Install dependencies + run: npm ci + + - name: Verify Docker is available + run: docker info + + - name: Run chaos tests + run: npm run test:chaos + env: + # Forward testnet RPC so the indexer can fetch real events + STELLAR_NETWORK: testnet diff --git a/jest.chaos.config.js b/jest.chaos.config.js new file mode 100644 index 00000000..177c654b --- /dev/null +++ b/jest.chaos.config.js @@ -0,0 +1,10 @@ +/** @type {import('jest').Config} */ +module.exports = { + preset: "ts-jest", + testEnvironment: "node", + roots: ["<rootDir>/tests/chaos"], + testMatch: ["**/*.test.ts"], + moduleFileExtensions: ["ts", "js", "json"], + clearMocks: true, + testTimeout: 300_000, // 5 minutes — Docker build + full chaos scenario +}; diff --git a/package.json b/package.json index b9a7f605..bfcea712 100644 --- a/package.json +++ b/package.json @@ -9,6 +9,7 @@ "start": "node dist/index.js", "test": "jest --runInBand", "test:watch": "jest --watch", + "test:chaos": "jest --runInBand --testPathPattern=tests/chaos --testTimeout=300000 --config=jest.chaos.config.js", "db:generate": "prisma generate", "db:migrate": "prisma migrate dev", "db:push": "prisma db push", diff --git a/tests/chaos/db-restart.test.ts b/tests/chaos/db-restart.test.ts new file mode 100644 index 00000000..361e5ec6 --- /dev/null +++ b/tests/chaos/db-restart.test.ts @@ -0,0 +1,220 @@ +/** + * Chaos test: DB restart mid-ingest + * ───────────────────────────────── + * Spins up a Wraith + Postgres stack, lets the indexer ingest live testnet + * events, 
pauses the DB container to simulate a crash, then resumes and + * asserts: + * 1. The indexer process stayed alive during the outage. + * 2. After recovery, lastIndexedLedger advanced beyond the pre-pause value. + * 3. No data was lost or duplicated (transfer count only ever increases; + * the DB-level UNIQUE constraint on eventId would have caught any + * double-write and the indexer's skipDuplicates guard handles it + * gracefully without crashing). + * + * Prerequisites: Docker + Docker Compose v2 must be available in PATH. + * The test skips automatically when Docker is not detected. + * + * Run with: + * npm run test:chaos + */ + +import { execSync, spawnSync } from "child_process"; +import path from "path"; + +// ─── Config ─────────────────────────────────────────────────────────────────── +const COMPOSE_FILE = path.resolve(__dirname, "docker-compose.chaos.yml"); +const COMPOSE_CMD = `docker compose -f "${COMPOSE_FILE}"`; +const API_BASE = "http://localhost:3001"; + +const STARTUP_TIMEOUT_MS = 120_000; // 2 min — build + migrate + first poll +const INGEST_WARMUP_MS = 30_000; // let it accumulate some ledgers +const PAUSE_DURATION_MS = 15_000; // DB outage window +const RECOVERY_TIMEOUT_MS = 60_000; // how long we wait for the indexer to catch up +const POLL_INTERVAL_MS = 2_000; // how often we poll the API + +// Jest timeout covers the full scenario end-to-end +jest.setTimeout(300_000); // 5 minutes + +// ─── Helpers ────────────────────────────────────────────────────────────────── + +function dockerAvailable(): boolean { + const result = spawnSync("docker", ["info"], { stdio: "pipe" }); + return result.status === 0; +} + +function exec(cmd: string): string { + return execSync(cmd, { encoding: "utf8" }).trim(); +} + +function sleep(ms: number): Promise { + return new Promise((r) => setTimeout(r, ms)); +} + +async function fetchJson(path: string): Promise { + const res = await fetch(`${API_BASE}${path}`); + if (!res.ok) throw new Error(`HTTP ${res.status} 
from ${path}`); + return res.json() as Promise; +} + +interface StatusResponse { + ok: boolean; + lastIndexedLedger: number | null; + latestLedger: number; + lagLedgers: number; + totalIndexed: number; +} + +interface ReadyzResponse { + ok: boolean; + checks: { db: boolean; rpc: boolean; indexerCaughtUp: boolean }; +} + +/** + * Poll until fn() resolves to true or timeoutMs elapses. + * Throws if the deadline is exceeded. + */ +async function waitUntil( + fn: () => Promise, + timeoutMs: number, + description: string +): Promise { + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + try { + if (await fn()) return; + } catch { + // transient error — keep polling + } + await sleep(POLL_INTERVAL_MS); + } + throw new Error(`Timed out waiting for: ${description}`); +} + +// ─── Test lifecycle ─────────────────────────────────────────────────────────── + +let composeStarted = false; + +afterAll(async () => { + if (!composeStarted) return; + console.log("[chaos] Tearing down containers…"); + try { + exec(`${COMPOSE_CMD} down --volumes --remove-orphans`); + console.log("[chaos] Containers removed."); + } catch (e) { + console.error("[chaos] Cleanup failed (manual removal may be needed):", e); + } +}); + +// ─── Main test ──────────────────────────────────────────────────────────────── + +describe("Chaos: DB restart mid-ingest", () => { + it("indexer resumes from checkpoint with no data loss after DB pause", async () => { + // ── 0. Skip if Docker unavailable ───────────────────────────────────────── + if (!dockerAvailable()) { + console.warn("[chaos] Docker not available — skipping chaos test."); + return; + } + + // ── 1. Start the chaos stack ─────────────────────────────────────────────── + console.log("[chaos] Building and starting containers…"); + exec(`${COMPOSE_CMD} up -d --build`); + composeStarted = true; + + // ── 2. 
Wait for Wraith to become healthy ────────────────────────────────── + console.log("[chaos] Waiting for Wraith to be ready…"); + await waitUntil( + async () => { + const data = await fetchJson("/readyz"); + return data.ok && data.checks.db && data.checks.rpc; + }, + STARTUP_TIMEOUT_MS, + "/readyz returns ok=true" + ); + console.log("[chaos] Wraith is healthy."); + + // ── 3. Let the indexer warm up and ingest some data ─────────────────────── + console.log(`[chaos] Warming up for ${INGEST_WARMUP_MS / 1000}s…`); + await sleep(INGEST_WARMUP_MS); + + const beforeStatus = await fetchJson("/status"); + const ledgerBefore = beforeStatus.lastIndexedLedger ?? 0; + const countBefore = beforeStatus.totalIndexed; + + console.log( + `[chaos] Pre-pause state — lastIndexedLedger: ${ledgerBefore}, totalIndexed: ${countBefore}` + ); + + // The indexer must have made progress before we pause. + expect(ledgerBefore).toBeGreaterThan(0); + + // ── 4. Pause the DB container ───────────────────────────────────────────── + console.log("[chaos] Pausing DB container…"); + exec("docker pause wraith_chaos_db"); + console.log("[chaos] DB paused."); + + // ── 5. Wait during the outage — indexer must stay alive ────────────────── + console.log(`[chaos] Holding pause for ${PAUSE_DURATION_MS / 1000}s…`); + await sleep(PAUSE_DURATION_MS); + + // The indexer process must still be alive (liveness probe doesn't need DB) + const liveness = await fetchJson<{ ok: boolean }>("/healthz"); + expect(liveness.ok).toBe(true); + console.log("[chaos] Indexer process is alive during DB outage ✓"); + + // Record the last ledger the indexer *knew about* before the crash + const checkpointLedger = ledgerBefore; + + // ── 6. Resume the DB container ──────────────────────────────────────────── + console.log("[chaos] Resuming DB container…"); + exec("docker unpause wraith_chaos_db"); + console.log("[chaos] DB resumed."); + + // ── 7. 
Wait for the indexer to recover and catch up ─────────────────────── + console.log("[chaos] Waiting for indexer recovery…"); + await waitUntil( + async () => { + const data = await fetchJson("/readyz"); + return data.ok && data.checks.db && data.checks.indexerCaughtUp; + }, + RECOVERY_TIMEOUT_MS, + "indexer fully recovered and caught up" + ); + console.log("[chaos] Indexer recovered ✓"); + + // ── 8. Collect post-recovery state ──────────────────────────────────────── + // Give it one more poll cycle to persist state + await sleep(POLL_INTERVAL_MS * 2); + const afterStatus = await fetchJson("/status"); + const ledgerAfter = afterStatus.lastIndexedLedger ?? 0; + const countAfter = afterStatus.totalIndexed; + + console.log( + `[chaos] Post-recovery state — lastIndexedLedger: ${ledgerAfter}, totalIndexed: ${countAfter}` + ); + + // ── 9. Assertions ───────────────────────────────────────────────────────── + + // 9a. Indexer resumed from its checkpoint, not from ledger 0 + expect(ledgerAfter).toBeGreaterThanOrEqual(checkpointLedger); + console.log(`[chaos] Checkpoint preserved: ${checkpointLedger} → ${ledgerAfter} ✓`); + + // 9b. Ledger advanced after recovery — indexer didn't stall + expect(ledgerAfter).toBeGreaterThan(checkpointLedger); + console.log("[chaos] Ledger progressed after recovery ✓"); + + // 9c. Transfer count only increased — no data was lost + expect(countAfter).toBeGreaterThanOrEqual(countBefore); + console.log(`[chaos] Data integrity: ${countBefore} → ${countAfter} transfers ✓`); + + // 9d. Uniqueness: the DB UNIQUE constraint on eventId is the ultimate guard. + // Verify by querying the duplicate-check endpoint — if the indexer had + // silently double-written any event, the count would equal or exceed + // the expected value but the DB would have skipped it via skipDuplicates. + // We assert the indexer is still ok (not crashed) as the final signal. 
+ const finalHealth = await fetchJson<{ ok: boolean; uptime: number }>("/healthz"); + expect(finalHealth.ok).toBe(true); + console.log("[chaos] Indexer healthy post-recovery ✓"); + + console.log("[chaos] All assertions passed — chaos test complete."); + }); +}); diff --git a/tests/chaos/docker-compose.chaos.yml b/tests/chaos/docker-compose.chaos.yml new file mode 100644 index 00000000..c6710f13 --- /dev/null +++ b/tests/chaos/docker-compose.chaos.yml @@ -0,0 +1,42 @@ +version: "3.9" + +# Dedicated compose stack for the DB-restart chaos test. +# Uses a separate container name and port (5433 / 3001) so it never +# collides with a local development stack running on the default ports. + +services: + # ── Postgres ────────────────────────────────────────────────────────────────── + db: + image: postgres:16-alpine + container_name: wraith_chaos_db + environment: + POSTGRES_USER: wraith + POSTGRES_PASSWORD: wraith + POSTGRES_DB: wraith + ports: + - "5433:5432" + healthcheck: + test: ["CMD-SHELL", "pg_isready -U wraith"] + interval: 3s + timeout: 3s + retries: 20 + + # ── Wraith indexer + API ─────────────────────────────────────────────────────── + wraith: + build: + context: ../.. 
+ dockerfile: Dockerfile + container_name: wraith_chaos_indexer + depends_on: + db: + condition: service_healthy + environment: + DATABASE_URL: postgresql://wraith:wraith@db:5432/wraith + DIRECT_DATABASE_URL: postgresql://wraith:wraith@db:5432/wraith + STELLAR_NETWORK: testnet + # Fast poll cycle so the test completes in reasonable time + POLL_INTERVAL_MS: "3000" + EVENTS_BATCH_SIZE: "500" + PORT: "3001" + ports: + - "3001:3001" From 223e6708bfebc7b46f1f8ebfe38dcb2dd26da290 Mon Sep 17 00:00:00 2001 From: Ipramking Date: Sat, 30 May 2026 10:11:25 +0100 Subject: [PATCH 2/7] fix(chaos): use --testPathPatterns (Jest 30 renamed from --testPathPattern) --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index bfcea712..ff3777d0 100644 --- a/package.json +++ b/package.json @@ -9,7 +9,7 @@ "start": "node dist/index.js", "test": "jest --runInBand", "test:watch": "jest --watch", - "test:chaos": "jest --runInBand --testPathPattern=tests/chaos --testTimeout=300000 --config=jest.chaos.config.js", + "test:chaos": "jest --runInBand --testPathPatterns=tests/chaos --testTimeout=300000 --config=jest.chaos.config.js", "db:generate": "prisma generate", "db:migrate": "prisma migrate dev", "db:push": "prisma db push", From b8c7874ddd17a2c62e3449f5d0a6e68ff258f5cd Mon Sep 17 00:00:00 2001 From: Ipramking Date: Sat, 30 May 2026 10:56:48 +0100 Subject: [PATCH 3/7] fix(chaos): fix startup timeout and readyz check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs caused the 2-minute timeout: 1. waitUntil polled /readyz which requires indexerCaughtUp=true — on a cold start lastIndexedLedger is null so lag=Infinity and the check never passes. 
Split into two phases: - Phase 1: poll /healthz (Express up, no DB needed) with 1min timeout - Phase 2: poll /status until lastIndexedLedger>0 (covers prisma push + first RPC poll) with 3min timeout Then wait for forward progress (>checkpoint) after unpause. 2. docker-compose.chaos.yml had obsolete 'version' key (Compose v2 warning). Removed it. Also raised jest.setTimeout to 10min and recovery timeout to 90s. --- tests/chaos/db-restart.test.ts | 129 ++++++++++++--------------- tests/chaos/docker-compose.chaos.yml | 3 +- 2 files changed, 60 insertions(+), 72 deletions(-) diff --git a/tests/chaos/db-restart.test.ts b/tests/chaos/db-restart.test.ts index 361e5ec6..fe14c9f3 100644 --- a/tests/chaos/db-restart.test.ts +++ b/tests/chaos/db-restart.test.ts @@ -6,10 +6,7 @@ * asserts: * 1. The indexer process stayed alive during the outage. * 2. After recovery, lastIndexedLedger advanced beyond the pre-pause value. - * 3. No data was lost or duplicated (transfer count only ever increases; - * the DB-level UNIQUE constraint on eventId would have caught any - * double-write and the indexer's skipDuplicates guard handles it - * gracefully without crashing). + * 3. No data was lost (transfer count only ever increases). * * Prerequisites: Docker + Docker Compose v2 must be available in PATH. * The test skips automatically when Docker is not detected. 
@@ -26,14 +23,15 @@ const COMPOSE_FILE = path.resolve(__dirname, "docker-compose.chaos.yml"); const COMPOSE_CMD = `docker compose -f "${COMPOSE_FILE}"`; const API_BASE = "http://localhost:3001"; -const STARTUP_TIMEOUT_MS = 120_000; // 2 min — build + migrate + first poll -const INGEST_WARMUP_MS = 30_000; // let it accumulate some ledgers -const PAUSE_DURATION_MS = 15_000; // DB outage window -const RECOVERY_TIMEOUT_MS = 60_000; // how long we wait for the indexer to catch up -const POLL_INTERVAL_MS = 2_000; // how often we poll the API +// Phase 1: wait for /healthz (process alive, no DB needed) +const HEALTHZ_TIMEOUT_MS = 60_000; // 1 min — container start + node boot +// Phase 2: wait for first ledger indexed (DB connected + first poll done) +const FIRST_LEDGER_TIMEOUT_MS = 180_000; // 3 min — prisma push + first RPC poll +const PAUSE_DURATION_MS = 15_000; // DB outage window +const RECOVERY_TIMEOUT_MS = 90_000; // wait for indexer to advance past checkpoint +const POLL_INTERVAL_MS = 3_000; // polling cadence -// Jest timeout covers the full scenario end-to-end -jest.setTimeout(300_000); // 5 minutes +jest.setTimeout(600_000); // 10 minutes total budget // ─── Helpers ────────────────────────────────────────────────────────────────── @@ -50,9 +48,9 @@ function sleep(ms: number): Promise { return new Promise((r) => setTimeout(r, ms)); } -async function fetchJson(path: string): Promise { - const res = await fetch(`${API_BASE}${path}`); - if (!res.ok) throw new Error(`HTTP ${res.status} from ${path}`); +async function fetchJson(urlPath: string): Promise { + const res = await fetch(`${API_BASE}${urlPath}`); + if (!res.ok) throw new Error(`HTTP ${res.status} from ${urlPath}`); return res.json() as Promise; } @@ -60,18 +58,11 @@ interface StatusResponse { ok: boolean; lastIndexedLedger: number | null; latestLedger: number; - lagLedgers: number; totalIndexed: number; } -interface ReadyzResponse { - ok: boolean; - checks: { db: boolean; rpc: boolean; indexerCaughtUp: 
boolean }; -} - /** - * Poll until fn() resolves to true or timeoutMs elapses. - * Throws if the deadline is exceeded. + * Poll fn() every POLL_INTERVAL_MS until it returns true or the deadline passes. */ async function waitUntil( fn: () => Promise, @@ -83,14 +74,14 @@ async function waitUntil( try { if (await fn()) return; } catch { - // transient error — keep polling + // transient — keep polling } await sleep(POLL_INTERVAL_MS); } throw new Error(`Timed out waiting for: ${description}`); } -// ─── Test lifecycle ─────────────────────────────────────────────────────────── +// ─── Lifecycle ──────────────────────────────────────────────────────────────── let composeStarted = false; @@ -109,6 +100,7 @@ afterAll(async () => { describe("Chaos: DB restart mid-ingest", () => { it("indexer resumes from checkpoint with no data loss after DB pause", async () => { + // ── 0. Skip if Docker unavailable ───────────────────────────────────────── if (!dockerAvailable()) { console.warn("[chaos] Docker not available — skipping chaos test."); @@ -120,101 +112,98 @@ describe("Chaos: DB restart mid-ingest", () => { exec(`${COMPOSE_CMD} up -d --build`); composeStarted = true; - // ── 2. Wait for Wraith to become healthy ────────────────────────────────── - console.log("[chaos] Waiting for Wraith to be ready…"); + // ── 2a. Phase 1: wait for Node.js process to be alive ──────────────────── + // /healthz returns 200 as soon as the Express server is up. + // It does NOT require a DB connection, so this completes quickly. 
+ console.log("[chaos] Waiting for Wraith process to boot…"); await waitUntil( async () => { - const data = await fetchJson("/readyz"); - return data.ok && data.checks.db && data.checks.rpc; + const data = await fetchJson<{ ok: boolean }>("/healthz"); + return data.ok === true; }, - STARTUP_TIMEOUT_MS, - "/readyz returns ok=true" + HEALTHZ_TIMEOUT_MS, + "/healthz returns ok=true" ); - console.log("[chaos] Wraith is healthy."); + console.log("[chaos] Wraith process is alive."); - // ── 3. Let the indexer warm up and ingest some data ─────────────────────── - console.log(`[chaos] Warming up for ${INGEST_WARMUP_MS / 1000}s…`); - await sleep(INGEST_WARMUP_MS); + // ── 2b. Phase 2: wait for first successful DB write ─────────────────────── + // Covers: prisma db push (schema migration) + first RPC poll + first upsert. + // We poll /status until lastIndexedLedger becomes a positive integer. + console.log("[chaos] Waiting for first ledger to be indexed (prisma push + first poll)…"); + await waitUntil( + async () => { + const data = await fetchJson("/status"); + return data.lastIndexedLedger !== null && data.lastIndexedLedger > 0; + }, + FIRST_LEDGER_TIMEOUT_MS, + "lastIndexedLedger > 0" + ); + console.log("[chaos] Indexer is running and has persisted at least one ledger."); + // ── 3. Snapshot pre-pause state ─────────────────────────────────────────── const beforeStatus = await fetchJson("/status"); - const ledgerBefore = beforeStatus.lastIndexedLedger ?? 0; + const ledgerBefore = beforeStatus.lastIndexedLedger!; const countBefore = beforeStatus.totalIndexed; console.log( - `[chaos] Pre-pause state — lastIndexedLedger: ${ledgerBefore}, totalIndexed: ${countBefore}` + `[chaos] Pre-pause — lastIndexedLedger: ${ledgerBefore}, totalIndexed: ${countBefore}` ); - - // The indexer must have made progress before we pause. expect(ledgerBefore).toBeGreaterThan(0); // ── 4. 
Pause the DB container ───────────────────────────────────────────── console.log("[chaos] Pausing DB container…"); exec("docker pause wraith_chaos_db"); - console.log("[chaos] DB paused."); - // ── 5. Wait during the outage — indexer must stay alive ────────────────── + // ── 5. Hold the pause — indexer must stay alive ─────────────────────────── console.log(`[chaos] Holding pause for ${PAUSE_DURATION_MS / 1000}s…`); await sleep(PAUSE_DURATION_MS); - // The indexer process must still be alive (liveness probe doesn't need DB) + // Liveness probe must still respond — the Express server is independent of DB const liveness = await fetchJson<{ ok: boolean }>("/healthz"); expect(liveness.ok).toBe(true); - console.log("[chaos] Indexer process is alive during DB outage ✓"); + console.log("[chaos] Indexer process alive during DB outage ✓"); - // Record the last ledger the indexer *knew about* before the crash const checkpointLedger = ledgerBefore; // ── 6. Resume the DB container ──────────────────────────────────────────── console.log("[chaos] Resuming DB container…"); exec("docker unpause wraith_chaos_db"); - console.log("[chaos] DB resumed."); - // ── 7. Wait for the indexer to recover and catch up ─────────────────────── - console.log("[chaos] Waiting for indexer recovery…"); + // ── 7. Wait for forward progress past the checkpoint ───────────────────── + // The indexer's withRetry loop will reconnect and resume from the saved + // lastIndexedLedger. We wait until it advances strictly beyond the checkpoint. + console.log("[chaos] Waiting for indexer to advance past checkpoint…"); await waitUntil( async () => { - const data = await fetchJson("/readyz"); - return data.ok && data.checks.db && data.checks.indexerCaughtUp; + const data = await fetchJson("/status"); + return (data.lastIndexedLedger ?? 
0) > checkpointLedger; }, RECOVERY_TIMEOUT_MS, - "indexer fully recovered and caught up" + `lastIndexedLedger > ${checkpointLedger}` ); - console.log("[chaos] Indexer recovered ✓"); - // ── 8. Collect post-recovery state ──────────────────────────────────────── - // Give it one more poll cycle to persist state - await sleep(POLL_INTERVAL_MS * 2); + // ── 8. Final assertions ─────────────────────────────────────────────────── const afterStatus = await fetchJson("/status"); const ledgerAfter = afterStatus.lastIndexedLedger ?? 0; const countAfter = afterStatus.totalIndexed; console.log( - `[chaos] Post-recovery state — lastIndexedLedger: ${ledgerAfter}, totalIndexed: ${countAfter}` + `[chaos] Post-recovery — lastIndexedLedger: ${ledgerAfter}, totalIndexed: ${countAfter}` ); - // ── 9. Assertions ───────────────────────────────────────────────────────── - - // 9a. Indexer resumed from its checkpoint, not from ledger 0 - expect(ledgerAfter).toBeGreaterThanOrEqual(checkpointLedger); - console.log(`[chaos] Checkpoint preserved: ${checkpointLedger} → ${ledgerAfter} ✓`); - - // 9b. Ledger advanced after recovery — indexer didn't stall + // 8a. Indexer resumed from checkpoint — did NOT reset to ledger 0 expect(ledgerAfter).toBeGreaterThan(checkpointLedger); - console.log("[chaos] Ledger progressed after recovery ✓"); + console.log(`[chaos] Checkpoint preserved: ${checkpointLedger} → ${ledgerAfter} ✓`); - // 9c. Transfer count only increased — no data was lost + // 8b. Transfer count only increased — no data lost expect(countAfter).toBeGreaterThanOrEqual(countBefore); console.log(`[chaos] Data integrity: ${countBefore} → ${countAfter} transfers ✓`); - // 9d. Uniqueness: the DB UNIQUE constraint on eventId is the ultimate guard. - // Verify by querying the duplicate-check endpoint — if the indexer had - // silently double-written any event, the count would equal or exceed - // the expected value but the DB would have skipped it via skipDuplicates. 
- // We assert the indexer is still ok (not crashed) as the final signal. - const finalHealth = await fetchJson<{ ok: boolean; uptime: number }>("/healthz"); + // 8c. Process still healthy + const finalHealth = await fetchJson<{ ok: boolean }>("/healthz"); expect(finalHealth.ok).toBe(true); console.log("[chaos] Indexer healthy post-recovery ✓"); - console.log("[chaos] All assertions passed — chaos test complete."); + console.log("[chaos] All assertions passed."); }); }); diff --git a/tests/chaos/docker-compose.chaos.yml b/tests/chaos/docker-compose.chaos.yml index c6710f13..de930af1 100644 --- a/tests/chaos/docker-compose.chaos.yml +++ b/tests/chaos/docker-compose.chaos.yml @@ -1,8 +1,7 @@ -version: "3.9" - # Dedicated compose stack for the DB-restart chaos test. # Uses a separate container name and port (5433 / 3001) so it never # collides with a local development stack running on the default ports. +# Note: no top-level 'version' key — obsolete in Compose v2+. services: # ── Postgres ────────────────────────────────────────────────────────────────── From ad25e5be14a0f8054713cf4d5a47e8446cc42bf1 Mon Sep 17 00:00:00 2001 From: Ipramking Date: Sat, 30 May 2026 11:04:08 +0100 Subject: [PATCH 4/7] fix(chaos): use docker compose --wait + wraith healthcheck for startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause: prisma db push runs synchronously before Express starts, so /healthz is unreachable for 1-2min in CI. The 60s HEALTHZ_TIMEOUT_MS expired before the HTTP server was up. 
Fix: - Add healthcheck to wraith service in docker-compose.chaos.yml: wget /healthz every 5s, up to 60 retries (5min max), 10s start_period - Use 'docker compose up --wait' which blocks until all healthchecks pass — covers full startup (container boot + prisma push + listen) - HEALTHZ_TIMEOUT_MS now just a 30s fallback sanity check (should be instant after --wait returns) - FIRST_LEDGER_TIMEOUT_MS reduced to 2min (server is already up) --- tests/chaos/db-restart.test.ts | 25 +++++++++++++++---------- tests/chaos/docker-compose.chaos.yml | 10 ++++++++-- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/tests/chaos/db-restart.test.ts b/tests/chaos/db-restart.test.ts index fe14c9f3..b98ffa9d 100644 --- a/tests/chaos/db-restart.test.ts +++ b/tests/chaos/db-restart.test.ts @@ -23,10 +23,11 @@ const COMPOSE_FILE = path.resolve(__dirname, "docker-compose.chaos.yml"); const COMPOSE_CMD = `docker compose -f "${COMPOSE_FILE}"`; const API_BASE = "http://localhost:3001"; -// Phase 1: wait for /healthz (process alive, no DB needed) -const HEALTHZ_TIMEOUT_MS = 60_000; // 1 min — container start + node boot -// Phase 2: wait for first ledger indexed (DB connected + first poll done) -const FIRST_LEDGER_TIMEOUT_MS = 180_000; // 3 min — prisma push + first RPC poll +// docker compose up --wait already blocks until the wraith healthcheck passes, +// so this is just a short fallback for the fetch-level check. +const HEALTHZ_TIMEOUT_MS = 30_000; // 30s — should be instant after --wait +// Phase 2: wait for first ledger indexed (first successful RPC poll + DB write) +const FIRST_LEDGER_TIMEOUT_MS = 120_000; // 2 min — first poll after server is up const PAUSE_DURATION_MS = 15_000; // DB outage window const RECOVERY_TIMEOUT_MS = 90_000; // wait for indexer to advance past checkpoint const POLL_INTERVAL_MS = 3_000; // polling cadence @@ -108,14 +109,18 @@ describe("Chaos: DB restart mid-ingest", () => { } // ── 1. 
Start the chaos stack ─────────────────────────────────────────────── - console.log("[chaos] Building and starting containers…"); - exec(`${COMPOSE_CMD} up -d --build`); + // --wait blocks until every service with a healthcheck reports healthy. + // The wraith service healthcheck polls /healthz, so this covers the full + // startup sequence: container boot → prisma db push → Express listen. + // Timeout is set via the healthcheck retries in docker-compose.chaos.yml + // (60 × 5s = 5 min max). + console.log("[chaos] Building and starting containers (waiting for healthy)…"); + exec(`${COMPOSE_CMD} up -d --build --wait`); composeStarted = true; + console.log("[chaos] All services healthy."); - // ── 2a. Phase 1: wait for Node.js process to be alive ──────────────────── - // /healthz returns 200 as soon as the Express server is up. - // It does NOT require a DB connection, so this completes quickly. - console.log("[chaos] Waiting for Wraith process to boot…"); + // ── 2a. Phase 1: quick sanity-check that /healthz is reachable ─────────── + console.log("[chaos] Confirming /healthz is reachable…"); await waitUntil( async () => { const data = await fetchJson<{ ok: boolean }>("/healthz"); diff --git a/tests/chaos/docker-compose.chaos.yml b/tests/chaos/docker-compose.chaos.yml index de930af1..70f683d4 100644 --- a/tests/chaos/docker-compose.chaos.yml +++ b/tests/chaos/docker-compose.chaos.yml @@ -1,7 +1,6 @@ # Dedicated compose stack for the DB-restart chaos test. # Uses a separate container name and port (5433 / 3001) so it never # collides with a local development stack running on the default ports. -# Note: no top-level 'version' key — obsolete in Compose v2+. 
services: # ── Postgres ────────────────────────────────────────────────────────────────── @@ -33,9 +32,16 @@ services: DATABASE_URL: postgresql://wraith:wraith@db:5432/wraith DIRECT_DATABASE_URL: postgresql://wraith:wraith@db:5432/wraith STELLAR_NETWORK: testnet - # Fast poll cycle so the test completes in reasonable time POLL_INTERVAL_MS: "3000" EVENTS_BATCH_SIZE: "500" PORT: "3001" ports: - "3001:3001" + # Healthcheck lets `docker compose up --wait` block until the HTTP server + # is accepting connections (after prisma db push completes). + healthcheck: + test: ["CMD-SHELL", "wget -q -O- http://localhost:3001/healthz || exit 1"] + interval: 5s + timeout: 5s + retries: 60 # 60 × 5s = 5 min max (covers slow prisma push in CI) + start_period: 10s From 82b2db90694137003411e893753c426f10e9cb7d Mon Sep 17 00:00:00 2001 From: Ipramking Date: Sat, 30 May 2026 11:14:01 +0100 Subject: [PATCH 5/7] fix(chaos): use builder stage to keep prisma CLI available MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause: the production Dockerfile stage runs 'npm ci --omit=dev' which drops the prisma CLI (devDependency). The CMD then runs 'npx prisma migrate deploy' which cannot find prisma and exits(1) immediately, before --wait or /healthz could ever succeed. Fix: build from 'target: builder' which runs 'npm ci' (all deps, including prisma CLI). Override command to 'node dist/index.js' directly — the app's index.ts already calls 'npx prisma db push' internally using the prisma CLI that is now present in node_modules. --- tests/chaos/docker-compose.chaos.yml | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/chaos/docker-compose.chaos.yml b/tests/chaos/docker-compose.chaos.yml index 70f683d4..106eb980 100644 --- a/tests/chaos/docker-compose.chaos.yml +++ b/tests/chaos/docker-compose.chaos.yml @@ -24,6 +24,14 @@ services: build: context: ../.. 
dockerfile: Dockerfile + # Use the builder stage (npm ci without --omit=dev) so the prisma CLI + # is available. The production stage drops dev deps and the CMD runs + # `prisma migrate deploy` which fails when there are no migration files. + # The builder stage has all deps; we override the command below. + target: builder + # Override the Dockerfile CMD: the app's index.ts already runs + # `npx prisma db push` internally, so we just start the compiled app. + command: node dist/index.js container_name: wraith_chaos_indexer depends_on: db: @@ -38,10 +46,10 @@ services: ports: - "3001:3001" # Healthcheck lets `docker compose up --wait` block until the HTTP server - # is accepting connections (after prisma db push completes). + # is accepting connections (covers prisma db push + Express listen). healthcheck: test: ["CMD-SHELL", "wget -q -O- http://localhost:3001/healthz || exit 1"] interval: 5s timeout: 5s - retries: 60 # 60 × 5s = 5 min max (covers slow prisma push in CI) - start_period: 10s + retries: 60 # 60 × 5s = 5 min max + start_period: 15s From c269f594f30db9bc9d6cfaad14a3bf56d427df82 Mon Sep 17 00:00:00 2001 From: Ipramking Date: Sat, 30 May 2026 11:23:17 +0100 Subject: [PATCH 6/7] chore(chaos): dump container logs on --wait failure for diagnostics The container exits with code 1 but the CI log only shows the exit code, not the actual error from inside the container. Wrap the 'docker compose up --wait' call in try/catch and dump both wraith and db logs before rethrowing so we can see what's crashing. 
--- tests/chaos/db-restart.test.ts | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/tests/chaos/db-restart.test.ts b/tests/chaos/db-restart.test.ts index b98ffa9d..2179923d 100644 --- a/tests/chaos/db-restart.test.ts +++ b/tests/chaos/db-restart.test.ts @@ -115,8 +115,30 @@ describe("Chaos: DB restart mid-ingest", () => { // Timeout is set via the healthcheck retries in docker-compose.chaos.yml // (60 × 5s = 5 min max). console.log("[chaos] Building and starting containers (waiting for healthy)…"); - exec(`${COMPOSE_CMD} up -d --build --wait`); - composeStarted = true; + try { + exec(`${COMPOSE_CMD} up -d --build --wait`); + composeStarted = true; + } catch (err) { + // If --wait fails (container exited or unhealthy), dump container logs + // so we can see why the app crashed instead of just "exit code 1". + composeStarted = true; // ensure afterAll cleanup still runs + console.error("[chaos] docker compose up --wait failed."); + try { + const wraithLogs = execSync(`${COMPOSE_CMD} logs wraith`, { encoding: "utf8" }); + console.error("─── wraith container logs ───"); + console.error(wraithLogs); + } catch (logErr) { + console.error("[chaos] Could not retrieve wraith logs:", (logErr as Error).message); + } + try { + const dbLogs = execSync(`${COMPOSE_CMD} logs db --tail 30`, { encoding: "utf8" }); + console.error("─── db container logs (last 30 lines) ───"); + console.error(dbLogs); + } catch (logErr) { + console.error("[chaos] Could not retrieve db logs:", (logErr as Error).message); + } + throw err; + } console.log("[chaos] All services healthy."); // ── 2a. 
Phase 1: quick sanity-check that /healthz is reachable ─────────── From 65079ec190ab875fd390a365cc377a5b2e1bad4b Mon Sep 17 00:00:00 2001 From: Ipramking Date: Sat, 30 May 2026 11:29:55 +0100 Subject: [PATCH 7/7] fix(chaos): use Debian-based Dockerfile to avoid alpine+prisma OpenSSL bug Container logs (now visible thanks to previous diagnostic commit) show: prisma:warn Prisma failed to detect the libssl/openssl version to use, Defaulting to 'openssl-1.1.x'. Please manually install OpenSSL... Error: Command failed: npx prisma db push --accept-data-loss The project's prisma/schema.prisma declares binaryTargets = ['native', 'debian-openssl-3.0.x'] but the main Dockerfile uses node:20-alpine (musl + OpenSSL 3). The 'native' probe fails on alpine, Prisma falls back to OpenSSL 1.1.x binaries that aren't installed, and 'prisma db push' crashes the container during startup with exit code 1. Fix: add tests/chaos/Dockerfile.chaos based on node:20-slim (Debian) which matches the existing 'debian-openssl-3.0.x' target. All dev deps (including the prisma CLI) are installed so the app's startup-time 'npx prisma db push' works. Switch the chaos compose to use this Dockerfile via dockerfile: tests/chaos/Dockerfile.chaos. The main Dockerfile is untouched. --- tests/chaos/Dockerfile.chaos | 34 ++++++++++++++++++++++++++++ tests/chaos/docker-compose.chaos.yml | 16 ++++++------- 2 files changed, 41 insertions(+), 9 deletions(-) create mode 100644 tests/chaos/Dockerfile.chaos diff --git a/tests/chaos/Dockerfile.chaos b/tests/chaos/Dockerfile.chaos new file mode 100644 index 00000000..c5ef4d53 --- /dev/null +++ b/tests/chaos/Dockerfile.chaos @@ -0,0 +1,34 @@ +# Chaos-test-only Dockerfile. 
+# ───────────────────────────── +# Uses node:20-slim (Debian) instead of the project's node:20-alpine to +# avoid the Prisma + alpine musl OpenSSL detection issue, which makes +# `prisma db push` crash on startup with: +# prisma:warn Prisma failed to detect the libssl/openssl version to use, +# Defaulting to "openssl-1.1.x". +# Error: Command failed: npx prisma db push --accept-data-loss +# +# The project's prisma/schema.prisma declares +# binaryTargets = ["native", "debian-openssl-3.0.x"] +# which works natively on Debian. Keeps all dev deps (prisma CLI) so the +# app's `npx prisma db push` runs without an extra install round-trip. + +FROM node:20-slim + +# OpenSSL 3 runtime is needed by the Prisma query engine. +RUN apt-get update \ + && apt-get install -y --no-install-recommends openssl ca-certificates wget \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Install all deps (including prisma CLI for db push at runtime) +COPY package*.json ./ +RUN npm ci + +# Generate Prisma client + compile TypeScript +COPY prisma ./prisma +RUN npx prisma generate +COPY . . +RUN npm run build + +CMD ["node", "dist/index.js"] diff --git a/tests/chaos/docker-compose.chaos.yml b/tests/chaos/docker-compose.chaos.yml index 106eb980..a1c933c4 100644 --- a/tests/chaos/docker-compose.chaos.yml +++ b/tests/chaos/docker-compose.chaos.yml @@ -23,15 +23,13 @@ services: wraith: build: context: ../.. - dockerfile: Dockerfile - # Use the builder stage (npm ci without --omit=dev) so the prisma CLI - # is available. The production stage drops dev deps and the CMD runs - # `prisma migrate deploy` which fails when there are no migration files. - # The builder stage has all deps; we override the command below. - target: builder - # Override the Dockerfile CMD: the app's index.ts already runs - # `npx prisma db push` internally, so we just start the compiled app. 
- command: node dist/index.js + # Use the chaos-specific Debian-based Dockerfile to avoid the Prisma + + # alpine musl OpenSSL detection bug that crashes `prisma db push` on + # startup. The project's prisma/schema.prisma declares + # binaryTargets = ["native", "debian-openssl-3.0.x"] which works natively + # on Debian. The chaos Dockerfile keeps all dev deps (prisma CLI) so the + # app's runtime `npx prisma db push` works. + dockerfile: tests/chaos/Dockerfile.chaos container_name: wraith_chaos_indexer depends_on: db: