Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions .github/workflows/chaos.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
name: Chaos tests

on:
pull_request:
paths:
- "tests/chaos/**"
- "src/**"
- "prisma/**"
- "Dockerfile"
- ".github/workflows/chaos.yml"

jobs:
chaos:
name: DB restart chaos test
runs-on: ubuntu-latest
timeout-minutes: 15

steps:
- uses: actions/checkout@v4

- name: Set up Node.js
uses: actions/setup-node@v4
with:
node-version: "20"
cache: "npm"

- name: Install dependencies
run: npm ci

- name: Verify Docker is available
run: docker info

- name: Run chaos tests
run: npm run test:chaos
env:
# Sets the network for the Jest process only; the indexer container takes
# STELLAR_NETWORK from tests/chaos/docker-compose.chaos.yml, not from here.
STELLAR_NETWORK: testnet
10 changes: 10 additions & 0 deletions jest.chaos.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Jest configuration for the chaos suite; selected by the `test:chaos`
// npm script via --config=jest.chaos.config.js.
/** @type {import('jest').Config} */
module.exports = {
preset: "ts-jest",
testEnvironment: "node",
// Restrict discovery to tests/chaos so only chaos tests are picked up.
roots: ["<rootDir>/tests/chaos"],
testMatch: ["**/*.test.ts"],
moduleFileExtensions: ["ts", "js", "json"],
clearMocks: true,
// NOTE(review): the test:chaos script also passes --testTimeout=300000 and
// db-restart.test.ts calls jest.setTimeout(600_000) — confirm which budget is intended.
testTimeout: 300_000, // 5 minutes — Docker build + full chaos scenario
};
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"start": "node dist/index.js",
"test": "jest --runInBand",
"test:watch": "jest --watch",
"test:chaos": "jest --runInBand --testPathPatterns=tests/chaos --testTimeout=300000 --config=jest.chaos.config.js",
"db:generate": "prisma generate",
"db:migrate": "prisma migrate dev",
"db:push": "prisma db push",
Expand Down
34 changes: 34 additions & 0 deletions tests/chaos/Dockerfile.chaos
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Chaos-test-only Dockerfile.
# ─────────────────────────────
# Uses node:20-slim (Debian) instead of the project's node:20-alpine to
# avoid the Prisma + alpine musl OpenSSL detection issue, which makes
# `prisma db push` crash on startup with:
# prisma:warn Prisma failed to detect the libssl/openssl version to use,
# Defaulting to "openssl-1.1.x".
# Error: Command failed: npx prisma db push --accept-data-loss
#
# The project's prisma/schema.prisma declares
# binaryTargets = ["native", "debian-openssl-3.0.x"]
# which works natively on Debian. Keeps all dev deps (prisma CLI) so the
# app's `npx prisma db push` runs without an extra install round-trip.

FROM node:20-slim

# OpenSSL 3 runtime is needed by the Prisma query engine; wget is used by the
# wraith healthcheck in docker-compose.chaos.yml.
RUN apt-get update \
&& apt-get install -y --no-install-recommends openssl ca-certificates wget \
&& rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install all deps (including prisma CLI for db push at runtime)
COPY package*.json ./
RUN npm ci

# Generate Prisma client + compile TypeScript
COPY prisma ./prisma
RUN npx prisma generate
COPY . .
RUN npm run build

CMD ["node", "dist/index.js"]
236 changes: 236 additions & 0 deletions tests/chaos/db-restart.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
/**
* Chaos test: DB restart mid-ingest
* ─────────────────────────────────
* Spins up a Wraith + Postgres stack, lets the indexer ingest live testnet
* events, pauses the DB container to simulate a crash, then resumes and
* asserts:
* 1. The indexer process stayed alive during the outage.
* 2. After recovery, lastIndexedLedger advanced beyond the pre-pause value.
* 3. No data was lost (transfer count only ever increases).
*
* Prerequisites: Docker + Docker Compose v2 must be available in PATH.
* When Docker is not detected the test returns early and is reported as passing
* (it is not marked as skipped).
*
* Run with:
* npm run test:chaos
*/

import { execSync, spawnSync } from "child_process";
import path from "path";

// ─── Config ───────────────────────────────────────────────────────────────────
// The compose file sits next to this test file. The path is double-quoted inside
// COMPOSE_CMD because the command is passed to the shell as a single string.
const COMPOSE_FILE = path.resolve(__dirname, "docker-compose.chaos.yml");
const COMPOSE_CMD = `docker compose -f "${COMPOSE_FILE}"`;
// Port 3001 matches the host port published by the wraith service in
// docker-compose.chaos.yml.
const API_BASE = "http://localhost:3001";

// docker compose up --wait already blocks until the wraith healthcheck passes,
// so this is just a short fallback for the fetch-level check.
const HEALTHZ_TIMEOUT_MS = 30_000; // 30s — should be instant after --wait
// Phase 2: wait for first ledger indexed (first successful RPC poll + DB write)
const FIRST_LEDGER_TIMEOUT_MS = 120_000; // 2 min — first poll after server is up
const PAUSE_DURATION_MS = 15_000; // DB outage window
const RECOVERY_TIMEOUT_MS = 90_000; // wait for indexer to advance past checkpoint
const POLL_INTERVAL_MS = 3_000; // polling cadence

// NOTE(review): jest.chaos.config.js and the test:chaos script both set the test
// timeout to 300000 ms; this call requests 600000 ms for this file — confirm
// which value is meant to apply.
jest.setTimeout(600_000); // 10 minutes total budget

// ─── Helpers ──────────────────────────────────────────────────────────────────

/**
 * Runs `docker info` with its output piped (not echoed) and reports whether
 * the command exited with status 0.
 *
 * @returns true only when `docker info` exits with status 0.
 */
function dockerAvailable(): boolean {
  const { status } = spawnSync("docker", ["info"], { stdio: "pipe" });
  return status === 0;
}

/**
 * Runs a shell command synchronously and returns its stdout, decoded as UTF-8
 * with surrounding whitespace trimmed.
 *
 * @param cmd - Command line handed to the shell as-is.
 * @returns Trimmed stdout of the command.
 * @throws Whatever `execSync` throws, e.g. when the command exits non-zero.
 */
function exec(cmd: string): string {
  const stdout = execSync(cmd, { encoding: "utf8" });
  return stdout.trim();
}

/**
 * Returns a promise that resolves after the given delay.
 *
 * @param ms - Delay in milliseconds, passed straight to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}

/**
 * GETs `${API_BASE}${urlPath}` and returns the parsed JSON body.
 *
 * The request is aborted after `timeoutMs` so that a stalled endpoint (for
 * example one waiting on a frozen database) cannot block the caller for longer
 * than the polling deadlines used elsewhere in this file.
 *
 * @param urlPath - Path appended to API_BASE, e.g. "/healthz".
 * @param timeoutMs - Upper bound for the whole request, including reading the body.
 * @returns The response body parsed as JSON; the cast to T is unchecked.
 * @throws Error with "HTTP <status> from <path>" on a non-2xx response, or the
 *         rejection raised by `fetch` on a network failure or abort.
 */
async function fetchJson<T>(urlPath: string, timeoutMs = 10_000): Promise<T> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const res = await fetch(`${API_BASE}${urlPath}`, { signal: controller.signal });
    if (!res.ok) throw new Error(`HTTP ${res.status} from ${urlPath}`);
    // Await here so the abort timer still covers reading the body.
    return (await res.json()) as T;
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Shape this test expects from the JSON body of GET /status.
 * NOTE(review): the field notes below describe how this test uses each value;
 * confirm the exact semantics against the API handler.
 */
interface StatusResponse {
ok: boolean;
// Polled until non-null and > 0 before the outage, then compared against the
// pre-pause checkpoint after recovery.
lastIndexedLedger: number | null;
// Not read by this test.
latestLedger: number;
// Logged as the transfer count; asserted not to decrease across the outage.
totalIndexed: number;
}

/**
 * Polls `fn()` every POLL_INTERVAL_MS until it resolves to true or the
 * deadline passes.
 *
 * Errors thrown by `fn` are treated as transient and do not stop polling, but
 * the most recent one is remembered and appended to the timeout message so a
 * failed wait explains why the condition never held.
 *
 * @param fn - Condition to evaluate; a rejection counts as "not yet".
 * @param timeoutMs - Total time budget in milliseconds.
 * @param description - Human-readable condition, used in the timeout error.
 * @throws Error "Timed out waiting for: <description>" when the budget is exhausted.
 */
async function waitUntil(
  fn: () => Promise<boolean>,
  timeoutMs: number,
  description: string
): Promise<void> {
  const deadline = Date.now() + timeoutMs;
  let lastError: unknown;
  while (Date.now() < deadline) {
    try {
      if (await fn()) return;
      // fn ran cleanly but the condition is not met yet; drop any stale error.
      lastError = undefined;
    } catch (err) {
      // transient — keep polling, but remember why the attempt failed
      lastError = err;
    }
    await sleep(POLL_INTERVAL_MS);
  }
  let detail = "";
  if (lastError !== undefined) {
    const reason = lastError instanceof Error ? lastError.message : String(lastError);
    detail = ` (last error: ${reason})`;
  }
  throw new Error(`Timed out waiting for: ${description}${detail}`);
}

// ─── Lifecycle ────────────────────────────────────────────────────────────────

// Set once the test has attempted `docker compose up`; gates the teardown below.
let composeStarted = false;

// Teardown: remove the chaos stack (containers, volumes, orphans) if it was
// ever started. A cleanup failure is logged rather than rethrown.
afterAll(async () => {
  if (composeStarted) {
    console.log("[chaos] Tearing down containers…");
    try {
      exec(`${COMPOSE_CMD} down --volumes --remove-orphans`);
      console.log("[chaos] Containers removed.");
    } catch (cleanupError) {
      console.error("[chaos] Cleanup failed (manual removal may be needed):", cleanupError);
    }
  }
});

// ─── Main test ────────────────────────────────────────────────────────────────

/**
 * Prints one compose service's logs to stderr for post-mortem debugging.
 * Best-effort: a failure to fetch the logs is reported and swallowed so the
 * caller's original error is the one that surfaces.
 *
 * @param service - Compose service name (e.g. "wraith", "db").
 * @param banner - Header line printed before the logs.
 * @param extraArgs - Extra CLI arguments appended after the service name,
 *                    including their leading space (e.g. " --tail 30").
 */
function dumpServiceLogs(service: string, banner: string, extraArgs = ""): void {
  try {
    const logs = execSync(`${COMPOSE_CMD} logs ${service}${extraArgs}`, { encoding: "utf8" });
    console.error(banner);
    console.error(logs);
  } catch (logErr) {
    const reason = logErr instanceof Error ? logErr.message : String(logErr);
    console.error(`[chaos] Could not retrieve ${service} logs:`, reason);
  }
}

describe("Chaos: DB restart mid-ingest", () => {
  // Fixed container name assigned to the db service in docker-compose.chaos.yml.
  const DB_CONTAINER = "wraith_chaos_db";

  it("indexer resumes from checkpoint with no data loss after DB pause", async () => {
    // ── 0. Bail out if Docker unavailable ─────────────────────────────────────
    // Note: this returns early, so the test is reported as passing, not skipped.
    if (!dockerAvailable()) {
      console.warn("[chaos] Docker not available — skipping chaos test.");
      return;
    }

    // ── 1. Start the chaos stack ──────────────────────────────────────────────
    // --wait blocks until every service with a healthcheck reports healthy.
    // The wraith service healthcheck polls /healthz, so this covers the full
    // startup sequence: container boot → prisma db push → Express listen.
    // Timeout is set via the healthcheck retries in docker-compose.chaos.yml
    // (60 × 5s = 5 min max).
    console.log("[chaos] Building and starting containers (waiting for healthy)…");
    // Mark before starting: even a failed `up` can leave containers behind,
    // and afterAll only cleans up when this flag is set.
    composeStarted = true;
    try {
      exec(`${COMPOSE_CMD} up -d --build --wait`);
    } catch (err) {
      // If --wait fails (container exited or unhealthy), dump container logs
      // so we can see why the app crashed instead of just "exit code 1".
      console.error("[chaos] docker compose up --wait failed.");
      dumpServiceLogs("wraith", "─── wraith container logs ───");
      dumpServiceLogs("db", "─── db container logs (last 30 lines) ───", " --tail 30");
      throw err;
    }
    console.log("[chaos] All services healthy.");

    // ── 2a. Phase 1: quick sanity-check that /healthz is reachable ───────────
    console.log("[chaos] Confirming /healthz is reachable…");
    await waitUntil(
      async () => {
        const data = await fetchJson<{ ok: boolean }>("/healthz");
        return data.ok === true;
      },
      HEALTHZ_TIMEOUT_MS,
      "/healthz returns ok=true"
    );
    console.log("[chaos] Wraith process is alive.");

    // ── 2b. Phase 2: wait for first successful DB write ──────────────────────
    // Covers: prisma db push (schema migration) + first RPC poll + first upsert.
    // We poll /status until lastIndexedLedger becomes a positive integer.
    console.log("[chaos] Waiting for first ledger to be indexed (prisma push + first poll)…");
    await waitUntil(
      async () => {
        const data = await fetchJson<StatusResponse>("/status");
        return data.lastIndexedLedger !== null && data.lastIndexedLedger > 0;
      },
      FIRST_LEDGER_TIMEOUT_MS,
      "lastIndexedLedger > 0"
    );
    console.log("[chaos] Indexer is running and has persisted at least one ledger.");

    // ── 3. Snapshot pre-pause state ──────────────────────────────────────────
    const beforeStatus = await fetchJson<StatusResponse>("/status");
    // A null ledger collapses to 0 and is rejected by the assertion below,
    // so no non-null assertion is needed.
    const ledgerBefore = beforeStatus.lastIndexedLedger ?? 0;
    const countBefore = beforeStatus.totalIndexed;

    console.log(
      `[chaos] Pre-pause — lastIndexedLedger: ${ledgerBefore}, totalIndexed: ${countBefore}`
    );
    expect(ledgerBefore).toBeGreaterThan(0);

    const checkpointLedger = ledgerBefore;

    // ── 4. Pause the DB container ────────────────────────────────────────────
    console.log("[chaos] Pausing DB container…");
    exec(`docker pause ${DB_CONTAINER}`);

    try {
      // ── 5. Hold the pause — indexer must stay alive ────────────────────────
      console.log(`[chaos] Holding pause for ${PAUSE_DURATION_MS / 1000}s…`);
      await sleep(PAUSE_DURATION_MS);

      // Liveness probe must still respond — the Express server is independent of DB
      const liveness = await fetchJson<{ ok: boolean }>("/healthz");
      expect(liveness.ok).toBe(true);
      console.log("[chaos] Indexer process alive during DB outage ✓");
    } finally {
      // ── 6. Resume the DB container ─────────────────────────────────────────
      // In a finally so a failed liveness check never leaves the DB frozen.
      console.log("[chaos] Resuming DB container…");
      exec(`docker unpause ${DB_CONTAINER}`);
    }

    // ── 7. Wait for forward progress past the checkpoint ─────────────────────
    // The indexer's withRetry loop will reconnect and resume from the saved
    // lastIndexedLedger. We wait until it advances strictly beyond the checkpoint.
    console.log("[chaos] Waiting for indexer to advance past checkpoint…");
    await waitUntil(
      async () => {
        const data = await fetchJson<StatusResponse>("/status");
        return (data.lastIndexedLedger ?? 0) > checkpointLedger;
      },
      RECOVERY_TIMEOUT_MS,
      `lastIndexedLedger > ${checkpointLedger}`
    );

    // ── 8. Final assertions ──────────────────────────────────────────────────
    const afterStatus = await fetchJson<StatusResponse>("/status");
    const ledgerAfter = afterStatus.lastIndexedLedger ?? 0;
    const countAfter = afterStatus.totalIndexed;

    console.log(
      `[chaos] Post-recovery — lastIndexedLedger: ${ledgerAfter}, totalIndexed: ${countAfter}`
    );

    // 8a. Indexer resumed from checkpoint — did NOT reset to ledger 0
    expect(ledgerAfter).toBeGreaterThan(checkpointLedger);
    console.log(`[chaos] Checkpoint preserved: ${checkpointLedger} → ${ledgerAfter} ✓`);

    // 8b. Transfer count only increased — no data lost
    expect(countAfter).toBeGreaterThanOrEqual(countBefore);
    console.log(`[chaos] Data integrity: ${countBefore} → ${countAfter} transfers ✓`);

    // 8c. Process still healthy
    const finalHealth = await fetchJson<{ ok: boolean }>("/healthz");
    expect(finalHealth.ok).toBe(true);
    console.log("[chaos] Indexer healthy post-recovery ✓");

    console.log("[chaos] All assertions passed.");
  });
});
53 changes: 53 additions & 0 deletions tests/chaos/docker-compose.chaos.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Dedicated compose stack for the DB-restart chaos test.
# Uses a separate container name and port (5433 / 3001) so it never
# collides with a local development stack running on the default ports.

services:
# ── Postgres ──────────────────────────────────────────────────────────────────
db:
image: postgres:16-alpine
container_name: wraith_chaos_db
environment:
POSTGRES_USER: wraith
POSTGRES_PASSWORD: wraith
POSTGRES_DB: wraith
ports:
- "5433:5432"
healthcheck:
test: ["CMD-SHELL", "pg_isready -U wraith"]
interval: 3s
timeout: 3s
retries: 20

# ── Wraith indexer + API ───────────────────────────────────────────────────────
wraith:
build:
context: ../..
# Use the chaos-specific Debian-based Dockerfile to avoid the Prisma +
# alpine musl OpenSSL detection bug that crashes `prisma db push` on
# startup. The project's prisma/schema.prisma declares
# binaryTargets = ["native", "debian-openssl-3.0.x"] which works natively
# on Debian. The chaos Dockerfile keeps all dev deps (prisma CLI) so the
# app's runtime `npx prisma db push` works.
dockerfile: tests/chaos/Dockerfile.chaos
container_name: wraith_chaos_indexer
depends_on:
db:
condition: service_healthy
environment:
DATABASE_URL: postgresql://wraith:wraith@db:5432/wraith
DIRECT_DATABASE_URL: postgresql://wraith:wraith@db:5432/wraith
STELLAR_NETWORK: testnet
POLL_INTERVAL_MS: "3000"
EVENTS_BATCH_SIZE: "500"
PORT: "3001"
ports:
- "3001:3001"
# Healthcheck lets `docker compose up --wait` block until the HTTP server
# is accepting connections (covers prisma db push + Express listen).
healthcheck:
test: ["CMD-SHELL", "wget -q -O- http://localhost:3001/healthz || exit 1"]
interval: 5s
timeout: 5s
retries: 60 # 60 × 5s = 5 min max
start_period: 15s
Loading