From cffd6a6b81caa7e6dba9050d6d04064bdbeac57c Mon Sep 17 00:00:00 2001 From: Chris Lee Date: Wed, 3 Jun 2026 19:32:27 +1000 Subject: [PATCH 1/2] fix(idempotency): gate side-effecting handlers with Valkey claim to prevent redelivery duplicates Closes #202 Co-Authored-By: Claude Opus 4.8 --- CLAUDE.md | 2 +- docs/build/architecture.md | 4 +- src/webhook/events/issue-comment.ts | 6 ++ src/webhook/events/issues.ts | 3 + src/webhook/events/pull-request.ts | 3 + src/webhook/events/review-comment.ts | 3 + src/webhook/events/review.ts | 6 ++ src/webhook/idempotency.ts | 71 +++++++++++++++++ test/webhook/events/issues.test.ts | 29 +++++-- test/webhook/idempotency.test.ts | 109 +++++++++++++++++++++++++++ 10 files changed, 228 insertions(+), 8 deletions(-) create mode 100644 src/webhook/idempotency.ts create mode 100644 test/webhook/idempotency.test.ts diff --git a/CLAUDE.md b/CLAUDE.md index ff4701cc..112f1c2b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -62,7 +62,7 @@ Single HTTP server (`src/app.ts`) using `octokit` App class. Webhook events arri ## Key Concepts - **Async processing**: Webhook must respond within 10 seconds. All heavy work runs asynchronously after 200 OK. -- **Idempotency**: Two-layer guard. Fast path: in-memory `Map` keyed by `X-GitHub-Delivery` header (lost on restart). Durable: `isAlreadyProcessed()` checks GitHub for an existing tracking comment, survives pod restarts and OOM kills. +- **Idempotency**: GitHub webhooks are at-least-once, a delivery (auto-retry or operator redelivery) replays with the SAME `X-GitHub-Delivery` header for up to 3 days. The four side-effecting event handlers (`events/issue-comment.ts`, `events/review-comment.ts`, and the label branches of `events/issues.ts` + `events/pull-request.ts`) call `claimDelivery(deliveryId)` (`src/webhook/idempotency.ts`) at the very top of their dispatch path, before any LLM call, `workflow_runs` insert, or GitHub write. `claimDelivery` is a Valkey `SET key 1 NX EX 259200` claim: it returns `true` exactly once per `deliveryId` within the 3-day window (the redelivery gets `false` and the handler returns early). It is **fail-OPEN**, when Valkey is unconfigured or errors it returns `true`, degrading to at-least-once rather than dropping webhooks. `events/review.ts` is intentionally NOT gated: it fires only an idempotent reactor wake (no dispatch/write). Durable backstop behind the best-effort Valkey layer: the `idx_workflow_runs_inflight` partial-unique index, which makes the dispatcher reject a second in-flight run for the same workflow+target even if the Valkey claim was skipped (fail-open). `claimDelivery` also fails open when Valkey is configured-but-disconnected (gated on `isValkeyHealthy()` so a down connection skips the SET rather than blocking on Bun's offline queue). (The legacy in-memory `Map` + `isAlreadyProcessed()` tracking-comment scan lives on the `router.ts` `processRequest` path, which the production handlers bypass, issue #202.) - **Repo checkout**: Each request clones the repo to a unique temp dir. Claude operates on local files via `cwd`. - **MCP servers**: Comment updates, inline reviews, and Context7 for library docs. Git changes are made via git CLI (Bash tool) on the cloned repo. - **Scheduled actions**: a repo may ship a `.github-app.yaml` at its default-branch root declaring prompt-based actions on a cron schedule. The internal scheduler (`src/scheduler/`, gated by `SCHEDULER_ENABLED` + `DATABASE_URL` + non-empty `ALLOWED_OWNERS`) enqueues a `scheduled-action` job, a new job kind on the scoped-job rail, that the daemon runs as one agent session via `src/daemon/scheduled-action-executor.ts`. Missed cron slots are skipped, not backfilled. The prompt is owner-trusted config. Cron parsing uses the `cron-parser` dependency. The bot-provided `merge_readiness` MCP tool is exposed only when `SCHEDULER_ALLOW_AUTO_MERGE` env AND per-action `auto_merge` are both true; `allowed_tools` is owner-trusted config, so an action granted a merge-capable Bash tool can still merge regardless. `resolve.ts` FR-017 is untouched. diff --git a/docs/build/architecture.md b/docs/build/architecture.md index 66519ea5..670d117a 100644 --- a/docs/build/architecture.md +++ b/docs/build/architecture.md @@ -9,7 +9,7 @@ flowchart TD GH["GitHub webhook
POST /api/github/webhooks"]:::entry VERIFY["Verify HMAC-SHA256"]:::guard ACK["200 OK within 10 seconds"]:::ack - ROUTE["Router
idempotency + allowlist + concurrency"]:::guard + ROUTE["Handler dispatch
delivery claim + allowlist + concurrency"]:::guard TR["Haiku triage
binary heavy classifier"]:::decide QUEUE["Orchestrator job queue
Valkey list"]:::store SCALE{{"Scale-up decision
heavy OR queue >= threshold
AND no persistent slots
AND cooldown elapsed"}}:::fork @@ -44,7 +44,7 @@ flowchart TD ## Key concepts - **Async processing.** The webhook handler responds within ten seconds, so the router fires `processRequest` with fire-and-forget semantics after the 200 OK is queued. Every box downstream of `ACK` runs after the HTTP response is on the wire. -- **Two-layer idempotency.** The fast path is an in-memory `Map` keyed by `X-GitHub-Delivery`. The durable path (`isAlreadyProcessed` in `src/core/tracking-comment.ts`) scans GitHub issue/PR comments for the hidden delivery marker the bot embeds in the tracking comment, so duplicate deliveries are detected across pod restarts, OOM kills, and crash loops: this works **without** `DATABASE_URL`. `DATABASE_URL` is required only to persist execution / dispatch history across restarts. +- **Webhook delivery idempotency (issue #202).** GitHub is at-least-once: a delivery (auto-retry or operator redelivery) replays with the same `X-GitHub-Delivery` for up to 3 days. The four side-effecting handlers (`events/issue-comment.ts`, `events/review-comment.ts`, the label branches of `events/issues.ts` + `events/pull-request.ts`) call `claimDelivery(deliveryId)` (`src/webhook/idempotency.ts`) at the top of their dispatch path, before any LLM call, `workflow_runs` insert, or GitHub write. It is a Valkey `SET key 1 NX EX 259200` claim: `true` exactly once per delivery, `false` (and an early return) on a redelivery. It is **fail-open**, when Valkey is unconfigured or disconnected (gated on `isValkeyHealthy()`) it returns `true`, degrading to at-least-once rather than dropping or blocking webhooks. `events/review.ts` is exempt (idempotent reactor wake only). The durable backstop behind the best-effort Valkey layer is the `idx_workflow_runs_inflight` partial-unique index: the dispatcher rejects a second in-flight run for the same workflow+target even when the Valkey claim was skipped. The legacy in-memory `Map` + `isAlreadyProcessed` tracking-comment scan (`src/core/tracking-comment.ts`) now runs only on the `router.ts processRequest` path, which production handlers bypass. `DATABASE_URL` is required to persist execution / dispatch history and the in-flight guard across restarts. - **One request, one clone.** Each delivery clones the repo into a unique temp directory under `CLONE_BASE_DIR` **on the daemon host**. Claude operates on local files via `cwd`. On PR events the checkout supplementally fetches `origin/` (when it differs from the head ref) so the agent's `git diff origin/...HEAD` and `git rebase origin/` directives resolve first try. A sibling `${workDir}-artifacts` directory is created outside the checkout and exposed to the agent as `BOT_ARTIFACT_DIR`: workflow summary files (IMPLEMENT.md / REVIEW.md / RESOLVE.md) are written there so they can never be picked up by a `git add` inside the clone. Both directories are removed in the pipeline's `finally` block regardless of outcome. - **GitHub credential resolution.** `src/core/github-token.ts:resolveGithubToken()` is the single source of the GitHub credential the daemon uses. Default is an App installation token minted just-in-time from the cached `App` singleton in `src/orchestrator/connection-handler.ts`. When `GITHUB_PERSONAL_ACCESS_TOKEN` is set, the helper short-circuits and returns the PAT instead, API/git authentication runs as the PAT owner. Commit author/committer metadata is **not** affected; `src/core/checkout.ts` hard-pins git `user.name`/`user.email` to `chrisleekr-bot[bot]` so commit objects still carry the bot identity. The git credential helper, executor `GH_TOKEN`/`GITHUB_TOKEN` env vars, and MCP server env all consume the resolved string without caring about its source. - **The webhook server never runs the pipeline.** Only daemons execute `runPipeline`. The webhook server is the orchestrator: it enqueues jobs and optionally spawns ephemeral daemons. diff --git a/src/webhook/events/issue-comment.ts b/src/webhook/events/issue-comment.ts index 50ef9d00..71b65334 100644 --- a/src/webhook/events/issue-comment.ts +++ b/src/webhook/events/issue-comment.ts @@ -10,6 +10,7 @@ import { addReaction } from "../../utils/reactions"; import { dispatchByIntent } from "../../workflows/dispatcher"; import { dispatchCommentSurface } from "../../workflows/ship/command-dispatch"; import { isOwnerAllowed } from "../authorize"; +import { claimDelivery } from "../idempotency"; /** * Handler for issue_comment.created events. @@ -84,6 +85,11 @@ export function handleIssueComment( // only on PR comments. Canonical wins; legacy `dispatchByIntent` // runs only when canonical produced no command. void (async (): Promise => { + // Idempotency gate (issue #202): GitHub redelivers with the same + // deliveryId, so a redelivery would re-run both the canonical NL classifier + // and the legacy intent LLM call (and any chat-thread turn). Claim the + // delivery before any dispatch; a redelivery skips. Fail-open in claimDelivery. + if (!(await claimDelivery(deliveryId, log))) return; const dispatchLog = log.child({ event_surface: eventSurface }); let canonicalHandled = false; try { diff --git a/src/webhook/events/issues.ts b/src/webhook/events/issues.ts index 58462e44..07ca7d96 100644 --- a/src/webhook/events/issues.ts +++ b/src/webhook/events/issues.ts @@ -7,6 +7,7 @@ import { dispatchByLabel } from "../../workflows/dispatcher"; import { dispatchCanonicalCommand } from "../../workflows/ship/command-dispatch"; import { routeTrigger } from "../../workflows/ship/trigger-router"; import { isOwnerAllowed } from "../authorize"; +import { claimDelivery } from "../idempotency"; // Permits hyphenated verbs (e.g. `bot:open-pr`, `bot:fix-thread`); the verb // must start with a letter and may contain `-`-separated lowercase segments. @@ -97,6 +98,8 @@ export function handleIssues(octokit: Octokit, payload: IssuesEvent, deliveryId: const installationId = payload.installation?.id; void (async (): Promise => { + // Idempotency gate (issue #202): skip a redelivery before any dispatch. + if (!(await claimDelivery(deliveryId, log))) return; if (installationId !== undefined) { try { const command = await routeTrigger({ diff --git a/src/webhook/events/pull-request.ts b/src/webhook/events/pull-request.ts index 0783bc99..7ff4ba67 100644 --- a/src/webhook/events/pull-request.ts +++ b/src/webhook/events/pull-request.ts @@ -8,6 +8,7 @@ import { dispatchCanonicalCommand } from "../../workflows/ship/command-dispatch" import { fireReactor } from "../../workflows/ship/reactor-bridge"; import { routeTrigger } from "../../workflows/ship/trigger-router"; import { isOwnerAllowed } from "../authorize"; +import { claimDelivery } from "../idempotency"; // Permits the documented label shapes: // bot:ship, bot:abort-ship, bot:fix-thread, bot:investigate, ... @@ -179,6 +180,8 @@ function handlePullRequestLabeled( const installationId = payload.installation?.id; void (async (): Promise => { + // Idempotency gate (issue #202): skip a redelivery before any dispatch. + if (!(await claimDelivery(deliveryId, log))) return; if (installationId !== undefined) { try { const command = await routeTrigger({ diff --git a/src/webhook/events/review-comment.ts b/src/webhook/events/review-comment.ts index a6e6c593..31cae418 100644 --- a/src/webhook/events/review-comment.ts +++ b/src/webhook/events/review-comment.ts @@ -11,6 +11,7 @@ import { dispatchByIntent } from "../../workflows/dispatcher"; import { dispatchCommentSurface } from "../../workflows/ship/command-dispatch"; import { fireReactor } from "../../workflows/ship/reactor-bridge"; import { isOwnerAllowed } from "../authorize"; +import { claimDelivery } from "../idempotency"; /** * Handler for pull_request_review_comment.{created,edited,deleted} events. @@ -112,6 +113,8 @@ export function handleReviewComment( const threadId = String(topLevelCommentId); void (async (): Promise => { + // Idempotency gate (issue #202): skip a redelivery before any LLM dispatch. + if (!(await claimDelivery(deliveryId, log))) return; const dispatchLog = log.child({ thread_id: threadId, event_surface: "review-comment" }); let canonicalHandled = false; try { diff --git a/src/webhook/events/review.ts b/src/webhook/events/review.ts index da88949e..1ba68c39 100644 --- a/src/webhook/events/review.ts +++ b/src/webhook/events/review.ts @@ -10,6 +10,12 @@ import { fireReactor } from "../../workflows/ship/reactor-bridge"; * * Fires the ship reactor (T024) so any active intent on this PR wakes early * to inspect the new review state. + * + * No `claimDelivery` idempotency gate (issue #202): unlike the comment/label + * handlers, this fires only an idempotent reactor wake (no LLM dispatch, no + * workflow_runs row, no GitHub write). A redelivery just re-pokes an already- + * awake intent, which is harmless and self-deduping, so the dedup claim would + * add a Valkey round trip with nothing to protect. */ export function handleReview( _octokit: Octokit, diff --git a/src/webhook/idempotency.ts b/src/webhook/idempotency.ts new file mode 100644 index 00000000..f364c1a8 --- /dev/null +++ b/src/webhook/idempotency.ts @@ -0,0 +1,71 @@ +import type { Logger } from "pino"; + +import { getValkeyClient, isValkeyHealthy } from "../orchestrator/valkey"; + +/** + * Webhook delivery idempotency (issue #202). + * + * GitHub replays a delivery (automatic retry or operator-driven manual + * redelivery) with the SAME `X-GitHub-Delivery` header for up to 3 days, so + * webhooks are at-least-once. Without a dedup gate, a redelivery re-runs the + * full handler, including the triage/intent LLM calls and any chat-thread turn, + * double-billing and posting duplicate replies. `claimDelivery` is the + * canonical SET-NX-with-TTL idempotency claim, evaluated at the top of each + * handler's dispatch path before any side-effect. + */ + +// 3 days, matching GitHub's redelivery window. +const TTL_SECONDS = 259_200; +const KEY_PREFIX = "idemp:webhook:"; + +/** + * Claim a webhook delivery for processing. + * + * Returns `true` exactly once per `deliveryId` within the TTL window (the first + * caller proceeds); a redelivery gets `false` and must skip. + * + * Fail-OPEN: if Valkey is unavailable or errors, returns `true` so an outage + * degrades to at-least-once processing rather than dropping every webhook. The + * in-flight unique index (`idx_workflow_runs_inflight`) and the GitHub + * tracking-comment marker scan remain the durable backstops against duplicate + * work when this best-effort layer is unavailable. + */ +export async function claimDelivery(deliveryId: string, log: Logger): Promise { + const client = getValkeyClient(); + // `getValkeyClient()` returns a non-null client even while the TCP connection + // is down (it is null only when VALKEY_URL is unset). Bun's RedisClient + // defaults to `enableOfflineQueue: true`, so issuing SET against a + // disconnected client would QUEUE and block (up to the 10s connectionTimeout) + // instead of failing open. Gate on `isValkeyHealthy()` (the same liveness + // signal `router.ts` dispatch guards use) so a configured-but-down Valkey + // takes the immediate fail-open path, leaving the durable backstops + // (`idx_workflow_runs_inflight` + tracking-comment marker scan) to dedup. + if (client === null || !isValkeyHealthy()) { + log.warn({ deliveryId }, "claimDelivery: Valkey unavailable, proceeding (fail-open)"); + return true; + } + try { + // SET key 1 NX EX : returns "OK" iff the key did not exist (we won the + // claim); returns null when it already exists (a redelivery). + // `RedisClient.send` is typed `Promise`; SET-NX returns "OK" or null. + const res = (await client.send("SET", [ + `${KEY_PREFIX}${deliveryId}`, + "1", + "NX", + "EX", + String(TTL_SECONDS), + ])) as string | null; + if (res === "OK") return true; + log.info( + { deliveryId, event: "dedup-skip" }, + "claimDelivery: duplicate webhook delivery, skipping", + ); + return false; + } catch (err) { + log.warn( + { deliveryId, err: err instanceof Error ? err.message : String(err) }, + "claimDelivery: Valkey error, proceeding (fail-open)", + ); + return true; + } +} diff --git a/test/webhook/events/issues.test.ts b/test/webhook/events/issues.test.ts index fbdd0a20..8132be0a 100644 --- a/test/webhook/events/issues.test.ts +++ b/test/webhook/events/issues.test.ts @@ -13,6 +13,12 @@ * to the dispatcher, which relies on the partial unique index at the * runs-store layer. The handler is invoked twice → dispatcher is invoked * twice → second call returns {status:"refused", reason:"in-flight…"}. + * + * Dispatch is fire-and-forget inside a `void (async () => {...})()` IIFE whose + * first statement is `await claimDelivery(...)` (issue #202). That await defers + * the `dispatchByLabel` call past the synchronous test body, so the positive + * cases must drain the microtask queue (`flushMicrotasks`) before asserting, + * otherwise the deferred call also leaks into the next test after `mockClear`. */ import type { IssuesEvent } from "@octokit/webhooks-types"; @@ -47,6 +53,13 @@ const { handleIssues } = await import("../../../src/webhook/events/issues"); const fakeOctokit = {} as unknown as Octokit; +// Drain the microtask queue so the fire-and-forget dispatch IIFE runs past its +// leading `await claimDelivery(...)` gate (#202) and the `await dispatchByLabel` +// call lands before assertions / the next test's `mockClear`. +async function flushMicrotasks(): Promise { + for (let i = 0; i < 5; i++) await Promise.resolve(); +} + function issueLabeledPayload(overrides?: { labelName?: string; senderLogin?: string; @@ -76,8 +89,9 @@ describe("handleIssues", () => { ); }); - it("dispatches for bot:triage on open issue from allowed sender (T012)", () => { + it("dispatches for bot:triage on open issue from allowed sender (T012)", async () => { handleIssues(fakeOctokit, issueLabeledPayload({ labelName: "bot:triage" }), "delivery-1"); + await flushMicrotasks(); expect(mockDispatchByLabel).toHaveBeenCalledTimes(1); const call = mockDispatchByLabel.mock.calls[0] as unknown as [ @@ -116,8 +130,13 @@ describe("handleIssues", () => { }); it("T014: duplicate label events delegate to dispatcher, second invocation is refused by idempotency guard", async () => { - // First call: normal dispatch + // First call: normal dispatch. Drain BEFORE arming the once-impl so this + // dispatch resolves on the default "dispatched" path. Dispatch is deferred + // past the leading `await claimDelivery(...)` gate (#202), so without this + // drain the once-impl below would be consumed by THIS (first) call instead + // of the second one. handleIssues(fakeOctokit, issueLabeledPayload({ labelName: "bot:triage" }), "delivery-dup-1"); + await flushMicrotasks(); // Second call with the same (target, workflow): dispatcher reports the // partial unique index rejection surfaced by runs-store. @@ -130,9 +149,9 @@ describe("handleIssues", () => { ); handleIssues(fakeOctokit, issueLabeledPayload({ labelName: "bot:triage" }), "delivery-dup-2"); - // Let the micro-task queue drain so the fire-and-forget dispatch resolves. - await Promise.resolve(); - await Promise.resolve(); + // Let the micro-task queue drain so the second fire-and-forget dispatch + // resolves. + await flushMicrotasks(); expect(mockDispatchByLabel).toHaveBeenCalledTimes(2); // Second call's resolved outcome is the "in-flight" refusal, the handler diff --git a/test/webhook/idempotency.test.ts b/test/webhook/idempotency.test.ts new file mode 100644 index 00000000..2b0bd63c --- /dev/null +++ b/test/webhook/idempotency.test.ts @@ -0,0 +1,109 @@ +/** + * Issue #202: webhook delivery idempotency. + * + * `claimDelivery` is a Valkey SET-NX-EX claim that returns true exactly once + * per deliveryId (the first caller proceeds), false on a redelivery, and + * fail-OPEN (true) whenever Valkey is unavailable or errors. + */ + +import { beforeEach, describe, expect, it, mock } from "bun:test"; +import type { Logger } from "pino"; + +// Configurable Valkey stub: each test swaps `clientImpl` before importing / +// invoking claimDelivery. getValkeyClient returns whatever clientImpl holds. +type SendFn = (cmd: string, args: string[]) => Promise; +let clientImpl: { send: SendFn } | null = null; +let healthy = true; + +// `isValkeyHealthy` gates the SET: the non-null-client cases need it true to +// reach the command; the null-client case short-circuits on `client === null` +// before it is read; the configured-but-down case sets it false. +void mock.module("../../src/orchestrator/valkey", () => ({ + getValkeyClient: () => clientImpl, + isValkeyHealthy: () => healthy, +})); + +const { claimDelivery } = await import("../../src/webhook/idempotency"); + +const log = { + warn: () => {}, + info: () => {}, +} as unknown as Logger; + +// SET-NX-EX semantics: first SET of a key returns "OK", a second SET of the +// same key returns null (the key already exists). Mirrors real Valkey NX. +function nxClient(): { send: SendFn; store: Set } { + const store = new Set(); + return { + store, + send: (cmd, args) => { + if (cmd !== "SET") return Promise.resolve(null); + const key = args[0] ?? ""; + const hasNx = args.includes("NX"); + if (hasNx && store.has(key)) return Promise.resolve(null); + store.add(key); + return Promise.resolve("OK"); + }, + }; +} + +describe("claimDelivery (issue #202)", () => { + beforeEach(() => { + clientImpl = null; + healthy = true; + }); + + it("claims a new delivery once, then rejects the redelivery", async () => { + clientImpl = nxClient(); + const first = await claimDelivery("delivery-abc", log); + const second = await claimDelivery("delivery-abc", log); + expect(first).toBe(true); + expect(second).toBe(false); + }); + + it("treats distinct deliveryIds independently", async () => { + clientImpl = nxClient(); + expect(await claimDelivery("delivery-1", log)).toBe(true); + expect(await claimDelivery("delivery-2", log)).toBe(true); + }); + + it("issues SET with NX and a 3-day EX TTL", async () => { + const calls: string[][] = []; + clientImpl = { + send: (cmd, args) => { + calls.push([cmd, ...args]); + return Promise.resolve("OK"); + }, + }; + await claimDelivery("delivery-ttl", log); + expect(calls).toEqual([["SET", "idemp:webhook:delivery-ttl", "1", "NX", "EX", "259200"]]); + }); + + it("fails OPEN (true) when Valkey is unconfigured (null client)", async () => { + clientImpl = null; + expect(await claimDelivery("delivery-no-valkey", log)).toBe(true); + }); + + it("fails OPEN (true) when the Valkey SET throws", async () => { + clientImpl = { + send: () => Promise.reject(new Error("ECONNREFUSED")), + }; + expect(await claimDelivery("delivery-error", log)).toBe(true); + }); + + it("fails OPEN (true) without issuing SET when configured-but-disconnected", async () => { + // Non-null client (VALKEY_URL set) but the connection is down: gating on + // isValkeyHealthy() must skip the SET (which would otherwise queue/block + // under Bun's default enableOfflineQueue) and fail open immediately. + let sendCalled = false; + clientImpl = { + send: () => { + sendCalled = true; + return Promise.resolve("OK"); + }, + }; + healthy = false; + expect(await claimDelivery("delivery-disconnected", log)).toBe(true); + expect(sendCalled).toBe(false); + }); +}); From 8735a97335b8d8de2bcbc324d412ed18bf59212d Mon Sep 17 00:00:00 2001 From: Chris Lee Date: Wed, 3 Jun 2026 19:50:47 +1000 Subject: [PATCH 2/2] docs(idempotency): correct backstop doc comment and test wiring note from review Co-Authored-By: Claude Opus 4.8 --- src/webhook/idempotency.ts | 9 ++++++--- test/webhook/idempotency.test.ts | 5 +++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/webhook/idempotency.ts b/src/webhook/idempotency.ts index f364c1a8..31de559b 100644 --- a/src/webhook/idempotency.ts +++ b/src/webhook/idempotency.ts @@ -26,9 +26,12 @@ const KEY_PREFIX = "idemp:webhook:"; * * Fail-OPEN: if Valkey is unavailable or errors, returns `true` so an outage * degrades to at-least-once processing rather than dropping every webhook. The - * in-flight unique index (`idx_workflow_runs_inflight`) and the GitHub - * tracking-comment marker scan remain the durable backstops against duplicate - * work when this best-effort layer is unavailable. + * `idx_workflow_runs_inflight` partial-unique index remains the durable backstop + * against duplicate work when this best-effort layer is skipped: the dispatcher + * rejects a second in-flight run for the same workflow+target. (The + * tracking-comment marker scan via `isAlreadyProcessed` is NOT a backstop here: + * it runs only on the legacy `router.ts processRequest` path that production + * handlers bypass, issue #202.) */ export async function claimDelivery(deliveryId: string, log: Logger): Promise { const client = getValkeyClient(); diff --git a/test/webhook/idempotency.test.ts b/test/webhook/idempotency.test.ts index 2b0bd63c..65f9ae00 100644 --- a/test/webhook/idempotency.test.ts +++ b/test/webhook/idempotency.test.ts @@ -9,8 +9,9 @@ import { beforeEach, describe, expect, it, mock } from "bun:test"; import type { Logger } from "pino"; -// Configurable Valkey stub: each test swaps `clientImpl` before importing / -// invoking claimDelivery. getValkeyClient returns whatever clientImpl holds. +// Configurable Valkey stub: the mock is wired once at module load (below); +// each test swaps `clientImpl` (and `healthy`) before invoking claimDelivery. +// getValkeyClient returns whatever clientImpl holds at call time. type SendFn = (cmd: string, args: string[]) => Promise; let clientImpl: { send: SendFn } | null = null; let healthy = true;