From e22a82c8eb79be91f4e09c9217f9c60376cde862 Mon Sep 17 00:00:00 2001 From: "chrisleekr-bot[bot]" <2890262+chrisleekr-bot[bot]@users.noreply.github.com> Date: Thu, 11 Jun 2026 13:21:43 +0000 Subject: [PATCH 1/3] feat(observability): add structured retry.* events (#215) Replace text-only retry logs with structured `retry.attempt_failed`, `retry.non_retriable`, `retry.exhausted` events carrying `op`, `attempt`, `max_attempts`, `elapsed_ms`, plus a new `retry.succeeded_after_retry` info event as a weak-flake leading indicator (gated to attempt > 1, so first-try successes stay silent). Schema lives in `retry-log-fields.ts` with `.strict()` field-drift guarding, mirroring `octokit-observability.ts`. Co-Authored-By: Claude Opus 4.7 --- src/utils/retry-log-fields.ts | 52 +++++++++ src/utils/retry.ts | 118 +++++++++++++++---- test/utils/retry-log-fields.test.ts | 173 ++++++++++++++++++++++++++++ test/utils/retry.test.ts | 158 +++++++++++++++++++++++++ 4 files changed, 476 insertions(+), 25 deletions(-) create mode 100644 src/utils/retry-log-fields.ts create mode 100644 test/utils/retry-log-fields.test.ts diff --git a/src/utils/retry-log-fields.ts b/src/utils/retry-log-fields.ts new file mode 100644 index 0000000..38c70c7 --- /dev/null +++ b/src/utils/retry-log-fields.ts @@ -0,0 +1,52 @@ +/** + * Canonical pino log-field schema for `retryWithBackoff` observability (issue #215). + * + * Mirrors `src/utils/octokit-observability.ts` and `src/core/log-fields.ts`: a + * `.strict()` Zod shape pins the structured `retry.*` event family so the four + * emit sites in `retryWithBackoff` cannot drift on a field name (e.g. `delayMs` + * vs `delay_ms`) without the co-located test catching it. Emitters log plain + * objects via `log.warn` / `log.info` / `log.error`; the schema is the + * drift-prevention contract, not a runtime validator on the hot path. + * + * `attempt` is the 1-based attempt ordinal (aligns with OpenTelemetry + * `http.request.resend_count`). `elapsed_ms` is wall-clock since + * `retryWithBackoff` entry, so an `exhausted` line carries the full retry + * window's duration without parsing prior lines. `delay_ms` is the *next* + * backoff that will be slept (set on `retry.attempt_failed` only when another + * attempt will follow; omitted on the final attempt because no sleep will + * occur). + */ +import { z } from "zod"; + +export const RETRY_LOG_EVENTS = { + attemptFailed: "retry.attempt_failed", + nonRetriable: "retry.non_retriable", + exhausted: "retry.exhausted", + succeededAfterRetry: "retry.succeeded_after_retry", +} as const; + +/** + * `.strict()` shape for the four-event family. `err` is intentionally not + * enumerated here: it is serialized via pino's existing `errSerializer` and + * remains attached to the line as-is, mirroring the existing + * `GithubApiLogFieldsSchema` pattern (which also does not enumerate `err`). + * Callers test the structured scalar fields; pino serializes the error. + */ +export const RetryLogFieldsSchema = z + .object({ + event: z.enum([ + RETRY_LOG_EVENTS.attemptFailed, + RETRY_LOG_EVENTS.nonRetriable, + RETRY_LOG_EVENTS.exhausted, + RETRY_LOG_EVENTS.succeededAfterRetry, + ]), + op: z.string().min(1), + attempt: z.number().int().positive(), + max_attempts: z.number().int().positive(), + elapsed_ms: z.number().int().nonnegative(), + delay_ms: z.number().int().nonnegative().optional(), + status: z.number().int().optional(), + }) + .strict(); + +export type RetryLogFields = z.infer; diff --git a/src/utils/retry.ts b/src/utils/retry.ts index 25052d3..5e72452 100644 --- a/src/utils/retry.ts +++ b/src/utils/retry.ts @@ -2,6 +2,7 @@ import pino from "pino"; import type { Logger } from "../logger"; import { errSerializer, REDACT_PATHS, resolveLogLevel } from "./log-redaction"; +import { RETRY_LOG_EVENTS } from "./retry-log-fields"; // Config-free default logger (issue #184). Importing retry.ts must not pull in // src/logger.ts -> src/config, so the stdio MCP servers that use retry (e.g. @@ -40,6 +41,13 @@ export interface RetryOptions { * same), so callers can forward a maybe-undefined logger without a guard. */ log?: Logger | undefined; + /** + * Short dotted identifier for the wrapped operation (e.g. `"github.fetch"`, + * `"mcp.comment.update"`). Surfaces on every `retry.*` event so an operator + * can break the retry rate down per upstream call site. Defaults to + * `"unknown"` when omitted so emits always carry a non-empty `op`. + */ + op?: string | undefined; } /** @@ -90,6 +98,7 @@ export async function retryWithBackoff( maxDelayMs = 20000, backoffFactor = 2, log = defaultLog, + op = "unknown", } = options; // Fail fast on invalid input. Each check names the offending option and @@ -100,42 +109,70 @@ export async function retryWithBackoff( validateNumberOption("maxDelayMs", maxDelayMs, { min: 0 }); validateNumberOption("backoffFactor", backoffFactor, { min: 1 }); + const startedAt = Date.now(); let delayMs = initialDelayMs; let lastError: Error | undefined; for (let attempt = 1; attempt <= maxAttempts; attempt++) { try { // eslint-disable-next-line no-await-in-loop - return await operation(); + const value = await operation(); + // Weak-flake leading indicator: emit only when the call succeeded after + // at least one prior failure. First-try successes stay silent so the + // event count tracks the body of the transient-failure distribution, + // not normal traffic. + if (attempt > 1) { + log.info( + { + event: RETRY_LOG_EVENTS.succeededAfterRetry, + op, + attempt, + max_attempts: maxAttempts, + elapsed_ms: Date.now() - startedAt, + }, + "Operation succeeded after retry", + ); + } + return value; } catch (error) { lastError = error instanceof Error ? error : new Error(String(error)); + const elapsedMs = Date.now() - startedAt; - // Do not retry permanent client errors (4xx except 429 Too Many Requests). - // Octokit wraps HTTP errors with a .status property; non-HTTP errors lack it. - const status = (error as { status?: number }).status; - if (status !== undefined && status >= 400 && status < 500 && status !== 429) { - // Exception: GitHub delivers a *secondary* rate limit as HTTP 403 (not - // 429) with a "secondary rate limit" body. A status-only check would - // misclassify it as a permanent permission error, so inspect the - // message and let a secondary rate limit fall through to the backoff - // path. A plain 403 (no marker) still fails fast. The message is - // inspected only inside this 4xx branch, so non-4xx errors skip the - // string work. See issue #199. - const isSecondaryRateLimit = lastError.message - .toLowerCase() - .includes("secondary rate limit"); - if (!isSecondaryRateLimit) { - log.warn( - { attempt, status, err: lastError }, - "Non-retriable error, throwing immediately", - ); - throw lastError; - } + if (isNonRetriable(error, lastError)) { + const status = (error as { status?: number }).status; + log.warn( + { + event: RETRY_LOG_EVENTS.nonRetriable, + op, + attempt, + max_attempts: maxAttempts, + elapsed_ms: elapsedMs, + status, + err: lastError, + }, + "Non-retriable error, throwing immediately", + ); + throw lastError; } - log.warn({ attempt, maxAttempts, err: lastError }, "Operation attempt failed"); + // Compute the next delay BEFORE the emit so the line carries the delay + // that will actually be slept. Omit the field on the final attempt + // because no sleep will occur (the loop falls through to `exhausted`). + const willRetry = attempt < maxAttempts; + log.warn( + { + event: RETRY_LOG_EVENTS.attemptFailed, + op, + attempt, + max_attempts: maxAttempts, + elapsed_ms: elapsedMs, + ...(willRetry ? { delay_ms: delayMs } : {}), + err: lastError, + }, + "Operation attempt failed", + ); - if (attempt < maxAttempts) { + if (willRetry) { // eslint-disable-next-line no-await-in-loop await new Promise((resolve) => setTimeout(resolve, delayMs)); delayMs = Math.min(delayMs * backoffFactor, maxDelayMs); @@ -143,9 +180,40 @@ export async function retryWithBackoff( } } - log.error({ maxAttempts }, "Operation failed after all attempts"); + log.error( + { + event: RETRY_LOG_EVENTS.exhausted, + op, + attempt: maxAttempts, + max_attempts: maxAttempts, + elapsed_ms: Date.now() - startedAt, + err: lastError, + }, + "Operation failed after all attempts", + ); // Safe to assert: `maxAttempts >= 1` is enforced above, so the loop ran // at least once, meaning `lastError` was assigned in the catch block. // eslint-disable-next-line @typescript-eslint/no-non-null-assertion throw lastError!; } + +/** + * Decide whether an error is a permanent 4xx that should bypass retry. + * + * Octokit wraps HTTP errors with a `.status` property; non-HTTP errors lack + * it. A 4xx is non-retriable EXCEPT 429 (Too Many Requests) and a 403 whose + * message marks a GitHub *secondary* rate limit (delivered as 403, not 429). + * The secondary-rate-limit marker is inspected only inside the 4xx branch so + * non-4xx errors skip the string work. See issue #199. + * + * Extracted from `retryWithBackoff` so the main loop's complexity stays + * tractable as more structured-log branches accumulate. + */ +function isNonRetriable(error: unknown, normalized: Error): boolean { + const status = (error as { status?: number }).status; + if (status === undefined || status < 400 || status >= 500 || status === 429) { + return false; + } + const isSecondaryRateLimit = normalized.message.toLowerCase().includes("secondary rate limit"); + return !isSecondaryRateLimit; +} diff --git a/test/utils/retry-log-fields.test.ts b/test/utils/retry-log-fields.test.ts new file mode 100644 index 0000000..ae3c4ea --- /dev/null +++ b/test/utils/retry-log-fields.test.ts @@ -0,0 +1,173 @@ +import { describe, expect, it } from "bun:test"; + +import { RETRY_LOG_EVENTS, RetryLogFieldsSchema } from "../../src/utils/retry-log-fields"; + +describe("RETRY_LOG_EVENTS", () => { + it("pins the four canonical event strings", () => { + expect(RETRY_LOG_EVENTS.attemptFailed).toBe("retry.attempt_failed"); + expect(RETRY_LOG_EVENTS.nonRetriable).toBe("retry.non_retriable"); + expect(RETRY_LOG_EVENTS.exhausted).toBe("retry.exhausted"); + expect(RETRY_LOG_EVENTS.succeededAfterRetry).toBe("retry.succeeded_after_retry"); + }); +}); + +describe("RetryLogFieldsSchema: accepts well-formed events", () => { + it("accepts retry.attempt_failed with delay_ms (an attempt before the last)", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: RETRY_LOG_EVENTS.attemptFailed, + op: "github.fetch", + attempt: 1, + max_attempts: 3, + elapsed_ms: 42, + delay_ms: 5000, + }); + expect(result.success).toBe(true); + }); + + it("accepts retry.attempt_failed without delay_ms (the final attempt)", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: RETRY_LOG_EVENTS.attemptFailed, + op: "github.fetch", + attempt: 3, + max_attempts: 3, + elapsed_ms: 15_000, + }); + expect(result.success).toBe(true); + }); + + it("accepts retry.non_retriable with status", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: RETRY_LOG_EVENTS.nonRetriable, + op: "github.fetch", + attempt: 1, + max_attempts: 3, + elapsed_ms: 12, + status: 404, + }); + expect(result.success).toBe(true); + }); + + it("accepts retry.exhausted", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: RETRY_LOG_EVENTS.exhausted, + op: "github.fetch", + attempt: 3, + max_attempts: 3, + elapsed_ms: 30_000, + }); + expect(result.success).toBe(true); + }); + + it("accepts retry.succeeded_after_retry", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: RETRY_LOG_EVENTS.succeededAfterRetry, + op: "github.fetch", + attempt: 2, + max_attempts: 3, + elapsed_ms: 5_100, + }); + expect(result.success).toBe(true); + }); +}); + +describe("RetryLogFieldsSchema: rejects field-name drift", () => { + it("rejects camelCase delayMs (must be delay_ms)", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: RETRY_LOG_EVENTS.attemptFailed, + op: "github.fetch", + attempt: 1, + max_attempts: 3, + elapsed_ms: 42, + delayMs: 5000, + }); + expect(result.success).toBe(false); + }); + + it("rejects camelCase elapsedMs (must be elapsed_ms)", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: RETRY_LOG_EVENTS.attemptFailed, + op: "github.fetch", + attempt: 1, + max_attempts: 3, + elapsedMs: 42, + }); + expect(result.success).toBe(false); + }); + + it("rejects camelCase maxAttempts (must be max_attempts)", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: RETRY_LOG_EVENTS.attemptFailed, + op: "github.fetch", + attempt: 1, + maxAttempts: 3, + elapsed_ms: 42, + }); + expect(result.success).toBe(false); + }); + + it("rejects an unknown event literal", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: "retry.something_else", + op: "github.fetch", + attempt: 1, + max_attempts: 3, + elapsed_ms: 42, + }); + expect(result.success).toBe(false); + }); + + it("rejects an unknown extra field (strict)", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: RETRY_LOG_EVENTS.attemptFailed, + op: "github.fetch", + attempt: 1, + max_attempts: 3, + elapsed_ms: 42, + surprise: "boo", + }); + expect(result.success).toBe(false); + }); + + it("rejects negative elapsed_ms", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: RETRY_LOG_EVENTS.exhausted, + op: "github.fetch", + attempt: 3, + max_attempts: 3, + elapsed_ms: -1, + }); + expect(result.success).toBe(false); + }); + + it("rejects a non-integer attempt", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: RETRY_LOG_EVENTS.attemptFailed, + op: "github.fetch", + attempt: 1.5, + max_attempts: 3, + elapsed_ms: 42, + }); + expect(result.success).toBe(false); + }); + + it("rejects an empty op", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: RETRY_LOG_EVENTS.attemptFailed, + op: "", + attempt: 1, + max_attempts: 3, + elapsed_ms: 42, + }); + expect(result.success).toBe(false); + }); + + it("rejects a missing op", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: RETRY_LOG_EVENTS.attemptFailed, + attempt: 1, + max_attempts: 3, + elapsed_ms: 42, + }); + expect(result.success).toBe(false); + }); +}); diff --git a/test/utils/retry.test.ts b/test/utils/retry.test.ts index 02d7184..b156bc2 100644 --- a/test/utils/retry.test.ts +++ b/test/utils/retry.test.ts @@ -1,10 +1,32 @@ import { describe, expect, it, mock } from "bun:test"; import { retryWithBackoff } from "../../src/utils/retry"; +import { RETRY_LOG_EVENTS, RetryLogFieldsSchema } from "../../src/utils/retry-log-fields"; import { makeSilentLogger } from "../factories"; const silentLog = makeSilentLogger(); +/** + * Extract the first object-form argument of a captured `log.warn` / `log.info` + * / `log.error` call. pino emitters take `(obj, msg)` shape, so the first arg + * is the structured fields. Strips `err` (pino's errSerializer handles it) and + * any other non-schema scalars before returning so the result can be fed + * straight into `RetryLogFieldsSchema.parse`. + */ +function structuredFields(call: readonly unknown[]): Record { + const arg = call[0]; + if (typeof arg !== "object" || arg === null) { + throw new Error("expected first arg to be a structured fields object"); + } + const out: Record = {}; + for (const [k, v] of Object.entries(arg as Record)) { + if (k === "err") continue; + // eslint-disable-next-line security/detect-object-injection -- keys come from the test's own mock captures, not external input + out[k] = v; + } + return out; +} + function makeStatusError(status: number): Error & { status: number } { const err = new Error(`HTTP ${status}`) as Error & { status: number }; err.status = status; @@ -242,3 +264,139 @@ describe("retryWithBackoff: input validation", () => { await expectValidationError({ backoffFactor: 0.5 }, "backoffFactor must be >= 1"); }); }); + +describe("retryWithBackoff: structured log events (#215)", () => { + it("emits nothing on first-try success (succeeded_after_retry is gated on attempt > 1)", async () => { + const log = makeSilentLogger(); + const op = mock(() => Promise.resolve("ok")); + const result = await retryWithBackoff(op, { log, op: "test.success" }); + expect(result).toBe("ok"); + expect(log.info).toHaveBeenCalledTimes(0); + expect(log.warn).toHaveBeenCalledTimes(0); + expect(log.error).toHaveBeenCalledTimes(0); + }); + + it("emits retry.attempt_failed (with delay_ms) then retry.succeeded_after_retry on attempt-2 success", async () => { + const log = makeSilentLogger(); + let calls = 0; + const op = mock(() => { + calls++; + if (calls === 1) return Promise.reject(new Error("transient")); + return Promise.resolve("recovered"); + }); + const result = await retryWithBackoff(op, { + maxAttempts: 3, + initialDelayMs: 1, + log, + op: "test.attempt2", + }); + expect(result).toBe("recovered"); + + expect(log.warn).toHaveBeenCalledTimes(1); + const warnFields = structuredFields(log.warn.mock.calls[0] ?? []); + expect(warnFields).toMatchObject({ + event: RETRY_LOG_EVENTS.attemptFailed, + op: "test.attempt2", + attempt: 1, + max_attempts: 3, + delay_ms: 1, + }); + expect(typeof warnFields["elapsed_ms"]).toBe("number"); + expect(() => RetryLogFieldsSchema.parse(warnFields)).not.toThrow(); + + expect(log.info).toHaveBeenCalledTimes(1); + const infoFields = structuredFields(log.info.mock.calls[0] ?? []); + expect(infoFields).toMatchObject({ + event: RETRY_LOG_EVENTS.succeededAfterRetry, + op: "test.attempt2", + attempt: 2, + max_attempts: 3, + }); + expect(typeof infoFields["elapsed_ms"]).toBe("number"); + expect(() => RetryLogFieldsSchema.parse(infoFields)).not.toThrow(); + }); + + it("emits retry.non_retriable (with status) on a 404 and rethrows", async () => { + const log = makeSilentLogger(); + const err = new Error("HTTP 404") as Error & { status: number }; + err.status = 404; + const op = mock(() => Promise.reject(err)); + + let thrown: unknown; + try { + await retryWithBackoff(op, { + maxAttempts: 3, + initialDelayMs: 1, + log, + op: "test.nonretriable", + }); + } catch (e) { + thrown = e; + } + expect(thrown).toBe(err); + + expect(log.warn).toHaveBeenCalledTimes(1); + const fields = structuredFields(log.warn.mock.calls[0] ?? []); + expect(fields).toMatchObject({ + event: RETRY_LOG_EVENTS.nonRetriable, + op: "test.nonretriable", + attempt: 1, + max_attempts: 3, + status: 404, + }); + expect(typeof fields["elapsed_ms"]).toBe("number"); + expect(() => RetryLogFieldsSchema.parse(fields)).not.toThrow(); + }); + + it("emits retry.exhausted (error level) after the final attempt fails", async () => { + const log = makeSilentLogger(); + const op = mock(() => Promise.reject(new Error("always fails"))); + + let thrown = ""; + try { + await retryWithBackoff(op, { + maxAttempts: 3, + initialDelayMs: 1, + log, + op: "test.exhausted", + }); + } catch (e) { + thrown = e instanceof Error ? e.message : String(e); + } + expect(thrown).toBe("always fails"); + + expect(log.error).toHaveBeenCalledTimes(1); + const fields = structuredFields(log.error.mock.calls[0] ?? []); + expect(fields).toMatchObject({ + event: RETRY_LOG_EVENTS.exhausted, + op: "test.exhausted", + attempt: 3, + max_attempts: 3, + }); + expect(typeof fields["elapsed_ms"]).toBe("number"); + expect(() => RetryLogFieldsSchema.parse(fields)).not.toThrow(); + + // Final attempt_failed line carries NO delay_ms (no sleep will follow). + expect(log.warn).toHaveBeenCalledTimes(3); + const finalWarn = structuredFields(log.warn.mock.calls[2] ?? []); + expect(finalWarn["event"]).toBe(RETRY_LOG_EVENTS.attemptFailed); + expect(finalWarn["attempt"]).toBe(3); + expect(finalWarn["delay_ms"]).toBeUndefined(); + expect(() => RetryLogFieldsSchema.parse(finalWarn)).not.toThrow(); + }); + + it("defaults op to 'unknown' when the caller has not threaded one", async () => { + const log = makeSilentLogger(); + const op = mock(() => Promise.reject(new Error("always fails"))); + let thrown: unknown; + try { + await retryWithBackoff(op, { maxAttempts: 1, initialDelayMs: 1, log }); + } catch (e) { + thrown = e; + } + expect(thrown).toBeInstanceOf(Error); + expect(log.error).toHaveBeenCalledTimes(1); + const fields = structuredFields(log.error.mock.calls[0] ?? []); + expect(fields["op"]).toBe("unknown"); + }); +}); From 1181d777ecca94a907b008dc8e5a94bfdcefdd3f Mon Sep 17 00:00:00 2001 From: "chrisleekr-bot[bot]" <2890262+chrisleekr-bot[bot]@users.noreply.github.com> Date: Thu, 11 Jun 2026 13:22:08 +0000 Subject: [PATCH 2/3] feat(observability): thread retry op tag through 12 call sites (#215) Pass `op` to every `retryWithBackoff` call so operators can break the retry rate down per upstream call site. Includes pipeline tracking-comment paths, GitHub state-fetchers, fetcher review followups, ship probe, and four MCP servers. Document the new events + fields in `docs/operate/observability.md`. Co-Authored-By: Claude Opus 4.7 --- docs/operate/observability.md | 18 ++++++++++++++ src/core/fetcher.ts | 2 +- src/core/pipeline.ts | 4 ++++ src/github/state-fetchers.ts | 14 +++++------ src/mcp/servers/comment.ts | 2 +- src/mcp/servers/inline-comment.ts | 3 ++- src/mcp/servers/merge-readiness.ts | 2 +- src/mcp/servers/resolve-review-thread.ts | 4 ++-- src/workflows/ship/probe.ts | 30 ++++++++++++++---------- 9 files changed, 53 insertions(+), 26 deletions(-) diff --git a/docs/operate/observability.md b/docs/operate/observability.md index 7be109a..2f1b6e4 100644 --- a/docs/operate/observability.md +++ b/docs/operate/observability.md @@ -38,6 +38,11 @@ The crash path is covered too. `installFatalHandlers(processName)` in `src/logge | `stage` | Pipeline stage name on a `pipeline.stage` event (e.g. `github.fetch`, `repo.clone`, `executor.invoke`). | | `delta_ms` | Wall-clock of a single `pipeline.stage` (integer ms). | | `pipeline_wall_clock_ms` | Cumulative pipeline duration, on the terminal `pipeline.completed` / `pipeline.failed` line (integer ms). | +| `op` | Short dotted identifier on a `retry.*` event identifying the wrapped call site (e.g. `github.fetch`, `mcp.comment.update`). See [Retry log fields](#retry-log-fields). | +| `attempt` | 1-based attempt ordinal on a `retry.*` event. Aligns with the OpenTelemetry `http.request.resend_count` semantic. | +| `max_attempts` | Maximum-attempts ceiling for the current `retryWithBackoff` call. | +| `elapsed_ms` | Wall-clock since `retryWithBackoff` entry on a `retry.*` event (integer ms). On `retry.exhausted` it carries the full retry-window duration. | +| `delay_ms` | Next backoff that will be slept after a `retry.attempt_failed`. Omitted on the final attempt because no sleep follows. | ## Core pipeline log fields @@ -74,6 +79,19 @@ Volume policy: the per-request line is `debug` (default `info` stays quiet on a | `github.api.rate_limit_low` | warn | Same fields; emitted once `rate_limit_remaining` drops below `RATE_LIMIT_LOW_WATER` (500). | | `github.api.rate_limit_warning` | warn | `route`, `status`, `retry_after_s`; on a 429 or 403 secondary-rate-limit response. | +## Retry log fields + +`retryWithBackoff` (`src/utils/retry.ts`) is the single chokepoint guarding every transient-failure recovery in the bot, including the GraphQL fetcher, every GitHub-touching MCP server, the orchestrator triage probe, the ship probe, and the pipeline's three GitHub writes. Its four-event family is pinned by the `.strict()` Zod schema in `src/utils/retry-log-fields.ts` so an emitter that adds an unpinned field or mistypes one (e.g. `delayMs` vs `delay_ms`) trips the co-located test. Every event carries `op` (a short dotted identifier from the call site), `attempt`, `max_attempts`, and `elapsed_ms`. See [Common log fields](#common-log-fields) for the scalar field definitions. + +The load-bearing event is `retry.succeeded_after_retry`: it is the only signal in the bot's telemetry today that an upstream is starting to wobble _before_ full failure. A 1% transient-failure rate against GitHub or Bedrock is invisible at the default `info` level otherwise, the only retry telemetry is warn/error, which fires only on the long tail. The `info`-level `succeeded_after_retry` line makes the body of the transient-failure distribution observable; an alert on `count(event = "retry.succeeded_after_retry") by op` over 5-minute windows surfaces the leading indicator without waiting for the warn-level `attempt_failed` count to spike. + +| `event` | Level | When | +| ----------------------------- | ----- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `retry.attempt_failed` | warn | An attempt threw a retriable error (5xx, 429, or 403 secondary rate-limit). Carries `delay_ms` when another attempt will follow; omitted on the final attempt. | +| `retry.non_retriable` | warn | A 4xx (except 429 and 403 secondary rate-limit) bypasses retry. Carries `status` and rethrows. | +| `retry.exhausted` | error | All `max_attempts` attempts failed. Carries the full retry-window `elapsed_ms`; rethrows the last error. | +| `retry.succeeded_after_retry` | info | The call succeeded on `attempt > 1`. Weak-flake leading indicator: gated on `attempt > 1` so first-try successes stay silent. Alert on `count(...) by op` over 5-minute windows. | + ## Output secret-guard log events `safePostToGitHub` (`src/utils/github-output-guard.ts`) is the output-side chokepoint for every byte sent to GitHub. It emits structured `warn`/`error` events when the regex pass or the optional LLM scanner acts on a body. Per the logging contract, none of these carry the matched bytes, surrounding context, or a hash, only `kinds`, counts, lengths, `callsite`, and `deliveryId`. diff --git a/src/core/fetcher.ts b/src/core/fetcher.ts index 19d67eb..0d9a2d1 100644 --- a/src/core/fetcher.ts +++ b/src/core/fetcher.ts @@ -441,7 +441,7 @@ async function fetchRemainingReviewComments( reviewId, cursor: startCursor, }), - { log }, + { log, op: "github.review.followup" }, ); return result.node?.comments.nodes ?? []; } catch (err) { diff --git a/src/core/pipeline.ts b/src/core/pipeline.ts index 74a3114..9d7a98a 100644 --- a/src/core/pipeline.ts +++ b/src/core/pipeline.ts @@ -304,6 +304,7 @@ export async function runPipeline( maxAttempts: 3, initialDelayMs: 1000, log: ctx.log, + op: "trackingComment.create", }), ); } @@ -318,6 +319,7 @@ export async function runPipeline( maxAttempts: 3, initialDelayMs: 2000, log: ctx.log, + op: "github.fetch", }), ); @@ -468,6 +470,7 @@ export async function runPipeline( maxAttempts: 3, initialDelayMs: 1000, log: enrichedCtx.log, + op: "trackingComment.finalize", }, ), ); @@ -587,6 +590,7 @@ export async function runPipeline( maxAttempts: 3, initialDelayMs: 1000, log: ctx.log, + op: "trackingComment.finalize.failure", }, ); } catch (commentError) { diff --git a/src/github/state-fetchers.ts b/src/github/state-fetchers.ts index ce128cd..02b8cca 100644 --- a/src/github/state-fetchers.ts +++ b/src/github/state-fetchers.ts @@ -118,7 +118,7 @@ export async function getPrStateCheckRollup( repo: deps.repo, number: prNumber, }), - { log: deps.log }, + { log: deps.log, op: "github.state.prStateCheckRollup" }, ); const pr = data.repository?.pullRequest; if (pr === null || pr === undefined) { @@ -187,7 +187,7 @@ export async function getCheckRunOutput( repo: deps.repo, check_run_id: checkRunId, }), - { log: deps.log }, + { log: deps.log, op: "github.state.checkRunOutput" }, ); const text = result.data.output.text ?? ""; const truncated = truncate(text, MAX_CHECK_OUTPUT_BYTES); @@ -216,7 +216,7 @@ export async function getWorkflowRun(deps: GithubStateDeps, runId: number): Prom repo: deps.repo, run_id: runId, }), - { log: deps.log }, + { log: deps.log, op: "github.state.workflowRun" }, ); return serialize({ id: result.data.id, @@ -261,7 +261,7 @@ export async function getBranchProtection(deps: GithubStateDeps, branch: string) throw err; } }, - { log: deps.log }, + { log: deps.log, op: "github.state.branchProtection" }, ); if (result === null) { return serialize({ branch, protected: false }); @@ -285,7 +285,7 @@ export async function getPrDiff(deps: GithubStateDeps, prNumber: number): Promis pull_number: prNumber, mediaType: { format: "diff" }, }), - { log: deps.log }, + { log: deps.log, op: "github.state.prDiff" }, ); const diff = typeof result.data === "string" ? result.data : JSON.stringify(result.data); const { text, truncated } = truncate(diff, MAX_DIFF_BYTES); @@ -309,7 +309,7 @@ export async function getPrFiles(deps: GithubStateDeps, prNumber: number): Promi pull_number: prNumber, per_page: 100, }), - { log: deps.log }, + { log: deps.log, op: "github.state.prFiles" }, ); return serialize({ pr_number: prNumber, @@ -340,7 +340,7 @@ export async function listPrComments( per_page: 30, page, }), - { log: deps.log }, + { log: deps.log, op: "github.state.prComments" }, ); const comments = result.data.map((c) => ({ id: c.id, diff --git a/src/mcp/servers/comment.ts b/src/mcp/servers/comment.ts index 42a340d..fdf40a8 100644 --- a/src/mcp/servers/comment.ts +++ b/src/mcp/servers/comment.ts @@ -113,7 +113,7 @@ server.tool( comment_id: commentId, body: bodyWithMarker, }), - { log }, + { log, op: "mcp.comment.update" }, ); return { diff --git a/src/mcp/servers/inline-comment.ts b/src/mcp/servers/inline-comment.ts index 88dae0d..f3ed3ea 100644 --- a/src/mcp/servers/inline-comment.ts +++ b/src/mcp/servers/inline-comment.ts @@ -115,7 +115,7 @@ server.tool( repo: REPO_NAME, pull_number, }), - { log }, + { log, op: "mcp.inlineComment.fetchPr" }, ); commitSha = pr.data.head.sha; } @@ -140,6 +140,7 @@ server.tool( const result = await retryWithBackoff(() => octokit.rest.pulls.createReviewComment(params), { log, + op: "mcp.inlineComment.createReviewComment", }); return { diff --git a/src/mcp/servers/merge-readiness.ts b/src/mcp/servers/merge-readiness.ts index 3c53cca..3d0ff01 100644 --- a/src/mcp/servers/merge-readiness.ts +++ b/src/mcp/servers/merge-readiness.ts @@ -87,7 +87,7 @@ server.tool( repo: REPO_NAME, number: pr_number, }), - { log }, + { log, op: "mcp.mergeReadiness.probe" }, ); // The probe fetches the first 100 review threads only, sufficient for // a scheduled action's own freshly-created PR. botPushedShas is empty: diff --git a/src/mcp/servers/resolve-review-thread.ts b/src/mcp/servers/resolve-review-thread.ts index 88b5a67..cb2b4ea 100644 --- a/src/mcp/servers/resolve-review-thread.ts +++ b/src/mcp/servers/resolve-review-thread.ts @@ -172,7 +172,7 @@ server.tool( // still see prompt failures. const preflight = await retryWithBackoff( () => octokit.graphql(GET_THREAD_QUERY, { threadId: thread_id }), - { log }, + { log, op: "mcp.resolveReviewThread.preflight" }, ); const preflightPr = preflight.node?.pullRequest.number; const preflightRepo = preflight.node?.pullRequest.repository; @@ -208,7 +208,7 @@ server.tool( const result = await retryWithBackoff( () => octokit.graphql(RESOLVE_MUTATION, { threadId: thread_id }), - { log }, + { log, op: "mcp.resolveReviewThread.mutate" }, ); const thread = result.resolveReviewThread.thread; diff --git a/src/workflows/ship/probe.ts b/src/workflows/ship/probe.ts index 689042a..8ab8eb6 100644 --- a/src/workflows/ship/probe.ts +++ b/src/workflows/ship/probe.ts @@ -90,13 +90,15 @@ async function paginateReviewThreads( let cursor: string | null = startCursor; while (cursor !== null) { // eslint-disable-next-line no-await-in-loop -- pagination is inherently sequential - const page = await retryWithBackoff(() => - octokit.graphql(REVIEW_THREADS_PAGE_QUERY, { - owner: args.owner, - repo: args.repo, - number: args.pr_number, - cursor, - }), + const page = await retryWithBackoff( + () => + octokit.graphql(REVIEW_THREADS_PAGE_QUERY, { + owner: args.owner, + repo: args.repo, + number: args.pr_number, + cursor, + }), + { op: "ship.probe.reviewThreads" }, ); const rt = page.repository?.pullRequest?.reviewThreads; if (rt === undefined) break; @@ -159,12 +161,14 @@ export async function runProbe(input: RunProbeInput): Promise { let lastResponse: ProbeResponseShape | null = null; for (let attempt = 0; attempt <= backoff.length; attempt += 1) { // eslint-disable-next-line no-await-in-loop -- attempts are inherently sequential per FR-021 - const response = await retryWithBackoff(() => - input.octokit.graphql(PROBE_QUERY, { - owner: input.owner, - repo: input.repo, - number: input.pr_number, - }), + const response = await retryWithBackoff( + () => + input.octokit.graphql(PROBE_QUERY, { + owner: input.owner, + repo: input.repo, + number: input.pr_number, + }), + { op: "ship.probe.main" }, ); lastResponse = response; const mergeable = response.repository?.pullRequest?.mergeable ?? null; From 8599157d89f70599f9ec2cb7fab9c8a81bd59012 Mon Sep 17 00:00:00 2001 From: "chrisleekr-bot[bot]" <2890262+chrisleekr-bot[bot]@users.noreply.github.com> Date: Thu, 11 Jun 2026 13:51:13 +0000 Subject: [PATCH 3/3] fix(observability): tighten retry.* contract + harmonise op-tag convention (#225) Address PR review pushback on #225: - retry.ts: normalize op via a small normalizeOp(op) helper so empty / whitespace-only / non-string op falls back to "unknown", holding the op: z.string().min(1) contract regardless of caller. - retry.ts: always read status from the raw error and spread it into the retry.attempt_failed payload (was missing); operators can now slice transient-failure rate by HTTP status without parsing err. - retry-log-fields.ts: refactor to z.discriminatedUnion of strictObject branches so per-event field presence is pinned, non_retriable requires status, only attempt_failed may carry delay_ms, and exhausted / succeeded_after_retry carry neither. Adopts Zod v4 idiomatic z.strictObject already used elsewhere in the repo. - Rename camelCase op-tag segments to snake_case (12 sites in pipeline, state-fetchers, ship probe, and three MCP servers) so all 20 op tags follow lowercase-dotted segments. Documented under "Retry log fields" in observability.md. - docs/operate/observability.md: add status row, document op convention, and refresh the retry-event table to reflect per-event field constraints. - docs/use/workflows/ship.md: note that ship probe wraps its GraphQL calls in retryWithBackoff with ship.probe.main / ship.probe.review_threads op tags (satisfies docs-sync guard for FR-019). - Tests: +6 schema cases for per-event field constraints, +5 behaviour cases for op normalisation and attempt_failed status parity. Co-authored-by: chrisleekr-bot[bot] --- docs/operate/observability.md | 19 ++--- docs/use/workflows/ship.md | 2 + src/core/pipeline.ts | 6 +- src/github/state-fetchers.ts | 14 ++-- src/mcp/servers/inline-comment.ts | 4 +- src/mcp/servers/merge-readiness.ts | 2 +- src/mcp/servers/resolve-review-thread.ts | 4 +- src/utils/retry-log-fields.ts | 82 ++++++++++++++++----- src/utils/retry.ts | 29 ++++++-- src/workflows/ship/probe.ts | 2 +- test/utils/retry-log-fields.test.ts | 74 +++++++++++++++++++ test/utils/retry.test.ts | 92 ++++++++++++++++++++++++ 12 files changed, 284 insertions(+), 46 deletions(-) diff --git a/docs/operate/observability.md b/docs/operate/observability.md index 2f1b6e4..3216a06 100644 --- a/docs/operate/observability.md +++ b/docs/operate/observability.md @@ -38,11 +38,12 @@ The crash path is covered too. `installFatalHandlers(processName)` in `src/logge | `stage` | Pipeline stage name on a `pipeline.stage` event (e.g. `github.fetch`, `repo.clone`, `executor.invoke`). | | `delta_ms` | Wall-clock of a single `pipeline.stage` (integer ms). | | `pipeline_wall_clock_ms` | Cumulative pipeline duration, on the terminal `pipeline.completed` / `pipeline.failed` line (integer ms). | -| `op` | Short dotted identifier on a `retry.*` event identifying the wrapped call site (e.g. `github.fetch`, `mcp.comment.update`). See [Retry log fields](#retry-log-fields). | +| `op` | Short dotted identifier on a `retry.*` event identifying the wrapped call site (e.g. `github.fetch`, `mcp.comment.update`). Lowercase-dotted segments with `snake_case` inside each segment, see [Retry log fields](#retry-log-fields) for the naming convention. | | `attempt` | 1-based attempt ordinal on a `retry.*` event. Aligns with the OpenTelemetry `http.request.resend_count` semantic. | | `max_attempts` | Maximum-attempts ceiling for the current `retryWithBackoff` call. | | `elapsed_ms` | Wall-clock since `retryWithBackoff` entry on a `retry.*` event (integer ms). On `retry.exhausted` it carries the full retry-window duration. | | `delay_ms` | Next backoff that will be slept after a `retry.attempt_failed`. Omitted on the final attempt because no sleep follows. | +| `status` | HTTP status code lifted from the raw error. Present on `retry.non_retriable` (always, since the branch only fires for 4xx) and on `retry.attempt_failed` when the failure carried one (e.g. 503, 429); also on `github.api.*` events. | ## Core pipeline log fields @@ -81,16 +82,18 @@ Volume policy: the per-request line is `debug` (default `info` stays quiet on a ## Retry log fields -`retryWithBackoff` (`src/utils/retry.ts`) is the single chokepoint guarding every transient-failure recovery in the bot, including the GraphQL fetcher, every GitHub-touching MCP server, the orchestrator triage probe, the ship probe, and the pipeline's three GitHub writes. Its four-event family is pinned by the `.strict()` Zod schema in `src/utils/retry-log-fields.ts` so an emitter that adds an unpinned field or mistypes one (e.g. `delayMs` vs `delay_ms`) trips the co-located test. Every event carries `op` (a short dotted identifier from the call site), `attempt`, `max_attempts`, and `elapsed_ms`. See [Common log fields](#common-log-fields) for the scalar field definitions. +`retryWithBackoff` (`src/utils/retry.ts`) is the single chokepoint guarding every transient-failure recovery in the bot, including the GraphQL fetcher, every GitHub-touching MCP server, the orchestrator triage probe, the ship probe, and the pipeline's three GitHub writes. Its four-event family is pinned by a `z.discriminatedUnion` of strict objects in `src/utils/retry-log-fields.ts` so an emitter that adds an unpinned field, mistypes one (e.g. `delayMs` vs `delay_ms`), or attaches a field to the wrong event (e.g. `delay_ms` on `retry.exhausted`, `status` on `retry.succeeded_after_retry`) trips the co-located test. Every event carries `op` (a short dotted identifier from the call site), `attempt`, `max_attempts`, and `elapsed_ms`. See [Common log fields](#common-log-fields) for the scalar field definitions. + +**`op` naming convention.** Lowercase-dotted segments with `snake_case` inside each segment, e.g. `mcp.inline_comment.fetch_pr`, `github.state.pr_state_check_rollup`, `tracking_comment.create`. This matches the `pipeline.stage` event-name style already used elsewhere in this doc and keeps `op =~ "mcp\\..*"`-style operator groupings regular as new call sites are threaded. New `retryWithBackoff` call sites should follow it. Empty / whitespace-only `op` is normalised to `"unknown"` at the entry of `retryWithBackoff` so the non-empty `op` contract holds even when a caller threads an unexpected value. The load-bearing event is `retry.succeeded_after_retry`: it is the only signal in the bot's telemetry today that an upstream is starting to wobble _before_ full failure. A 1% transient-failure rate against GitHub or Bedrock is invisible at the default `info` level otherwise, the only retry telemetry is warn/error, which fires only on the long tail. The `info`-level `succeeded_after_retry` line makes the body of the transient-failure distribution observable; an alert on `count(event = "retry.succeeded_after_retry") by op` over 5-minute windows surfaces the leading indicator without waiting for the warn-level `attempt_failed` count to spike. -| `event` | Level | When | -| ----------------------------- | ----- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `retry.attempt_failed` | warn | An attempt threw a retriable error (5xx, 429, or 403 secondary rate-limit). Carries `delay_ms` when another attempt will follow; omitted on the final attempt. | -| `retry.non_retriable` | warn | A 4xx (except 429 and 403 secondary rate-limit) bypasses retry. Carries `status` and rethrows. | -| `retry.exhausted` | error | All `max_attempts` attempts failed. Carries the full retry-window `elapsed_ms`; rethrows the last error. | -| `retry.succeeded_after_retry` | info | The call succeeded on `attempt > 1`. Weak-flake leading indicator: gated on `attempt > 1` so first-try successes stay silent. Alert on `count(...) by op` over 5-minute windows. | +| `event` | Level | When | +| ----------------------------- | ----- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `retry.attempt_failed` | warn | An attempt threw a retriable error (5xx, 429, or 403 secondary rate-limit). Carries `delay_ms` when another attempt will follow (omitted on the final attempt) and `status` when the underlying error carried one (HTTP errors; absent for non-HTTP errors like connection resets). | +| `retry.non_retriable` | warn | A 4xx (except 429 and 403 secondary rate-limit) bypasses retry. Always carries `status` (the branch only fires when the raw error carried a 4xx) and rethrows. | +| `retry.exhausted` | error | All `max_attempts` attempts failed. Carries the full retry-window `elapsed_ms`; rethrows the last error. Neither `delay_ms` nor `status` are emitted on this event. | +| `retry.succeeded_after_retry` | info | The call succeeded on `attempt > 1`. Weak-flake leading indicator: gated on `attempt > 1` so first-try successes stay silent. Alert on `count(...) by op` over 5-minute windows. Neither `delay_ms` nor `status` are emitted on this event. | ## Output secret-guard log events diff --git a/docs/use/workflows/ship.md b/docs/use/workflows/ship.md index 1164206..3d7dc00 100644 --- a/docs/use/workflows/ship.md +++ b/docs/use/workflows/ship.md @@ -112,6 +112,8 @@ Other non-readiness reasons (`failing_checks`, `behind_base`, `pending_checks`, The `chat-thread` executor's GitHub-state tool calls (CI rollup, check output, branch protection, PR diff/files, comments) go through `dispatchGithubStateTool`, whose fetchers wrap every octokit call in `retryWithBackoff`. A transient GitHub API blip (5xx, 429, or a secondary rate limit) is retried up to three times with exponential backoff and a deliveryId-correlated retry-warning log, instead of surfacing to the model as a tool error it would have to recover from semantically (issue #199). +The merge-readiness probe (`src/workflows/ship/probe.ts`) follows the same pattern. Both its main `PROBE_QUERY` call and the `paginateReviewThreads` follow-up are wrapped in `retryWithBackoff`, carrying `op: "ship.probe.main"` and `op: "ship.probe.review_threads"` respectively so probe-side retries can be sliced out from the surrounding fleet in observability dashboards (see [Retry log fields](../../operate/observability.md#retry-log-fields)). A single GraphQL rate-limit or network blip is recovered in-place rather than tearing down the verdict and yielding the session. + ## Re-triggering Re-applying the `bot:ship` label or re-commenting `bot:ship` on the same PR while a session is **active** is a no-op. Re-applying after the session is **terminal** starts a fresh session: the prior `ship_intents` row is preserved for audit. diff --git a/src/core/pipeline.ts b/src/core/pipeline.ts index 9d7a98a..49e2c07 100644 --- a/src/core/pipeline.ts +++ b/src/core/pipeline.ts @@ -304,7 +304,7 @@ export async function runPipeline( maxAttempts: 3, initialDelayMs: 1000, log: ctx.log, - op: "trackingComment.create", + op: "tracking_comment.create", }), ); } @@ -470,7 +470,7 @@ export async function runPipeline( maxAttempts: 3, initialDelayMs: 1000, log: enrichedCtx.log, - op: "trackingComment.finalize", + op: "tracking_comment.finalize", }, ), ); @@ -590,7 +590,7 @@ export async function runPipeline( maxAttempts: 3, initialDelayMs: 1000, log: ctx.log, - op: "trackingComment.finalize.failure", + op: "tracking_comment.finalize.failure", }, ); } catch (commentError) { diff --git a/src/github/state-fetchers.ts b/src/github/state-fetchers.ts index 02b8cca..5dd4149 100644 --- a/src/github/state-fetchers.ts +++ b/src/github/state-fetchers.ts @@ -118,7 +118,7 @@ export async function getPrStateCheckRollup( repo: deps.repo, number: prNumber, }), - { log: deps.log, op: "github.state.prStateCheckRollup" }, + { log: deps.log, op: "github.state.pr_state_check_rollup" }, ); const pr = data.repository?.pullRequest; if (pr === null || pr === undefined) { @@ -187,7 +187,7 @@ export async function getCheckRunOutput( repo: deps.repo, check_run_id: checkRunId, }), - { log: deps.log, op: "github.state.checkRunOutput" }, + { log: deps.log, op: "github.state.check_run_output" }, ); const text = result.data.output.text ?? ""; const truncated = truncate(text, MAX_CHECK_OUTPUT_BYTES); @@ -216,7 +216,7 @@ export async function getWorkflowRun(deps: GithubStateDeps, runId: number): Prom repo: deps.repo, run_id: runId, }), - { log: deps.log, op: "github.state.workflowRun" }, + { log: deps.log, op: "github.state.workflow_run" }, ); return serialize({ id: result.data.id, @@ -261,7 +261,7 @@ export async function getBranchProtection(deps: GithubStateDeps, branch: string) throw err; } }, - { log: deps.log, op: "github.state.branchProtection" }, + { log: deps.log, op: "github.state.branch_protection" }, ); if (result === null) { return serialize({ branch, protected: false }); @@ -285,7 +285,7 @@ export async function getPrDiff(deps: GithubStateDeps, prNumber: number): Promis pull_number: prNumber, mediaType: { format: "diff" }, }), - { log: deps.log, op: "github.state.prDiff" }, + { log: deps.log, op: "github.state.pr_diff" }, ); const diff = typeof result.data === "string" ? result.data : JSON.stringify(result.data); const { text, truncated } = truncate(diff, MAX_DIFF_BYTES); @@ -309,7 +309,7 @@ export async function getPrFiles(deps: GithubStateDeps, prNumber: number): Promi pull_number: prNumber, per_page: 100, }), - { log: deps.log, op: "github.state.prFiles" }, + { log: deps.log, op: "github.state.pr_files" }, ); return serialize({ pr_number: prNumber, @@ -340,7 +340,7 @@ export async function listPrComments( per_page: 30, page, }), - { log: deps.log, op: "github.state.prComments" }, + { log: deps.log, op: "github.state.pr_comments" }, ); const comments = result.data.map((c) => ({ id: c.id, diff --git a/src/mcp/servers/inline-comment.ts b/src/mcp/servers/inline-comment.ts index f3ed3ea..99e4ed2 100644 --- a/src/mcp/servers/inline-comment.ts +++ b/src/mcp/servers/inline-comment.ts @@ -115,7 +115,7 @@ server.tool( repo: REPO_NAME, pull_number, }), - { log, op: "mcp.inlineComment.fetchPr" }, + { log, op: "mcp.inline_comment.fetch_pr" }, ); commitSha = pr.data.head.sha; } @@ -140,7 +140,7 @@ server.tool( const result = await retryWithBackoff(() => octokit.rest.pulls.createReviewComment(params), { log, - op: "mcp.inlineComment.createReviewComment", + op: "mcp.inline_comment.create_review_comment", }); return { diff --git a/src/mcp/servers/merge-readiness.ts b/src/mcp/servers/merge-readiness.ts index 3d0ff01..0da673c 100644 --- a/src/mcp/servers/merge-readiness.ts +++ b/src/mcp/servers/merge-readiness.ts @@ -87,7 +87,7 @@ server.tool( repo: REPO_NAME, number: pr_number, }), - { log, op: "mcp.mergeReadiness.probe" }, + { log, op: "mcp.merge_readiness.probe" }, ); // The probe fetches the first 100 review threads only, sufficient for // a scheduled action's own freshly-created PR. botPushedShas is empty: diff --git a/src/mcp/servers/resolve-review-thread.ts b/src/mcp/servers/resolve-review-thread.ts index cb2b4ea..48f44e3 100644 --- a/src/mcp/servers/resolve-review-thread.ts +++ b/src/mcp/servers/resolve-review-thread.ts @@ -172,7 +172,7 @@ server.tool( // still see prompt failures. const preflight = await retryWithBackoff( () => octokit.graphql(GET_THREAD_QUERY, { threadId: thread_id }), - { log, op: "mcp.resolveReviewThread.preflight" }, + { log, op: "mcp.resolve_review_thread.preflight" }, ); const preflightPr = preflight.node?.pullRequest.number; const preflightRepo = preflight.node?.pullRequest.repository; @@ -208,7 +208,7 @@ server.tool( const result = await retryWithBackoff( () => octokit.graphql(RESOLVE_MUTATION, { threadId: thread_id }), - { log, op: "mcp.resolveReviewThread.mutate" }, + { log, op: "mcp.resolve_review_thread.mutate" }, ); const thread = result.resolveReviewThread.thread; diff --git a/src/utils/retry-log-fields.ts b/src/utils/retry-log-fields.ts index 38c70c7..03945b7 100644 --- a/src/utils/retry-log-fields.ts +++ b/src/utils/retry-log-fields.ts @@ -2,7 +2,7 @@ * Canonical pino log-field schema for `retryWithBackoff` observability (issue #215). * * Mirrors `src/utils/octokit-observability.ts` and `src/core/log-fields.ts`: a - * `.strict()` Zod shape pins the structured `retry.*` event family so the four + * strict Zod shape pins the structured `retry.*` event family so the four * emit sites in `retryWithBackoff` cannot drift on a field name (e.g. `delayMs` * vs `delay_ms`) without the co-located test catching it. Emitters log plain * objects via `log.warn` / `log.info` / `log.error`; the schema is the @@ -15,6 +15,19 @@ * backoff that will be slept (set on `retry.attempt_failed` only when another * attempt will follow; omitted on the final attempt because no sleep will * occur). + * + * The schema is a `z.discriminatedUnion` on `event` so per-event field + * presence is pinned: `retry.non_retriable` requires `status` (the 4xx branch + * always has it), only `retry.attempt_failed` may carry `delay_ms`, and + * `retry.exhausted` / `retry.succeeded_after_retry` carry neither. Future + * emitter changes that drop a required field or attach `delay_ms` to the + * wrong event trip the co-located test. + * + * `op` follows lowercase-dotted segments with snake_case inside each segment + * (e.g. `mcp.inline_comment.fetch_pr`, `github.state.pr_state_check_rollup`, + * `tracking_comment.create`). The convention is documented under "Retry log + * fields" in `docs/operate/observability.md`; new call sites should follow it + * so future operator queries (`op =~ "mcp\\..*"`) stay regular. */ import { z } from "zod"; @@ -26,27 +39,60 @@ export const RETRY_LOG_EVENTS = { } as const; /** - * `.strict()` shape for the four-event family. `err` is intentionally not - * enumerated here: it is serialized via pino's existing `errSerializer` and - * remains attached to the line as-is, mirroring the existing + * Shared scalar shape across every `retry.*` event. `err` is intentionally + * not enumerated here: it is serialized via pino's existing `errSerializer` + * and remains attached to the line as-is, mirroring the existing * `GithubApiLogFieldsSchema` pattern (which also does not enumerate `err`). * Callers test the structured scalar fields; pino serializes the error. */ -export const RetryLogFieldsSchema = z - .object({ - event: z.enum([ - RETRY_LOG_EVENTS.attemptFailed, - RETRY_LOG_EVENTS.nonRetriable, - RETRY_LOG_EVENTS.exhausted, - RETRY_LOG_EVENTS.succeededAfterRetry, - ]), - op: z.string().min(1), - attempt: z.number().int().positive(), - max_attempts: z.number().int().positive(), - elapsed_ms: z.number().int().nonnegative(), +const baseFields = { + op: z.string().min(1), + attempt: z.number().int().positive(), + max_attempts: z.number().int().positive(), + elapsed_ms: z.number().int().nonnegative(), +} as const; + +export const RetryLogFieldsSchema = z.discriminatedUnion("event", [ + /** + * Warn-level emit on a retriable failure. `delay_ms` is present when + * another attempt will follow (and equals the next backoff sleep); + * omitted on the final attempt because no sleep occurs. `status` is + * present when the raw error carried one (HTTP errors) and absent for + * non-HTTP errors like connection resets. + */ + z.strictObject({ + event: z.literal(RETRY_LOG_EVENTS.attemptFailed), + ...baseFields, delay_ms: z.number().int().nonnegative().optional(), status: z.number().int().optional(), - }) - .strict(); + }), + /** + * Warn-level emit on a 4xx (except 429 and 403 secondary-rate-limit) that + * bypasses retry. `status` is always present in this branch because + * `isNonRetriable` only returns `true` when a numeric 4xx status was + * read off the error. + */ + z.strictObject({ + event: z.literal(RETRY_LOG_EVENTS.nonRetriable), + ...baseFields, + status: z.number().int(), + }), + /** + * Error-level emit after `maxAttempts` retriable failures. No `delay_ms` + * (no further sleep), no `status` (last-error context lives on `err`). + */ + z.strictObject({ + event: z.literal(RETRY_LOG_EVENTS.exhausted), + ...baseFields, + }), + /** + * Info-level emit when an attempt succeeded after at least one prior + * failure. Gated on `attempt > 1` so first-try successes stay silent. + */ + z.strictObject({ + event: z.literal(RETRY_LOG_EVENTS.succeededAfterRetry), + ...baseFields, + }), +]); export type RetryLogFields = z.infer; diff --git a/src/utils/retry.ts b/src/utils/retry.ts index 5e72452..69b4146 100644 --- a/src/utils/retry.ts +++ b/src/utils/retry.ts @@ -98,8 +98,8 @@ export async function retryWithBackoff( maxDelayMs = 20000, backoffFactor = 2, log = defaultLog, - op = "unknown", } = options; + const op = normalizeOp(options.op); // Fail fast on invalid input. Each check names the offending option and // value. Without these guards, NaN/Infinity/below-min values could bypass @@ -137,9 +137,14 @@ export async function retryWithBackoff( } catch (error) { lastError = error instanceof Error ? error : new Error(String(error)); const elapsedMs = Date.now() - startedAt; + // Read once; spread into both emit paths so an operator graphing + // transient-failure rate can break `attempt_failed` down by HTTP + // status (e.g. 503 surge vs 429 surge) without parsing the `err` + // serializer payload. Optional in the schema so undefined just + // omits the key. + const status = (error as { status?: number }).status; if (isNonRetriable(error, lastError)) { - const status = (error as { status?: number }).status; log.warn( { event: RETRY_LOG_EVENTS.nonRetriable, @@ -156,8 +161,11 @@ export async function retryWithBackoff( } // Compute the next delay BEFORE the emit so the line carries the delay - // that will actually be slept. Omit the field on the final attempt + // that will actually be slept. Omit `delay_ms` on the final attempt // because no sleep will occur (the loop falls through to `exhausted`). + // pino serialises objects via JSON.stringify so undefined-valued keys + // are dropped from the emitted line, matching the schema's + // .optional() shape for both `delay_ms` and `status`. const willRetry = attempt < maxAttempts; log.warn( { @@ -166,7 +174,8 @@ export async function retryWithBackoff( attempt, max_attempts: maxAttempts, elapsed_ms: elapsedMs, - ...(willRetry ? { delay_ms: delayMs } : {}), + delay_ms: willRetry ? delayMs : undefined, + status, err: lastError, }, "Operation attempt failed", @@ -197,6 +206,18 @@ export async function retryWithBackoff( throw lastError!; } +/** + * Coerce `op` to a non-empty identifier so the `op: z.string().min(1)` + * contract pinned in `retry-log-fields.ts` holds even when a caller threads + * an empty / whitespace-only / non-string value. The destructure default + * only fires for literal `undefined`, so `op: ""`, `op: " "`, and a + * stray non-string slip through without this. + */ +function normalizeOp(op: unknown): string { + const trimmed = typeof op === "string" ? op.trim() : ""; + return trimmed.length > 0 ? trimmed : "unknown"; +} + /** * Decide whether an error is a permanent 4xx that should bypass retry. * diff --git a/src/workflows/ship/probe.ts b/src/workflows/ship/probe.ts index 8ab8eb6..1ea2727 100644 --- a/src/workflows/ship/probe.ts +++ b/src/workflows/ship/probe.ts @@ -98,7 +98,7 @@ async function paginateReviewThreads( number: args.pr_number, cursor, }), - { op: "ship.probe.reviewThreads" }, + { op: "ship.probe.review_threads" }, ); const rt = page.repository?.pullRequest?.reviewThreads; if (rt === undefined) break; diff --git a/test/utils/retry-log-fields.test.ts b/test/utils/retry-log-fields.test.ts index ae3c4ea..c185276 100644 --- a/test/utils/retry-log-fields.test.ts +++ b/test/utils/retry-log-fields.test.ts @@ -171,3 +171,77 @@ describe("RetryLogFieldsSchema: rejects field-name drift", () => { expect(result.success).toBe(false); }); }); + +describe("RetryLogFieldsSchema: per-event field constraints (discriminated union)", () => { + it("rejects retry.non_retriable without status (status is required on this branch)", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: RETRY_LOG_EVENTS.nonRetriable, + op: "github.fetch", + attempt: 1, + max_attempts: 3, + elapsed_ms: 12, + }); + expect(result.success).toBe(false); + }); + + it("rejects retry.exhausted with delay_ms (no sleep follows the exhausted emit)", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: RETRY_LOG_EVENTS.exhausted, + op: "github.fetch", + attempt: 3, + max_attempts: 3, + elapsed_ms: 30_000, + delay_ms: 5000, + }); + expect(result.success).toBe(false); + }); + + it("rejects retry.succeeded_after_retry with delay_ms (no retry follows success)", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: RETRY_LOG_EVENTS.succeededAfterRetry, + op: "github.fetch", + attempt: 2, + max_attempts: 3, + elapsed_ms: 5_100, + delay_ms: 5000, + }); + expect(result.success).toBe(false); + }); + + it("rejects retry.succeeded_after_retry with status (only error-bearing events carry status)", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: RETRY_LOG_EVENTS.succeededAfterRetry, + op: "github.fetch", + attempt: 2, + max_attempts: 3, + elapsed_ms: 5_100, + status: 503, + }); + expect(result.success).toBe(false); + }); + + it("accepts retry.attempt_failed with status (5xx transient failure)", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: RETRY_LOG_EVENTS.attemptFailed, + op: "github.fetch", + attempt: 1, + max_attempts: 3, + elapsed_ms: 42, + delay_ms: 5000, + status: 503, + }); + expect(result.success).toBe(true); + }); + + it("accepts retry.attempt_failed without status (non-HTTP error like a connection reset)", () => { + const result = RetryLogFieldsSchema.safeParse({ + event: RETRY_LOG_EVENTS.attemptFailed, + op: "github.fetch", + attempt: 1, + max_attempts: 3, + elapsed_ms: 42, + delay_ms: 5000, + }); + expect(result.success).toBe(true); + }); +}); diff --git a/test/utils/retry.test.ts b/test/utils/retry.test.ts index b156bc2..17ce363 100644 --- a/test/utils/retry.test.ts +++ b/test/utils/retry.test.ts @@ -399,4 +399,96 @@ describe("retryWithBackoff: structured log events (#215)", () => { const fields = structuredFields(log.error.mock.calls[0] ?? []); expect(fields["op"]).toBe("unknown"); }); + + it("normalizes an empty op to 'unknown' (so the non-empty op contract holds)", async () => { + const log = makeSilentLogger(); + const op = mock(() => Promise.reject(new Error("always fails"))); + try { + await retryWithBackoff(op, { maxAttempts: 1, initialDelayMs: 1, log, op: "" }); + } catch { + /* expected */ + } + const fields = structuredFields(log.error.mock.calls[0] ?? []); + expect(fields["op"]).toBe("unknown"); + expect(() => RetryLogFieldsSchema.parse(fields)).not.toThrow(); + }); + + it("normalizes a whitespace-only op to 'unknown'", async () => { + const log = makeSilentLogger(); + const op = mock(() => Promise.reject(new Error("always fails"))); + try { + await retryWithBackoff(op, { maxAttempts: 1, initialDelayMs: 1, log, op: " " }); + } catch { + /* expected */ + } + const fields = structuredFields(log.error.mock.calls[0] ?? []); + expect(fields["op"]).toBe("unknown"); + }); + + it("trims surrounding whitespace from a non-empty op", async () => { + const log = makeSilentLogger(); + const op = mock(() => Promise.reject(new Error("always fails"))); + try { + await retryWithBackoff(op, { + maxAttempts: 1, + initialDelayMs: 1, + log, + op: " github.fetch ", + }); + } catch { + /* expected */ + } + const fields = structuredFields(log.error.mock.calls[0] ?? []); + expect(fields["op"]).toBe("github.fetch"); + }); + + it("includes status on retry.attempt_failed when the error carried one (mirrors non_retriable)", async () => { + const log = makeSilentLogger(); + let calls = 0; + const op = mock(() => { + calls++; + if (calls === 1) return Promise.reject(makeStatusError(503)); + return Promise.resolve("recovered"); + }); + const result = await retryWithBackoff(op, { + maxAttempts: 3, + initialDelayMs: 1, + log, + op: "test.transient5xx", + }); + expect(result).toBe("recovered"); + + expect(log.warn).toHaveBeenCalledTimes(1); + const warnFields = structuredFields(log.warn.mock.calls[0] ?? []); + expect(warnFields).toMatchObject({ + event: RETRY_LOG_EVENTS.attemptFailed, + op: "test.transient5xx", + attempt: 1, + max_attempts: 3, + delay_ms: 1, + status: 503, + }); + expect(() => RetryLogFieldsSchema.parse(warnFields)).not.toThrow(); + }); + + it("omits status on retry.attempt_failed when the error has none (non-HTTP errors)", async () => { + const log = makeSilentLogger(); + let calls = 0; + const op = mock(() => { + calls++; + if (calls === 1) return Promise.reject(new Error("ECONNRESET")); + return Promise.resolve("recovered"); + }); + await retryWithBackoff(op, { + maxAttempts: 3, + initialDelayMs: 1, + log, + op: "test.connreset", + }); + + expect(log.warn).toHaveBeenCalledTimes(1); + const warnFields = structuredFields(log.warn.mock.calls[0] ?? []); + expect(warnFields["status"]).toBeUndefined(); + expect(() => RetryLogFieldsSchema.parse(warnFields)).not.toThrow(); + }); });