From 069c35ad47d7dd728a6553cc4c403cf2ce018dd6 Mon Sep 17 00:00:00 2001 From: neubig <398875+neubig@users.noreply.github.com> Date: Thu, 18 Jun 2026 10:03:44 -0400 Subject: [PATCH 1/2] Retry transient backend disconnects --- src/hooks/query/use-backends-health.test.ts | 97 +++++++++++++++++++++ src/hooks/query/use-backends-health.ts | 80 ++++++++++++++++- src/hooks/query/use-config.test.ts | 39 +++++++++ src/hooks/query/use-config.ts | 35 +++++++- src/i18n/translation.json | 34 ++++++++ src/root.tsx | 27 +++++- 6 files changed, 304 insertions(+), 8 deletions(-) create mode 100644 src/hooks/query/use-backends-health.test.ts create mode 100644 src/hooks/query/use-config.test.ts diff --git a/src/hooks/query/use-backends-health.test.ts b/src/hooks/query/use-backends-health.test.ts new file mode 100644 index 000000000..a2f1a80e7 --- /dev/null +++ b/src/hooks/query/use-backends-health.test.ts @@ -0,0 +1,97 @@ +import { describe, expect, it, vi } from "vitest"; +import { AgentServerUnknownVersionError } from "#/api/agent-server-compatibility"; +import type { Backend } from "#/api/backend-registry/types"; +import { + BACKEND_HEALTH_PROBE_MAX_ATTEMPTS, + CLOUD_BACKEND_API_KEY_OR_NETWORK_ERROR, + INVALID_BACKEND_API_KEY_ERROR, + isRetryableBackendHealthError, + probeBackendWithRetries, +} from "./use-backends-health"; + +const localBackend: Backend = { + id: "local", + kind: "local", + name: "Local", + host: "http://localhost:8001", + apiKey: "", +}; + +const cloudBackend: Backend = { + id: "cloud", + kind: "cloud", + name: "Cloud", + host: "https://app.all-hands.dev", + apiKey: "oh-cloud-key", +}; + +describe("isRetryableBackendHealthError", () => { + it("does not retry credential and compatibility failures", () => { + expect( + isRetryableBackendHealthError( + localBackend, + new Error(INVALID_BACKEND_API_KEY_ERROR), + ), + ).toBe(false); + expect( + isRetryableBackendHealthError( + localBackend, + new AgentServerUnknownVersionError("unknown"), + ), + ).toBe(false); + }); + + it("retries transient network failures", () => { + expect( + isRetryableBackendHealthError( + cloudBackend, + new Error(CLOUD_BACKEND_API_KEY_OR_NETWORK_ERROR), + ), + ).toBe(true); + expect( + isRetryableBackendHealthError(localBackend, new Error("Failed to fetch")), + ).toBe(true); + }); +}); + +describe("probeBackendWithRetries", () => { + it("retries transient probe failures before succeeding", async () => { + const probe = vi + .fn() + .mockRejectedValueOnce(new Error("Failed to fetch")) + .mockResolvedValueOnce(true); + const sleep = vi.fn().mockResolvedValue(undefined); + + await expect( + probeBackendWithRetries(localBackend, probe, sleep), + ).resolves.toBe(true); + + expect(probe).toHaveBeenCalledTimes(2); + expect(sleep).toHaveBeenCalledTimes(1); + }); + + it("does not retry non-transient failures", async () => { + const probe = vi + .fn() + .mockRejectedValue(new Error(INVALID_BACKEND_API_KEY_ERROR)); + const sleep = vi.fn().mockResolvedValue(undefined); + + await expect( + probeBackendWithRetries(localBackend, probe, sleep), + ).rejects.toThrow(INVALID_BACKEND_API_KEY_ERROR); + + expect(probe).toHaveBeenCalledTimes(1); + expect(sleep).not.toHaveBeenCalled(); + }); + + it("throws after exhausting transient retry attempts", async () => { + const probe = vi.fn().mockRejectedValue(new Error("Failed to fetch")); + const sleep = vi.fn().mockResolvedValue(undefined); + + await expect( + probeBackendWithRetries(localBackend, probe, sleep), + ).rejects.toThrow("Failed to fetch"); + + expect(probe).toHaveBeenCalledTimes(BACKEND_HEALTH_PROBE_MAX_ATTEMPTS); + }); +}); diff --git a/src/hooks/query/use-backends-health.ts b/src/hooks/query/use-backends-health.ts index 6919bf031..b50a057be 100644 --- a/src/hooks/query/use-backends-health.ts +++ b/src/hooks/query/use-backends-health.ts @@ -8,6 +8,8 @@ import { import { getCurrentCloudApiKey } from "#/api/cloud/organization-service.api"; import { assertAgentServerVersionIsSupported, + isAgentServerUnknownVersionError, + isAgentServerUnsupportedVersionError, isSdkHttpStatusError, } from "#/api/agent-server-compatibility"; import type { Backend } from "#/api/backend-registry/types"; @@ -26,6 +28,8 @@ import { MAX_CONSECUTIVE_FAILURES } from "#/api/backend-registry/health-storage" const REFRESH_INTERVAL_MS = 10000; const PROBE_TIMEOUT_MS = 4000; +export const BACKEND_HEALTH_PROBE_MAX_ATTEMPTS = 3; +export const BACKEND_HEALTH_PROBE_RETRY_DELAY_MS = 750; export const INVALID_BACKEND_API_KEY_ERROR = "Invalid API key"; export const MISSING_BACKEND_API_KEY_ERROR = "API key required"; export const CLOUD_BACKEND_API_KEY_OR_NETWORK_ERROR = @@ -60,6 +64,47 @@ export function isCloudBackendLoggedOutHealthError( return error === CLOUD_BACKEND_LOGGED_OUT_ERROR; } +function sleep(ms: number): Promise { + return new Promise((resolve) => { + window.setTimeout(resolve, ms); + }); +} + +export function isRetryableBackendHealthError( + backend: Backend, + error: unknown, +): boolean { + if (isSdkHttpStatusError(error, 401)) { + return false; + } + + if ( + isAgentServerUnsupportedVersionError(error) || + isAgentServerUnknownVersionError(error) + ) { + return false; + } + + if (error instanceof Error) { + if ( + error.message === INVALID_BACKEND_API_KEY_ERROR || + error.message === MISSING_BACKEND_API_KEY_ERROR || + error.message === CLOUD_BACKEND_LOGGED_OUT_ERROR + ) { + return false; + } + + if ( + backend.kind === "cloud" && + error.message === CLOUD_BACKEND_API_KEY_OR_NETWORK_ERROR + ) { + return true; + } + } + + return true; +} + /** * Probe a single backend for connectivity. The probe path differs by * backend kind: @@ -139,6 +184,39 @@ export interface UseBackendsHealthOptions { probeDisabledOnce?: boolean; } +type ProbeBackendFn = (backend: Backend) => Promise; +type SleepFn = (ms: number) => Promise; + +export async function probeBackendWithRetries( + backend: Backend, + probe: ProbeBackendFn = probeBackend, + sleepFn: SleepFn = sleep, +): Promise { + let lastError: unknown; + + for ( + let attempt = 1; + attempt <= BACKEND_HEALTH_PROBE_MAX_ATTEMPTS; + attempt += 1 + ) { + try { + return await probe(backend); + } catch (error) { + lastError = error; + if ( + attempt >= BACKEND_HEALTH_PROBE_MAX_ATTEMPTS || + !isRetryableBackendHealthError(backend, error) + ) { + throw error; + } + + await sleepFn(BACKEND_HEALTH_PROBE_RETRY_DELAY_MS * attempt); + } + } + + throw lastError; +} + /** * Poll every backend in `backends` once every 10s and report a simple * connected / disconnected verdict per backend id. @@ -190,7 +268,7 @@ export function useBackendsHealth( ] as const, queryFn: async () => { try { - const result = await probeBackend(b); + const result = await probeBackendWithRetries(b); recordBackendSuccess(b.id); return result; } catch (err) { diff --git a/src/hooks/query/use-config.test.ts b/src/hooks/query/use-config.test.ts new file mode 100644 index 000000000..3c9f386ba --- /dev/null +++ b/src/hooks/query/use-config.test.ts @@ -0,0 +1,39 @@ +import { describe, expect, it } from "vitest"; +import { AgentServerUnavailableError } from "#/api/agent-server-compatibility"; +import { + AGENT_SERVER_BOOTSTRAP_RETRY_COUNT, + getConfigRetryDelay, + shouldRetryConfigQuery, +} from "./use-config"; + +describe("shouldRetryConfigQuery", () => { + it("retries transient agent-server unavailable errors", () => { + const error = new AgentServerUnavailableError("timeout"); + + expect(shouldRetryConfigQuery(0, error)).toBe(true); + expect( + shouldRetryConfigQuery(AGENT_SERVER_BOOTSTRAP_RETRY_COUNT, error), + ).toBe(false); + }); + + it("does not retry when no backend is configured", () => { + const error = new AgentServerUnavailableError("No backend configured", { + noBackendConfigured: true, + }); + + expect(shouldRetryConfigQuery(0, error)).toBe(false); + }); + + it("keeps the existing retry cap for non-bootstrap errors", () => { + const error = new Error("Unexpected"); + + expect(shouldRetryConfigQuery(0, error)).toBe(true); + expect(shouldRetryConfigQuery(3, error)).toBe(false); + }); + + it("uses capped exponential backoff", () => { + expect(getConfigRetryDelay(0)).toBe(1000); + expect(getConfigRetryDelay(1)).toBe(2000); + expect(getConfigRetryDelay(10)).toBe(5000); + }); +}); diff --git a/src/hooks/query/use-config.ts b/src/hooks/query/use-config.ts index 995064df2..ebb621f4b 100644 --- a/src/hooks/query/use-config.ts +++ b/src/hooks/query/use-config.ts @@ -10,14 +10,41 @@ interface UseConfigOptions { enabled?: boolean; } +export const AGENT_SERVER_BOOTSTRAP_RETRY_COUNT = 4; +export const AGENT_SERVER_BOOTSTRAP_RETRY_BASE_DELAY_MS = 1000; +export const AGENT_SERVER_BOOTSTRAP_RETRY_MAX_DELAY_MS = 5000; + +export function shouldRetryConfigQuery( + failureCount: number, + error: unknown, +): boolean { + if (isAgentServerAuthError(error)) { + return false; + } + + if (isAgentServerUnavailableError(error)) { + return ( + !error.noBackendConfigured && + failureCount < AGENT_SERVER_BOOTSTRAP_RETRY_COUNT + ); + } + + return failureCount < 3; +} + +export function getConfigRetryDelay(attemptIndex: number): number { + return Math.min( + AGENT_SERVER_BOOTSTRAP_RETRY_BASE_DELAY_MS * 2 ** attemptIndex, + AGENT_SERVER_BOOTSTRAP_RETRY_MAX_DELAY_MS, + ); +} + export const useConfig = (options?: UseConfigOptions) => useQuery({ queryKey: QUERY_KEYS.WEB_CLIENT_CONFIG, queryFn: OptionService.getConfig, - retry: (failureCount, error) => - !isAgentServerUnavailableError(error) && - !isAgentServerAuthError(error) && - failureCount < 3, + retry: shouldRetryConfigQuery, + retryDelay: getConfigRetryDelay, meta: { disableToast: true }, ...CONFIG_CACHE_OPTIONS, enabled: options?.enabled, diff --git a/src/i18n/translation.json b/src/i18n/translation.json index c4c85ccdd..7da50338e 100644 --- a/src/i18n/translation.json +++ b/src/i18n/translation.json @@ -6629,6 +6629,40 @@ "uk": "Повторити підключення", "ca": "Torna a provar la connexió" }, + "SETTINGS$AGENT_SERVER_RECONNECTING_TITLE": { + "en": "Reconnecting to backend...", + "ja": "バックエンドに再接続しています...", + "zh-CN": "正在重新连接到后端...", + "zh-TW": "正在重新連線到後端...", + "ko-KR": "백엔드에 다시 연결하는 중...", + "no": "Kobler til backend på nytt...", + "it": "Riconnessione al backend...", + "pt": "Reconectando ao backend...", + "es": "Reconectando con el backend...", + "ar": "جارٍ إعادة الاتصال بالواجهة الخلفية...", + "fr": "Reconnexion au backend...", + "tr": "Arka uca yeniden bağlanılıyor...", + "de": "Verbindung zum Backend wird wiederhergestellt...", + "uk": "Повторне підключення до бекенду...", + "ca": "S'està reconnectant al backend..." + }, + "SETTINGS$AGENT_SERVER_RECONNECTING_MESSAGE": { + "en": "Keeping this session open while the agent server recovers.", + "ja": "エージェントサーバーが復旧するまで、このセッションを開いたままにします。", + "zh-CN": "在代理服务器恢复期间保持此会话打开。", + "zh-TW": "在代理伺服器復原期間保持此工作階段開啟。", + "ko-KR": "에이전트 서버가 복구되는 동안 이 세션을 열어 둡니다.", + "no": "Holder denne økten åpen mens agentserveren gjenopprettes.", + "it": "Manteniamo aperta questa sessione mentre il server dell'agente si ripristina.", + "pt": "Mantendo esta sessão aberta enquanto o servidor do agente se recupera.", + "es": "Mantenemos esta sesión abierta mientras el servidor del agente se recupera.", + "ar": "سنُبقي هذه الجلسة مفتوحة أثناء تعافي خادم الوكيل.", + "fr": "Cette session reste ouverte pendant que le serveur d'agent se rétablit.", + "tr": "Aracı sunucusu toparlanırken bu oturumu açık tutuyoruz.", + "de": "Diese Sitzung bleibt geöffnet, während der Agent-Server wiederhergestellt wird.", + "uk": "Тримаємо цей сеанс відкритим, доки сервер агента відновлюється.", + "ca": "Mantenim aquesta sessió oberta mentre el servidor de l'agent es recupera." + }, "SETTINGS$AGENT_SERVER_ONBOARDING_EYEBROW": { "en": "Get started", "ja": "はじめに", diff --git a/src/root.tsx b/src/root.tsx index 88c55ec2b..f61a5cb6f 100644 --- a/src/root.tsx +++ b/src/root.tsx @@ -12,11 +12,13 @@ import "./index.css"; import React from "react"; import { useQueryClient } from "@tanstack/react-query"; import { Toaster } from "react-hot-toast"; +import { useTranslation } from "react-i18next"; import { clearCachedAgentServerInfo, isAgentServerUnavailableError, isAgentServerAuthError, } from "#/api/agent-server-compatibility"; +import { I18nKey } from "#/i18n/declaration"; import { isAuthRequiredAndMissing } from "#/api/agent-server-config"; import { getEffectiveLocalBackend } from "#/api/backend-registry/active-store"; import { useActiveBackendContext } from "#/contexts/active-backend-context"; @@ -80,12 +82,28 @@ export function Layout({ children }: { children: React.ReactNode }) { ); } -function AgentServerBootstrapLoading() { +function AgentServerBootstrapLoading({ + reconnecting = false, +}: { + reconnecting?: boolean; +}) { + const { t } = useTranslation(); + return (
-
+
+ {reconnecting ? ( +
+

+ {t(I18nKey.SETTINGS$AGENT_SERVER_RECONNECTING_TITLE)} +

+

+ {t(I18nKey.SETTINGS$AGENT_SERVER_RECONNECTING_MESSAGE)} +

+
+ ) : null}
@@ -174,7 +192,10 @@ export default function App() { } if (config.isPending || config.isLoading) { - return ; + const reconnecting = + config.failureCount > 0 && + isAgentServerUnavailableError(config.failureReason); + return ; } if (activeCloudLoggedOut || isAgentServerUnavailableError(config.error)) { From a79dfca864ecffdb722536b280ce0e962ec0cb75 Mon Sep 17 00:00:00 2001 From: neubig <398875+neubig@users.noreply.github.com> Date: Thu, 18 Jun 2026 10:18:09 -0400 Subject: [PATCH 2/2] Fix backend reconnect retry tests --- .../backends/backend-selector.test.tsx | 13 ++++++---- .../hooks/query/use-backends-health.test.tsx | 26 +++++++++++-------- __tests__/root.test.tsx | 22 ++++++++-------- src/hooks/query/use-config.test.ts | 18 ++++++++++++- src/hooks/query/use-config.ts | 8 +++++- 5 files changed, 58 insertions(+), 29 deletions(-) diff --git a/__tests__/components/backends/backend-selector.test.tsx b/__tests__/components/backends/backend-selector.test.tsx index e0dff14d7..0fc6f2563 100644 --- a/__tests__/components/backends/backend-selector.test.tsx +++ b/__tests__/components/backends/backend-selector.test.tsx @@ -726,11 +726,14 @@ describe("BackendSelector", () => { renderWithProviders(); - await waitFor(() => { - const wrapper = screen.getByTestId("backend-selector"); - const dot = within(wrapper).getByTestId("backend-status-dot"); - expect(dot.getAttribute("data-status")).toBe("disconnected"); - }); + await waitFor( + () => { + const wrapper = screen.getByTestId("backend-selector"); + const dot = within(wrapper).getByTestId("backend-status-dot"); + expect(dot.getAttribute("data-status")).toBe("disconnected"); + }, + { timeout: 5000 }, + ); }); }); }); diff --git a/__tests__/hooks/query/use-backends-health.test.tsx b/__tests__/hooks/query/use-backends-health.test.tsx index 7bcc29491..a08bab32f 100644 --- a/__tests__/hooks/query/use-backends-health.test.tsx +++ b/__tests__/hooks/query/use-backends-health.test.tsx @@ -119,8 +119,9 @@ describe("useBackendsHealth", () => { wrapper, }); - await waitFor(() => - expect(result.current[localBackend.id].isConnected).toBe(false), + await waitFor( + () => expect(result.current[localBackend.id].isConnected).toBe(false), + { timeout: 5000 }, ); }); @@ -169,8 +170,9 @@ describe("useBackendsHealth", () => { wrapper, }); - await waitFor(() => - expect(result.current[cloudBackend.id].isConnected).toBe(false), + await waitFor( + () => expect(result.current[cloudBackend.id].isConnected).toBe(false), + { timeout: 5000 }, ); }); @@ -227,13 +229,15 @@ describe("useBackendsHealth", () => { // Assert — one failed probe surfaces the new metadata fields on // the hook's return value and persists them to localStorage; the // disabled flag stays false because we're below the cap. - await waitFor(() => - expect(result.current[localBackend.id]).toMatchObject({ - isConnected: false, - consecutiveFailures: 1, - lastError: "ECONNREFUSED", - disabled: false, - }), + await waitFor( + () => + expect(result.current[localBackend.id]).toMatchObject({ + isConnected: false, + consecutiveFailures: 1, + lastError: "ECONNREFUSED", + disabled: false, + }), + { timeout: 5000 }, ); const persisted = JSON.parse( window.localStorage.getItem(BACKEND_HEALTH_STORAGE_KEY) ?? "{}", diff --git a/__tests__/root.test.tsx b/__tests__/root.test.tsx index e5f75f23b..ed8a3eaef 100644 --- a/__tests__/root.test.tsx +++ b/__tests__/root.test.tsx @@ -19,6 +19,9 @@ const TRANSLATIONS: Record = { BACKEND$EDIT: "Edit", BACKEND$REMOVE: "Remove", HOME$DONE: "Done", + SETTINGS$AGENT_SERVER_RECONNECTING_TITLE: "Reconnecting to backend...", + SETTINGS$AGENT_SERVER_RECONNECTING_MESSAGE: + "Keeping this session open while the agent server recovers.", }; vi.mock("react-i18next", () => ({ @@ -105,7 +108,7 @@ describe("App root agent-server availability guard", () => { expect(screen.queryByTestId("app-outlet")).not.toBeInTheDocument(); }); - it("shows the manage-backends modal when the backend is unreachable", async () => { + it("shows a reconnecting state when the configured backend is transiently unreachable", async () => { let serverInfoRequests = 0; // Use "*" prefix to match both relative paths and absolute URLs (e.g., @@ -121,19 +124,14 @@ describe("App root agent-server availability guard", () => { await waitFor(() => { expect( - screen.getByTestId("agent-server-onboarding-screen"), + screen.getByText("Reconnecting to backend..."), ).toBeInTheDocument(); }); - // The onboarding placeholder now hosts the Manage Backends modal - // directly so the user can edit/add a backend immediately. The - // modal additionally probes /server_info per registered backend - // for its status dot + version label, so the request count is - // bounded but greater than the single config probe. - await waitFor(() => { - expect(screen.getByTestId("manage-backends-modal")).toBeInTheDocument(); - }); expect(serverInfoRequests).toBeGreaterThanOrEqual(1); + expect( + screen.queryByTestId("manage-backends-modal"), + ).not.toBeInTheDocument(); expect(screen.queryByTestId("app-outlet")).not.toBeInTheDocument(); }); @@ -171,7 +169,9 @@ describe("App root agent-server availability guard", () => { screen.getByTestId("agent-server-onboarding-screen"), ).toBeInTheDocument(); }); - expect(screen.getByTestId("manage-backends-modal")).toBeInTheDocument(); + await waitFor(() => { + expect(screen.getByTestId("manage-backends-modal")).toBeInTheDocument(); + }); expect(screen.getByText("Logged out")).toBeInTheDocument(); expect( screen.getByRole("button", { name: "Log back in" }), diff --git a/src/hooks/query/use-config.test.ts b/src/hooks/query/use-config.test.ts index 3c9f386ba..2b604f477 100644 --- a/src/hooks/query/use-config.test.ts +++ b/src/hooks/query/use-config.test.ts @@ -1,5 +1,9 @@ import { describe, expect, it } from "vitest"; -import { AgentServerUnavailableError } from "#/api/agent-server-compatibility"; +import { + AgentServerUnknownVersionError, + AgentServerUnavailableError, + AgentServerUnsupportedVersionError, +} from "#/api/agent-server-compatibility"; import { AGENT_SERVER_BOOTSTRAP_RETRY_COUNT, getConfigRetryDelay, @@ -24,6 +28,18 @@ describe("shouldRetryConfigQuery", () => { expect(shouldRetryConfigQuery(0, error)).toBe(false); }); + it("does not retry compatibility failures", () => { + expect( + shouldRetryConfigQuery( + 0, + new AgentServerUnsupportedVersionError("1.0.0"), + ), + ).toBe(false); + expect( + shouldRetryConfigQuery(0, new AgentServerUnknownVersionError(null)), + ).toBe(false); + }); + it("keeps the existing retry cap for non-bootstrap errors", () => { const error = new Error("Unexpected"); diff --git a/src/hooks/query/use-config.ts b/src/hooks/query/use-config.ts index ebb621f4b..98ccc8931 100644 --- a/src/hooks/query/use-config.ts +++ b/src/hooks/query/use-config.ts @@ -2,6 +2,8 @@ import { useQuery } from "@tanstack/react-query"; import { isAgentServerUnavailableError, isAgentServerAuthError, + isAgentServerUnknownVersionError, + isAgentServerUnsupportedVersionError, } from "#/api/agent-server-compatibility"; import OptionService from "#/api/option-service/option-service.api"; import { QUERY_KEYS, CONFIG_CACHE_OPTIONS } from "./query-keys"; @@ -18,7 +20,11 @@ export function shouldRetryConfigQuery( failureCount: number, error: unknown, ): boolean { - if (isAgentServerAuthError(error)) { + if ( + isAgentServerAuthError(error) || + isAgentServerUnsupportedVersionError(error) || + isAgentServerUnknownVersionError(error) + ) { return false; }