diff --git a/docs/ai/patterns/error-handling.mdx b/docs/ai/patterns/error-handling.mdx index b1ac3ec8..f61d215e 100644 --- a/docs/ai/patterns/error-handling.mdx +++ b/docs/ai/patterns/error-handling.mdx @@ -7,9 +7,9 @@ description: "Implement robust error handling and retry strategies for reliable import {GlobalTabs, GlobalTab} from "/snippets/components/global-tabs.jsx"; import { GitHubLink } from '/snippets/blocks/github-link.mdx'; -import SetupPydanticAI from '/snippets/tour/ai/setup-pydantic-ai.mdx'; -LLM calls are costly, so you want to configure retry behavior to avoid infinite loops and high costs while still recovering from transient failures. +Restate automatically retries failures of your agents until they succeed. +But LLM calls are costly, so you might want to configure retry behavior to fit your use case and to avoid retrying errors that cannot heal. Restate distinguishes between two types of errors: - **Transient errors**: Temporary issues like network failures or rate limits. Restate automatically retries these until they succeed or the retry policy is exhausted. @@ -27,16 +27,60 @@ Restate distinguishes between two types of errors: ## Retrying LLM calls -LLM API calls fail transiently (rate limits, network issues, provider outages). Configure retry limits to handle this automatically and prevent runaway costs. +LLM API calls can suffer from transient failures (rate limits, network issues, provider outages). Restate retries failed LLM calls so your agents recover automatically. + +### Default behavior -In the Vercel AI SDK, set `maxRetries` on `generateText` (default: 2) to retry failed calls due to rate limits or transient errors. -After retries are exhausted, the agent throws an error. -Restate then retries the invocation with exponential backoff to handle longer outages or network issues. +The Vercel AI SDK and the Restate middleware each have their own retry layer, and they compose. 
+ +The Vercel AI SDK does the first layer of retries based on what is set for `maxRetries` on `generateText` (default: 2). Once those are exhausted, the AI SDK throws an error. + +Restate then takes over and retries the invocation. Each Restate retry replays the call, which goes through `maxRetries` Vercel AI SDK attempts again. + +By default, Restate's retries follow the policy configured at the [service or handler level](/services/configuration#how-to-configure), or otherwise the [Restate server's default policy](/guides/error-handling#configure-restate-server-defaults). +Restate will go through a limited set of retries with exponential backoff (see [default policy](/references/server-config#param-default-retry-policy)), after which the invocation will be paused. This gives you time to fix the issue, and then [resume the invocation](/services/invocation/managing-invocations#resume). + + + + +By default, `DurableRunner.run` retries LLM calls according to the policy configured at the [service or handler level](/services/configuration#how-to-configure), or otherwise the [Restate server's default policy](/guides/error-handling#configure-restate-server-defaults). +Restate will go through a limited set of retries with exponential backoff (see [default policy](/references/server-config#param-default-retry-policy)), after which the invocation will be paused. This gives you time to fix the issue, and then [resume the invocation](/services/invocation/managing-invocations#resume). + + + + +By default, the `RestatePlugin` retries LLM calls according to the policy configured at the [service or handler level](/services/configuration#how-to-configure), or otherwise the [Restate server's default policy](/guides/error-handling#configure-restate-server-defaults). +Restate will go through a limited set of retries with exponential backoff (see [default policy](/references/server-config#param-default-retry-policy)), after which the invocation will be paused. 
This gives you time to fix the issue, and then [resume the invocation](/services/invocation/managing-invocations#resume). -You can limit Restate's retries with the `maxRetryAttempts` option in `durableCalls` middleware: + + + +By default, `RestateAgent` retries LLM calls according to the policy configured at the [service or handler level](/services/configuration#how-to-configure), or otherwise the [Restate server's default policy](/guides/error-handling#configure-restate-server-defaults). +Restate will go through a limited set of retries with exponential backoff (see [default policy](/references/server-config#param-default-retry-policy)), after which the invocation will be paused. This gives you time to fix the issue, and then [resume the invocation](/services/invocation/managing-invocations#resume). + + + + +When you wrap LLM calls in `ctx.run()`, Restate retries them according to the policy configured at the [service or handler level](/services/configuration#how-to-configure), or otherwise the [Restate server's default policy](/guides/error-handling#configure-restate-server-defaults). +Restate will go through a limited set of retries with exponential backoff (see [default policy](/references/server-config#param-default-retry-policy)), after which the invocation will be paused. This gives you time to fix the issue, and then [resume the invocation](/services/invocation/managing-invocations#resume). + + + + +When you wrap LLM calls in `ctx.run_typed()`, Restate retries them according to the policy configured at the [service or handler level](/services/configuration#how-to-configure), or otherwise the [Restate server's default policy](/guides/error-handling#configure-restate-server-defaults). +Restate will go through a limited set of retries with exponential backoff (see [default policy](/references/server-config#param-default-retry-policy)), after which the invocation will be paused. 
This gives you time to fix the issue, and then [resume the invocation](/services/invocation/managing-invocations#resume). + + + + +### Setting a retry policy + + + +To set a separate retry policy for LLM calls, pass [`RunOptions`](https://restatedev.github.io/sdk-typescript/types/_restatedev_restate-sdk.RunOptions.html) to the `durableCalls` middleware: ```typescript errorhandling/fail-on-terminal-tool-agent.ts {"CODE_LOAD::https://raw.githubusercontent.com/restatedev/ai-examples/refs/heads/main/vercel-ai/tour-of-agents/src/errorhandling/fail-on-terminal-tool-agent.ts#max_attempts_example"} const model = wrapLanguageModel({ @@ -46,14 +90,14 @@ const model = wrapLanguageModel({ ``` -Each Restate retry triggers up to `maxRetries` SDK attempts. -For example, with `maxRetryAttempts`: 3 and `maxRetries`: 2, a call may be attempted 6 times. -Once Restate's retries are exhausted, the invocation fails with a `TerminalError` and won't be retried further. +If you set a maximum number of retry attempts, Restate will still go through the AI SDK's `maxRetries` for each attempt, so the two limits multiply (e.g. `maxRetryAttempts`: 3 × `maxRetries`: 2 = up to 6 attempts). + +Once Restate's retries are exhausted, the invocation fails with a `TerminalError` and won't be retried further. You can catch the Terminal Error in your handler and act accordingly. -Restate's `DurableRunner` lets you specify the retry behavior for LLM calls: +To set a separate retry policy for LLM calls, pass [`RunOptions`](https://github.com/restatedev/sdk-python/blob/main/python/restate/context.py#L37) to `DurableRunner.run`: ```python error_handling.py {"CODE_LOAD::https://raw.githubusercontent.com/restatedev/ai-examples/refs/heads/main/openai-agents/tour-of-agents/app/error_handling.py#handle"} @agent_service.handler() @@ -71,12 +115,12 @@ async def run(_ctx: restate.Context, req: WeatherPrompt) -> str: ``` -By default, the runner retries ten times with an initial interval of one second. 
Once Restate's retries are exhausted, the invocation fails with a `TerminalError` and won't be retried further. +Once these retries are exhausted, the invocation fails with a `TerminalError` and won't be retried further. You can catch the Terminal Error in your handler and act accordingly. -Configure the number of retries for LLM calls when activating the Restate plugin for your ADK App: +To set a separate retry policy for LLM calls, pass [`RunOptions`](https://github.com/restatedev/sdk-python/blob/main/python/restate/context.py#L37) to the Restate plugin when activating it for your ADK App: ```python error_handling.py {"CODE_LOAD::https://raw.githubusercontent.com/restatedev/ai-examples/refs/heads/main/google-adk/tour-of-agents/app/error_handling.py#retries"} run_options = RunOptions(max_attempts=3, initial_retry_interval=timedelta(seconds=1)) @@ -88,12 +132,12 @@ app = App( ``` -By default, the runner retries ten times with an initial interval of one second. Once Restate's retries are exhausted, the invocation fails with a `TerminalError` and won't be retried further. +Once these retries are exhausted, the invocation fails with a `TerminalError` and won't be retried further. You can catch the Terminal Error in your handler and act accordingly. -Restate's `RestateAgent` lets you specify the retry behavior for LLM calls via `RunOptions`: +To set a separate retry policy for LLM calls, pass [`RunOptions`](https://github.com/restatedev/sdk-python/blob/main/python/restate/context.py#L37) to `RestateAgent`: ```python error_handling.py {"CODE_LOAD::https://raw.githubusercontent.com/restatedev/ai-examples/refs/heads/main/pydantic-ai/tour-of-agents/app/error_handling.py#retries"} restate_agent = RestateAgent( @@ -103,7 +147,7 @@ restate_agent = RestateAgent( ``` -By default, the runner retries ten times with an initial interval of one second. Once Restate's retries are exhausted, the invocation fails with a `TerminalError` and won't be retried further. 
+Once these retries are exhausted, the invocation fails with a `TerminalError` and won't be retried further. You can catch the Terminal Error in your handler and act accordingly. @@ -124,7 +168,7 @@ By default, the middleware retries indefinitely with exponential backoff. Once R -Wrap LLM calls in `ctx.run()` with a retry limit to handle transient failures automatically: +To set a separate retry policy for LLM calls, pass [`RunOptions`](https://restatedev.github.io/sdk-typescript/types/_restatedev_restate-sdk.RunOptions.html) to `ctx.run()`: ```typescript // Retries up to 3 times with exponential backoff @@ -135,14 +179,12 @@ const result = await ctx.run( ); ``` -Without `maxRetryAttempts`, Restate retries indefinitely with exponential backoff. For LLM calls, setting a limit prevents runaway costs from persistent failures. - -You can set [custom retry policies](/guides/error-handling#at-the-run-block-level) for `ctx.run` steps. +Once these retries are exhausted, the invocation fails with a `TerminalError` and won't be retried further. You can catch the Terminal Error in your handler and act accordingly. -Wrap LLM calls in `ctx.run_typed()` with a retry limit to handle transient failures automatically: +To set a separate retry policy for LLM calls, pass [`RunOptions`](https://github.com/restatedev/sdk-python/blob/main/python/restate/context.py#L37) to `ctx.run_typed()`: ```python # Retries up to 3 times with exponential backoff @@ -155,30 +197,24 @@ result = await ctx.run_typed( ) ``` -Without `max_attempts`, Restate retries indefinitely with exponential backoff. For LLM calls, setting a limit prevents runaway costs from persistent failures. +Once these retries are exhausted, the invocation fails with a `TerminalError` and won't be retried further. You can catch the Terminal Error in your handler and act accordingly. -You can set [custom retry policies](/guides/error-handling#at-the-run-block-level) for `.run` actions. 
## Tool execution errors - - - -When agent tools use Restate Context actions like `ctx.run`, Restate automatically retries transient errors in these operations. This makes your tools resilient to network failures, database hiccups, and other temporary issues. For all operations that might suffer from transient errors, use Context actions. +Restate makes tool execution resilient by retrying transient errors and propagating terminal ones. -For errors that should not be retried, throw a terminal error: +### Transient errors -```typescript {"CODE_LOAD::ts/src/tour/agents/terminal_error.ts#terminal_error"} -throw new TerminalError("This tool is not allowed to run for this input."); -``` + + -By default, the Vercel AI will convert any errors in tool executions into a message to the LLM, and the agent will decide how to proceed. -This is often desirable, as the LLM can decide to use a different tool or provide a fallback answer. +By default, the Vercel AI SDK converts any errors in tool executions into a message to the LLM, and the agent decides how to proceed. This is often desirable, as the LLM can decide to use a different tool or provide a fallback answer. -However, if you use Restate Context actions like `ctx.run` in your tool execution, Restate will retry any transient errors in these actions until they succeed. +When you wrap external calls in Restate Context actions like `ctx.run`, Restate retries transient errors within the Context action before the result reaches the agent. This makes your tools resilient to network failures, database hiccups, and other temporary issues. 
For all operations that might suffer from transient errors, use Context actions: ```typescript {"CODE_LOAD::ts/src/tour/agents/inline-tool-errors.ts#here"} // Without ctx.run - error goes straight to agent @@ -195,11 +231,159 @@ async function myToolWithRestate(ctx: restate.Context) { } ``` -Terminal errors thrown from Restate Context actions are not retried by Restate, and get processed by the Vercel AI. -Also here, the Vercel AI will convert the error into a message to the LLM, and the agent will decide how to proceed. +Restate then retries the whole invocation according to the policy configured at the [service or handler level](/services/configuration#how-to-configure), or otherwise the [Restate server's default policy](/guides/error-handling#configure-restate-server-defaults). + + + + +Restate retries all transient errors to make your tools resilient to network failures, database hiccups, and other temporary issues. + +By default, it uses the policy configured at the [service or handler level](/services/configuration#how-to-configure), or otherwise the [Restate server's default policy](/guides/error-handling#configure-restate-server-defaults). + + + + +Restate retries all transient errors to make your tools resilient to network failures, database hiccups, and other temporary issues. + +By default, it uses the policy configured at the [service or handler level](/services/configuration#how-to-configure), or otherwise the [Restate server's default policy](/guides/error-handling#configure-restate-server-defaults). + + + + +Restate retries all transient errors to make your tools resilient to network failures, database hiccups, and other temporary issues. + +By default, it uses the policy configured at the [service or handler level](/services/configuration#how-to-configure), or otherwise the [Restate server's default policy](/guides/error-handling#configure-restate-server-defaults). 
+ + + + +Restate retries all transient errors to make your tools resilient to network failures, database hiccups, and other temporary issues. + +By default, it uses the policy configured at the [service or handler level](/services/configuration#how-to-configure), or otherwise the [Restate server's default policy](/guides/error-handling#configure-restate-server-defaults). + + + + +Restate retries all transient errors to make your tools resilient to network failures, database hiccups, and other temporary issues. + +By default, it uses the policy configured at the [service or handler level](/services/configuration#how-to-configure), or otherwise the [Restate server's default policy](/guides/error-handling#configure-restate-server-defaults). + + + + +### Setting a retry policy on run actions + + + + +If you do run actions in your tools, you can override the default retry policy by passing [`RunOptions`](https://restatedev.github.io/sdk-typescript/types/_restatedev_restate-sdk.RunOptions.html): + +```ts {"CODE_LOAD::ts/src/ai/guides/errorhandling/error_handling.ts#retries"} +const result = await ctx.run( + "fetch-data", + () => fetch("/api/data"), + { maxRetryAttempts: 3 } +); +``` + +See [custom retry policies](/guides/error-handling#at-the-run-block-level) for more options. When retries are exhausted, the tool will fail with a Terminal Error. + + + -In some cases, you might want to treat terminal tool execution errors as permanent failures and stop the agent instead of letting the LLM decide how to proceed. 
-The Restate middleware provides two utilities to help with this: +If you do run actions in your tools, you can override the default retry policy by passing [`RunOptions`](https://github.com/restatedev/sdk-python/blob/main/python/restate/context.py#L37): + +```python {"CODE_LOAD::python/src/ai/error_handling.py#retries"} +result = await restate_context().run_typed( + "fetch data", + fetch_data, + RunOptions(max_attempts=3), + req=req, +) +``` + +See [custom retry policies](/guides/error-handling#at-the-run-block-level) for more options. When retries are exhausted, the tool will fail with a Terminal Error. + + + + +If you do run actions in your tools, you can override the default retry policy by passing [`RunOptions`](https://github.com/restatedev/sdk-python/blob/main/python/restate/context.py#L37): + +```python {"CODE_LOAD::python/src/ai/error_handling.py#retries"} +result = await restate_context().run_typed( + "fetch data", + fetch_data, + RunOptions(max_attempts=3), + req=req, +) +``` + +See [custom retry policies](/guides/error-handling#at-the-run-block-level) for more options. When retries are exhausted, the tool will fail with a Terminal Error. + + + + +If you do run actions in your tools, you can override the default retry policy by passing [`RunOptions`](https://github.com/restatedev/sdk-python/blob/main/python/restate/context.py#L37): + +```python {"CODE_LOAD::python/src/ai/error_handling.py#retries"} +result = await restate_context().run_typed( + "fetch data", + fetch_data, + RunOptions(max_attempts=3), + req=req, +) +``` + +See [custom retry policies](/guides/error-handling#at-the-run-block-level) for more options. When retries are exhausted, the tool will fail with a Terminal Error. 
+ + + + +If you do `ctx.run` actions in your tools, you can override the default retry policy by passing [`RunOptions`](https://restatedev.github.io/sdk-typescript/types/_restatedev_restate-sdk.RunOptions.html): + +```ts {"CODE_LOAD::ts/src/ai/guides/errorhandling/error_handling.ts#retries"} +const result = await ctx.run( + "fetch-data", + () => fetch("/api/data"), + { maxRetryAttempts: 3 } +); +``` + +See [custom retry policies](/guides/error-handling#at-the-run-block-level) for more options. When retries are exhausted, the tool will fail with a Terminal Error. + + + + +For `ctx.run_typed` actions specifically, you can override the default retry policy by passing [`RunOptions`](https://github.com/restatedev/sdk-python/blob/main/python/restate/context.py#L37): + +```python {"CODE_LOAD::python/src/ai/error_handling.py#retries"} +result = await restate_context().run_typed( + "fetch data", + fetch_data, + RunOptions(max_attempts=3), + req=req, +) +``` + +See [custom retry policies](/guides/error-handling#at-the-run-block-level) for more options. When retries are exhausted, the tool will fail with a Terminal Error. + + + + +### Terminal errors + +For errors that should not be retried (invalid input, business rule violations, resource not found), use a `TerminalError` in your tool. Restate does not retry these: + + + + +```typescript {"CODE_LOAD::ts/src/tour/agents/terminal_error.ts#terminal_error"} +throw new TerminalError("This tool is not allowed to run for this input."); +``` + +By default, Vercel AI converts the terminal error into a message to the LLM, and the agent decides how to proceed. + +If you want to treat terminal tool errors as permanent failures and stop the agent instead, the Restate middleware provides two utilities: @@ -258,75 +442,74 @@ if (terminalSteps.length > 0) { -When agent tools use Restate Context actions like `ctx.run`, Restate automatically retries transient errors in these operations. 
This makes your tools resilient to network failures, database hiccups, and other temporary issues. For all operations that might suffer from transient errors, use Context actions. - -For errors that should not be retried, throw a terminal error: - -```python +```python {"CODE_LOAD::python/src/ai/error_handling.py#terminal"} from restate import TerminalError raise TerminalError("This tool is not allowed to run for this input.") ``` -By default, the Restate OpenAI integration will raise any terminal errors in tool executions and will let you handle them in your handler. +The Restate OpenAI integration raises terminal errors to your handler, where you can catch and handle them: - +```python {"CODE_LOAD::https://raw.githubusercontent.com/restatedev/ai-examples/refs/heads/main/openai-agents/tour-of-agents/app/error_handling.py#handle"} +@agent_service.handler() +async def run(_ctx: restate.Context, req: WeatherPrompt) -> str: + try: + run_opts = RunOptions( + max_attempts=3, initial_retry_interval=timedelta(seconds=2) + ) + result = await DurableRunner.run(agent, req.message, run_options=run_opts) + except restate.TerminalError as e: + # Handle terminal errors gracefully + return f"The agent couldn't complete the request: {e.message}" + + return result.final_output +``` + + The OpenAI Agent SDK also allows setting `failure_error_function` to `None`, which will rethrow any error in the agent execution as-is. This also applies to, for example, invalid LLM responses (e.g. a tool call with invalid arguments or to a tool that doesn't exist). - The error will then lead to Restate retries. Restate will recover the invocation by replaying the journal entries. - This can lead to infinite retries if the error is not transient. - Therefore, be careful when using this option and handle errors appropriately in your agent logic. - You also might want to set [a retry policy at the service or handler level](/services/configuration#how-to-configure) to avoid infinite retries. 
- + The error will then lead to Restate retries. Since the error isn't transient, the invocation will be paused when the retries are exhausted, and will require manual intervention. + Therefore, we do not recommend using this setting and instead recommend handling these errors appropriately in your agent logic. + -When agent tools use Restate Context actions like `ctx.run`, Restate automatically retries transient errors in these operations. This makes your tools resilient to network failures, database hiccups, and other temporary issues. For all operations that might suffer from transient errors, use Context actions. - -For errors that should not be retried, throw a terminal error: - -```python +```python {"CODE_LOAD::python/src/ai/error_handling.py#terminal"} from restate import TerminalError raise TerminalError("This tool is not allowed to run for this input.") ``` -Restate retries tool executions by default until they succeed. -For errors which should not be retried, raise terminal errors from within your tool implementations. +You can catch these terminal errors in your handler and handle them accordingly: -You can catch these terminal errors in your handler and handle them accordingly. +```python {"CODE_LOAD::https://raw.githubusercontent.com/restatedev/ai-examples/refs/heads/main/google-adk/tour-of-agents/app/error_handling.py#handle"} +@agent_service.handler() +async def run(ctx: restate.ObjectContext, req: WeatherPrompt) -> str | None: + try: + events = runner.run_async( + user_id=ctx.key(), + session_id=req.session_id, + new_message=Content(role="user", parts=[Part.from_text(text=req.message)]), + ) + return await parse_agent_response(events) + except TerminalError as e: + # Handle the error appropriately, e.g., log it or return a default response + return "Sorry, I'm unable to process your request at the moment." +``` -When agent tools use Restate Context actions like `ctx.run`, Restate automatically retries transient errors in these operations. 
This makes your tools resilient to network failures, database hiccups, and other temporary issues. For all operations that might suffer from transient errors, use Context actions. - -For example, wrapping a tool call in `restate_context().run_typed()` makes it durable with automatic retries: - -```python error_handling.py {"CODE_LOAD::https://raw.githubusercontent.com/restatedev/ai-examples/refs/heads/main/pydantic-ai/tour-of-agents/app/error_handling.py#here"} -async def get_weather(city: WeatherRequest) -> WeatherResponse: - """Get the current weather for a given city.""" - return await restate_context().run_typed( - f"Get weather {city}", fetch_weather, req=city - ) -``` - - -For errors that should not be retried, raise a terminal error: - -```python +```python {"CODE_LOAD::python/src/ai/error_handling.py#terminal"} from restate import TerminalError raise TerminalError("This tool is not allowed to run for this input.") ``` -Restate retries tool executions by default until they succeed. -For errors which should not be retried, raise terminal errors from within your tool implementations. - You can catch these terminal errors in your handler and handle them accordingly: -```python error_handling.py {"CODE_LOAD::https://raw.githubusercontent.com/restatedev/ai-examples/refs/heads/main/pydantic-ai/tour-of-agents/app/error_handling.py#handle"} +```python {"CODE_LOAD::https://raw.githubusercontent.com/restatedev/ai-examples/refs/heads/main/pydantic-ai/tour-of-agents/app/error_handling.py#handle"} @agent_service.handler() async def run(_ctx: restate.Context, req: WeatherPrompt) -> str: try: @@ -336,7 +519,6 @@ async def run(_ctx: restate.Context, req: WeatherPrompt) -> str: return f"The agent couldn't complete the request: {e.message}" return result.output ``` - @@ -376,10 +558,6 @@ except restate.TerminalError as e: -Restate automatically retries transient errors. This makes your tools resilient to network failures, database hiccups, and other temporary issues. 
- -When a tool encounters an unrecoverable error (e.g., resource not found, invalid input, business rule violation), throw a `TerminalError` to stop retries immediately: - ```typescript {"CODE_LOAD::ts/src/tour/agents/terminal_error.ts#terminal_error"} throw new TerminalError("This tool is not allowed to run for this input."); ``` @@ -389,11 +567,7 @@ You can catch and handle terminal errors in your agent logic if needed. -Restate automatically retries transient errors. This makes your tools resilient to network failures, database hiccups, and other temporary issues. - -When a tool encounters an unrecoverable error (e.g., resource not found, invalid input, business rule violation), raise a `TerminalError` to stop retries immediately: - -```python +```python {"CODE_LOAD::python/src/ai/error_handling.py#terminal"} from restate import TerminalError raise TerminalError("This tool is not allowed to run for this input.") @@ -410,4 +584,4 @@ You can catch and handle terminal errors in your agent logic if needed. ## Combining with rollback -For multi-step agent workflows where steps have side effects (bookings, payments, emails), combine error handling with [compensation/rollback patterns](/ai/patterns/rollback) to undo completed work when later steps fail. +For multi-step agent workflows where steps have side effects (bookings, payments, emails), combine terminal errors with [compensation/rollback patterns](/ai/patterns/rollback) to undo completed work before finishing. diff --git a/docs/guides/error-handling.mdx b/docs/guides/error-handling.mdx index 5b897515..4e40e72b 100644 --- a/docs/guides/error-handling.mdx +++ b/docs/guides/error-handling.mdx @@ -4,7 +4,7 @@ description: "Learn how to handle transient and terminal errors in your applicat tags: ["development"] --- -Restate handles retries for failed invocations. By default, Restate infinitely retries all errors with an exponential backoff strategy. +Restate handles retries for failed invocations. 
By default, Restate retries all errors with an exponential backoff strategy. This guide helps you fine-tune the retry behavior for your use cases. @@ -40,7 +40,12 @@ The retry policy can be set on each individual handler, or for all the handlers To configure the retry policy on a service/handler level, check [retry service configuration](/services/configuration#retries). - Via the [`restate-server` configuration file](/server/configuration): + + The default retry policy will retry the invocation a limited number of times, after which the invocation will be paused if no progress can be made. To resume a paused invocation, check the [resume documentation](/services/invocation/managing-invocations#resume). + + Check the [configuration reference](/references/server-config) for the `default-retry-policy`. + + You can change the default behavior via the [`restate-server` configuration file](/server/configuration): ```toml restate.toml [invocation.default-retry-policy] @@ -65,7 +70,6 @@ The retry policy can be set on each individual handler, or for all the handlers RESTATE_DEFAULT_RETRY_POLICY__MAX_INTERVAL="10s" ``` - This retry policy will retry the invocation 100 times, after which the invocation will be paused if no progress can be made. To resume a paused invocation, check the paragraph below. You can also retry forever, without ever pausing or killing the invocation: @@ -73,7 +77,6 @@ The retry policy can be set on each individual handler, or for all the handlers RESTATE_DEFAULT_RETRY_POLICY__MAX_ATTEMPTS=unlimited ``` - Check the [configuration documentation](/server/configuration) and [reference](/references/server-config) for the `default-retry-policy`. When a retry policy is unset, Restate by default will retry indefinitely, like setting `max-attempts = "unlimited"`. @@ -204,7 +207,7 @@ You can also throw `RetryableError` directly in handler code (outside of `ctx.ru ## Application errors (terminal) -By default, Restate infinitely retries all errors. 
+By default, Restate retries all errors. In some cases, you might not want to retry an error (e.g. because of business logic, because the issue is not transient, ...). For these cases you can throw a terminal error. Terminal errors are permanent and are not retried by Restate. diff --git a/snippets/python/src/ai/error_handling.py b/snippets/python/src/ai/error_handling.py new file mode 100644 index 00000000..7122d2de --- /dev/null +++ b/snippets/python/src/ai/error_handling.py @@ -0,0 +1,38 @@ +import restate +from restate import Context, Service +from restate import RunOptions +from restate.ext.pydantic import restate_context + +my_service = Service("MyService") + + +def fetch_data(req: str) -> str: + return "" + + +@my_service.handler("myServiceHandler") +async def my_service_handler(ctx: Context, req: str) -> str: + # + result = await restate_context().run_typed( + "fetch data", + fetch_data, + RunOptions(max_attempts=3), + req=req, + ) + # + + # + result = await ctx.run_typed( + "fetch data", + fetch_data, + RunOptions(max_attempts=3), + req=req, + ) + # + + + # + from restate import TerminalError + + raise TerminalError("This tool is not allowed to run for this input.") + # diff --git a/snippets/ts/src/ai/guides/errorhandling/error_handling.ts b/snippets/ts/src/ai/guides/errorhandling/error_handling.ts new file mode 100644 index 00000000..29af6054 --- /dev/null +++ b/snippets/ts/src/ai/guides/errorhandling/error_handling.ts @@ -0,0 +1,18 @@ +import * as restate from "@restatedev/restate-sdk"; + +export const myAgent = restate.service({ + name: "my-agent", + handlers: { + run: async (ctx: restate.Context, { message }: { message: string }) => { + + // + const result = await ctx.run( + "fetch-data", + () => fetch("/api/data"), + { maxRetryAttempts: 3 } + ); + // + return `${message}!`; + }, + }, +});