From 9d60a9ac3045f6279166117eed154f9f38cc49b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 30 Dec 2024 17:25:07 +0100 Subject: [PATCH 1/4] Make the default retry policy the aggressive one with half the attempts --- docs/use/api.rst | 25 +++++----- tests/test_retry.py | 115 +++++++++++++++----------------------------- zyte_api/_retry.py | 97 ++++++++++++++++++------------------- 3 files changed, 98 insertions(+), 139 deletions(-) diff --git a/docs/use/api.rst b/docs/use/api.rst index 8e3d34e..c6983bf 100644 --- a/docs/use/api.rst +++ b/docs/use/api.rst @@ -148,32 +148,29 @@ retries for :ref:`rate-limiting ` and :ref:`unsuccessful .. _default-retry-policy: The default retry policy, :data:`~zyte_api.zyte_api_retrying`, does the -following: +following for each request: - Retries :ref:`rate-limiting responses ` forever. -- Retries :ref:`temporary download errors - ` up to 3 times. +- Retries :ref:`temporary download errors ` + up to 3 times. :ref:`Permanent download errors + ` also count towards this retry limit. + +- Retries permanent download errors up to 3 times per request. - Retries network errors until they have happened for 15 minutes straight. +- Retries error responses with an HTTP status code in the 500-599 range (503, + 520 and 521 excluded) up to 3 times. + All retries are done with an exponential backoff algorithm. .. _aggressive-retry-policy: If some :ref:`unsuccessful responses ` exceed maximum retries with the default retry policy, try using -:data:`~zyte_api.aggressive_retrying` instead, which modifies the default retry -policy as follows: - -- Temporary download error are retried 7 times. :ref:`Permanent download - errors ` also count towards this retry - limit. - -- Retries permanent download errors up to 3 times. - -- Retries error responses with an HTTP status code in the 500-599 range (503, - 520 and 521 excluded) up to 3 times. +:data:`~zyte_api.aggressive_retrying` instead, which duplicates attempts for +all retry scenarios. Alternatively, the reference documentation of :class:`~zyte_api.RetryFactory` and :class:`~zyte_api.AggressiveRetryFactory` features some examples of custom diff --git a/tests/test_retry.py b/tests/test_retry.py index ef8f2d0..86e0293 100644 --- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -140,6 +140,15 @@ def __init__(self, time): self.time = time +class scale: + + def __init__(self, factor): + self.factor = factor + + def __call__(self, number, add=0): + return int(number * self.factor) + add + + @pytest.mark.parametrize( ("retrying", "outcomes", "exhausted"), ( @@ -237,81 +246,36 @@ def __init__(self, time): ), ) ), - # Behaviors specific to the default retry policy + # Scaled behaviors, where the default retry policy uses half as many + # attempts as the aggressive retry policy. *( - (zyte_api_retrying, outcomes, exhausted) - for outcomes, exhausted in ( - # Temporary download errors are retried until they have - # happened 4 times in total. - ( - (mock_request_error(status=520),) * 3, - False, - ), - ( - (mock_request_error(status=520),) * 4, - True, - ), - ( - ( - *(mock_request_error(status=429),) * 2, - mock_request_error(status=520), - ), - False, - ), - ( - ( - *(mock_request_error(status=429),) * 3, - mock_request_error(status=520), - ), - False, - ), - ( - ( - *( - mock_request_error(status=429), - mock_request_error(status=520), - ) - * 3, - ), - False, - ), - ( - ( - *( - mock_request_error(status=429), - mock_request_error(status=520), - ) - * 4, - ), - True, - ), + (retrying, outcomes, exhausted) + for retrying, scaled in ( + (zyte_api_retrying, scale(0.5)), + (aggressive_retrying, scale(1)), ) - ), - # Behaviors specific to the aggressive retry policy - *( - (aggressive_retrying, outcomes, exhausted) for outcomes, exhausted in ( # Temporary download errors are retried until they have - # happened 8 times in total. Permanent download errors also - # count towards that limit. + # happened 8*factor times in total. Permanent download errors + # also count towards that limit. ( - (mock_request_error(status=520),) * 7, + (mock_request_error(status=520),) * scaled(8, -1), False, ), ( - (mock_request_error(status=520),) * 8, + (mock_request_error(status=520),) * scaled(8), True, ), ( ( - *(mock_request_error(status=429),) * 6, + *(mock_request_error(status=429),) * scaled(8, -2), mock_request_error(status=520), ), False, ), ( ( - *(mock_request_error(status=429),) * 7, + *(mock_request_error(status=429),) * scaled(8, -1), mock_request_error(status=520), ), False, @@ -322,7 +286,7 @@ def __init__(self, time): mock_request_error(status=429), mock_request_error(status=520), ) - * 7, + * scaled(8, -1), ), False, ), @@ -332,13 +296,13 @@ def __init__(self, time): mock_request_error(status=429), mock_request_error(status=520), ) - * 8, + * scaled(8), ), True, ), ( ( - *(mock_request_error(status=520),) * 5, + *(mock_request_error(status=520),) * scaled(8, -3), *(mock_request_error(status=521),) * 1, *(mock_request_error(status=520),) * 1, ), @@ -346,7 +310,7 @@ def __init__(self, time): ), ( ( - *(mock_request_error(status=520),) * 6, + *(mock_request_error(status=520),) * scaled(8, -2), *(mock_request_error(status=521),) * 1, *(mock_request_error(status=520),) * 1, ), @@ -354,29 +318,30 @@ def __init__(self, time): ), ( ( - *(mock_request_error(status=520),) * 6, + *(mock_request_error(status=520),) * scaled(8, -2), *(mock_request_error(status=521),) * 1, ), False, ), ( ( - *(mock_request_error(status=520),) * 7, + *(mock_request_error(status=520),) * scaled(8, -1), *(mock_request_error(status=521),) * 1, ), True, ), # Permanent download errors are retried until they have - # happened 4 times in total. + # happened 4*factor times in total. ( - (*(mock_request_error(status=521),) * 3,), + (*(mock_request_error(status=521),) * scaled(4, -1),), False, ), ( - (*(mock_request_error(status=521),) * 4,), + (*(mock_request_error(status=521),) * scaled(4),), True, ), - # Undocumented 5xx errors are retried up to 3 times. + # Undocumented 5xx errors are retried until they have happened + # 4*factor times. *( scenario for status in ( @@ -386,16 +351,16 @@ def __init__(self, time): ) for scenario in ( ( - (*(mock_request_error(status=status),) * 3,), + (*(mock_request_error(status=status),) * scaled(4, -1),), False, ), ( - (*(mock_request_error(status=status),) * 4,), + (*(mock_request_error(status=status),) * scaled(4),), True, ), ( ( - *(mock_request_error(status=status),) * 2, + *(mock_request_error(status=status),) * scaled(4, -2), mock_request_error(status=429), mock_request_error(status=503), ServerConnectionError(), @@ -405,7 +370,7 @@ def __init__(self, time): ), ( ( - *(mock_request_error(status=status),) * 3, + *(mock_request_error(status=status),) * scaled(4, -1), mock_request_error(status=429), mock_request_error(status=503), ServerConnectionError(), @@ -415,17 +380,15 @@ def __init__(self, time): ), ( ( - mock_request_error(status=status), mock_request_error(status=555), - mock_request_error(status=status), + *(mock_request_error(status=status),) * scaled(4, -2), ), False, ), ( ( - mock_request_error(status=status), mock_request_error(status=555), - *(mock_request_error(status=status),) * 2, + *(mock_request_error(status=status),) * scaled(4, -1), ), True, ), @@ -464,7 +427,7 @@ async def run(): try: await run() except Exception as outcome: - assert exhausted + assert exhausted, outcome assert outcome is last_outcome else: assert not exhausted diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index 3b01c0e..90bdcad 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -54,10 +54,6 @@ def _is_throttling_error(exc: BaseException) -> bool: return isinstance(exc, RequestError) and exc.status in (429, 503) -def _is_temporary_download_error(exc: BaseException) -> bool: - return isinstance(exc, RequestError) and exc.status == 520 - - class stop_on_count(stop_base): """Keep a call count with the specified counter name, and stop after the specified number os calls. @@ -128,6 +124,42 @@ def __call__(self, retry_state: "RetryCallState") -> bool: return True +class stop_on_download_error(stop_base): + """Stop after the specified max numbers of total or permanent download + errors.""" + + def __init__(self, max_total: int, max_permanent: int) -> None: + self._max_total = max_total + self._max_permanent = max_permanent + + def __call__(self, retry_state: "RetryCallState") -> bool: + if not hasattr(retry_state, "counter"): + retry_state.counter = Counter() # type: ignore + assert retry_state.outcome, "Unexpected empty outcome" + exc = retry_state.outcome.exception() + assert exc, "Unexpected empty exception" + if exc.status == 521: # type: ignore + retry_state.counter["permanent_download_error"] += 1 # type: ignore + if retry_state.counter["permanent_download_error"] >= self._max_permanent: # type: ignore + return True + retry_state.counter["download_error"] += 1 # type: ignore + if retry_state.counter["download_error"] >= self._max_total: # type: ignore + return True + return False + + +def _download_error(exc: BaseException) -> bool: + return isinstance(exc, RequestError) and exc.status in {520, 521} + + +def _undocumented_error(exc: BaseException) -> bool: + return ( + isinstance(exc, RequestError) + and exc.status >= 500 + and exc.status not in {503, 520, 521} + ) + + class RetryFactory: """Factory class that builds the :class:`tenacity.AsyncRetrying` object that defines the :ref:`default retry policy `. @@ -160,7 +192,8 @@ class CustomRetryFactory(RetryFactory): retry_condition: retry_base = ( retry_if_exception(_is_throttling_error) | retry_if_exception(_is_network_error) - | retry_if_exception(_is_temporary_download_error) + | retry_if_exception(_download_error) + | retry_if_exception(_undocumented_error) ) # throttling throttling_wait = wait_chain( @@ -182,7 +215,10 @@ class CustomRetryFactory(RetryFactory): temporary_download_error_wait = network_error_wait throttling_stop = stop_never network_error_stop = stop_after_uninterrupted_delay(15 * 60) - temporary_download_error_stop = stop_on_count(4) + temporary_download_error_stop = stop_on_download_error(max_total=4, max_permanent=2) + + undocumented_error_stop = stop_on_count(2) + undocumented_error_wait = network_error_wait def wait(self, retry_state: RetryCallState) -> float: assert retry_state.outcome, "Unexpected empty outcome" @@ -192,7 +228,9 @@ def wait(self, retry_state: RetryCallState) -> float: return self.throttling_wait(retry_state=retry_state) if _is_network_error(exc): return self.network_error_wait(retry_state=retry_state) - assert _is_temporary_download_error(exc) # See retry_condition + if _undocumented_error(exc): + return self.undocumented_error_wait(retry_state=retry_state) + assert _download_error(exc) # See retry_condition return self.temporary_download_error_wait(retry_state=retry_state) def stop(self, retry_state: RetryCallState) -> bool: @@ -203,7 +241,9 @@ def stop(self, retry_state: RetryCallState) -> bool: return self.throttling_stop(retry_state) if _is_network_error(exc): return self.network_error_stop(retry_state) - assert _is_temporary_download_error(exc) # See retry_condition + if _undocumented_error(exc): + return self.undocumented_error_stop(retry_state) + assert _download_error(exc) # See retry_condition return self.temporary_download_error_stop(retry_state) def reraise(self) -> bool: @@ -224,42 +264,6 @@ def build(self) -> AsyncRetrying: zyte_api_retrying: AsyncRetrying = RetryFactory().build() -def _download_error(exc: BaseException) -> bool: - return isinstance(exc, RequestError) and exc.status in {520, 521} - - -def _undocumented_error(exc: BaseException) -> bool: - return ( - isinstance(exc, RequestError) - and exc.status >= 500 - and exc.status not in {503, 520, 521} - ) - - -class stop_on_download_error(stop_base): - """Stop after the specified max numbers of total or permanent download - errors.""" - - def __init__(self, max_total: int, max_permanent: int) -> None: - self._max_total = max_total - self._max_permanent = max_permanent - - def __call__(self, retry_state: "RetryCallState") -> bool: - if not hasattr(retry_state, "counter"): - retry_state.counter = Counter() # type: ignore - assert retry_state.outcome, "Unexpected empty outcome" - exc = retry_state.outcome.exception() - assert exc, "Unexpected empty exception" - if exc.status == 521: # type: ignore - retry_state.counter["permanent_download_error"] += 1 # type: ignore - if retry_state.counter["permanent_download_error"] >= self._max_permanent: # type: ignore - return True - retry_state.counter["download_error"] += 1 # type: ignore - if retry_state.counter["download_error"] >= self._max_total: # type: ignore - return True - return False - - class AggressiveRetryFactory(RetryFactory): """Factory class that builds the :class:`tenacity.AsyncRetrying` object that defines the :ref:`aggressive retry policy `. @@ -300,7 +304,6 @@ class CustomRetryFactory(AggressiveRetryFactory): download_error_wait = RetryFactory.temporary_download_error_wait undocumented_error_stop = stop_on_count(4) - undocumented_error_wait = RetryFactory.temporary_download_error_wait def stop(self, retry_state: RetryCallState) -> bool: assert retry_state.outcome, "Unexpected empty outcome" @@ -308,8 +311,6 @@ def stop(self, retry_state: RetryCallState) -> bool: assert exc, "Unexpected empty exception" if _download_error(exc): return self.download_error_stop(retry_state) - if _undocumented_error(exc): - return self.undocumented_error_stop(retry_state) return super().stop(retry_state) def wait(self, retry_state: RetryCallState) -> float: @@ -318,8 +319,6 @@ def wait(self, retry_state: RetryCallState) -> float: assert exc, "Unexpected empty exception" if _download_error(exc): return self.download_error_wait(retry_state) - if _undocumented_error(exc): - return self.undocumented_error_wait(retry_state=retry_state) return super().wait(retry_state) From d052ed144eb630ad9e39826613a83982a22e1d84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 3 Jan 2025 15:51:31 +0100 Subject: [PATCH 2/4] Initial stab at circuit break for undocumented erors --- docs/use/api.rst | 3 ++ pyproject.toml | 3 ++ tests/mockserver.py | 9 +++-- tests/test_async.py | 24 ++++++++++- tests/test_main.py | 16 +++----- tests/test_retry.py | 95 +++++++++++++++++++++++++++++++++++++++++++- zyte_api/__init__.py | 2 +- zyte_api/__main__.py | 7 ++-- zyte_api/_async.py | 10 ++++- zyte_api/_errors.py | 4 +- zyte_api/_retry.py | 77 ++++++++++++++++++++++++++++++++--- zyte_api/stats.py | 6 +-- 12 files changed, 223 insertions(+), 33 deletions(-) diff --git a/docs/use/api.rst b/docs/use/api.rst index c6983bf..c9419bd 100644 --- a/docs/use/api.rst +++ b/docs/use/api.rst @@ -163,6 +163,9 @@ following for each request: - Retries error responses with an HTTP status code in the 500-599 range (503, 520 and 521 excluded) up to 3 times. +- Disallows new requests if undocumented error responses are more than 10 + *and* more than 1% of all responses. + All retries are done with an exponential backoff algorithm. .. _aggressive-retry-policy: diff --git a/pyproject.toml b/pyproject.toml index c6b28cc..e882810 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,3 +4,6 @@ multi_line_output = 3 [tool.black] target-version = ["py39", "py310", "py311", "py312", "py313"] + +[tool.mypy] +check_untyped_defs = true diff --git a/tests/mockserver.py b/tests/mockserver.py index 023b72f..ce84138 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -6,10 +6,12 @@ from base64 import b64encode from importlib import import_module from subprocess import PIPE, Popen -from typing import Any, Dict +from typing import Any, Dict, cast from urllib.parse import urlparse from twisted.internet import reactor +from twisted.internet.defer import Deferred +from twisted.internet.interfaces import IReactorTime from twisted.internet.task import deferLater from twisted.web.resource import Resource from twisted.web.server import NOT_DONE_YET, Site @@ -40,7 +42,7 @@ def _cancelrequest(_): d.addErrback(lambda _: None) d.cancel() - d = deferLater(reactor, delay, f, *a, **kw) + d: Deferred = deferLater(cast(IReactorTime, reactor), delay, f, *a, **kw) request.notifyFinish().addErrback(_cancelrequest) return d @@ -82,6 +84,7 @@ def render_POST(self, request): url = request_data["url"] domain = urlparse(url).netloc + response_data: Dict[str, Any] if domain == "e429.example": request.setResponseCode(429) response_data = {"status": 429, "type": "/limits/over-user-limit"} @@ -119,7 +122,7 @@ def render_POST(self, request): request.setResponseCode(500) return b'["foo"]' - response_data: Dict[str, Any] = { + response_data = { "url": url, } diff --git a/tests/test_async.py b/tests/test_async.py index 3f33ce7..771c0f1 100644 --- a/tests/test_async.py +++ b/tests/test_async.py @@ -3,7 +3,13 @@ import pytest -from zyte_api import AggressiveRetryFactory, AsyncZyteAPI, RequestError +from zyte_api import ( + AggressiveRetryFactory, + AsyncZyteAPI, + RequestError, + TooManyUndocumentedErrors, +) +from zyte_api._retry import ZyteAsyncRetrying from zyte_api.aio.client import AsyncClient from zyte_api.apikey import NoApiKey from zyte_api.errors import ParsedError @@ -318,4 +324,18 @@ def test_retrying_class(): """A descriptive exception is raised when creating a client with an AsyncRetrying subclass or similar instead of an instance of it.""" with pytest.raises(ValueError): - AsyncZyteAPI(api_key="foo", retrying=AggressiveRetryFactory) + AsyncZyteAPI(api_key="foo", retrying=AggressiveRetryFactory) # type: ignore[arg-type] + + +@pytest.mark.asyncio +async def test_too_many_undocumented_errors(mockserver): + ZyteAsyncRetrying._total_outcomes = 9 + ZyteAsyncRetrying._total_undocumented_errors = 9 + + client = AsyncZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) + + await client.get({"url": "https://a.example", "httpResponseBody": True}) + with pytest.raises(TooManyUndocumentedErrors): + await client.get({"url": "https://e500.example", "httpResponseBody": True}) + with pytest.raises(TooManyUndocumentedErrors): + await client.get({"url": "https://a.example", "httpResponseBody": True}) diff --git a/tests/test_main.py b/tests/test_main.py index 7e3c8d8..d8c4df2 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -11,15 +11,6 @@ from zyte_api.aio.errors import RequestError -class MockRequestError(Exception): - @property - def parsed(self): - mock = Mock( - response_body=Mock(decode=Mock(return_value=forbidden_domain_response())) - ) - return mock - - def get_json_content(file_object): if not file_object: return @@ -53,7 +44,12 @@ def forbidden_domain_response(): async def fake_exception(value=True): # Simulating an error condition if value: - raise MockRequestError() + raise RequestError( + query={"url": "https://example.com", "httpResponseBody": True}, + response_content=json.dumps(forbidden_domain_response()).encode(), + request_info=None, + history=None, + ) create_session_mock = AsyncMock() return await create_session_mock.coroutine() diff --git a/tests/test_retry.py b/tests/test_retry.py index 86e0293..a86fc9b 100644 --- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -11,13 +11,20 @@ AsyncZyteAPI, RequestError, RetryFactory, + TooManyUndocumentedErrors, aggressive_retrying, zyte_api_retrying, ) +from zyte_api._retry import ZyteAsyncRetrying from .mockserver import DropResource, MockServer +def reset_totals(): + ZyteAsyncRetrying._total_outcomes = 0 + ZyteAsyncRetrying._total_undocumented_errors = 0 + + def test_deprecated_imports(): from zyte_api import RetryFactory, zyte_api_retrying from zyte_api.aio.retry import RetryFactory as DeprecatedRetryFactory @@ -74,10 +81,11 @@ def broken_stop(_): ) @pytest.mark.asyncio async def test_retry_wait(retry_factory, status, waiter, mockserver): + def broken_wait(self, retry_state): raise OutlierException - class CustomRetryFactory(retry_factory): + class CustomRetryFactory(retry_factory): # type: ignore[valid-type, misc] pass setattr(CustomRetryFactory, f"{waiter}_wait", broken_wait) @@ -105,7 +113,7 @@ async def test_retry_wait_network_error(retry_factory): def broken_wait(self, retry_state): raise OutlierException - class CustomRetryFactory(retry_factory): + class CustomRetryFactory(retry_factory): # type: ignore[valid-type, misc] pass setattr(CustomRetryFactory, f"{waiter}_wait", broken_wait) @@ -401,6 +409,7 @@ def __call__(self, number, add=0): @pytest.mark.asyncio @patch("time.monotonic") async def test_retry_stop(monotonic_mock, retrying, outcomes, exhausted): + reset_totals() monotonic_mock.return_value = 0 last_outcome = outcomes[-1] outcomes = deque(outcomes) @@ -431,3 +440,85 @@ async def run(): assert outcome is last_outcome else: assert not exhausted + + +mock_good_response = object() + + +@pytest.mark.parametrize( + ("retrying", "outcome_sequences", "exhausted"), + ( + # A ZyteAPIError exception is raised when, of all responses, + # undocumented 5xx responses are at least 10 and at least 1%. + # + # 9, 100%: + ( + zyte_api_retrying, + ((mock_request_error(status=500),),) * 9, + False, + ), + # 10, 100%: + ( + zyte_api_retrying, + ((mock_request_error(status=500),),) * 10, + True, + ), + # 10, <1%: + ( + zyte_api_retrying, + ((mock_request_error(status=500),),) * 9 # 9 / 18 (50%) + + ((mock_good_response,),) * (982) # + 0 / 982 = 9 / 1000 (0.9%) + + ((mock_request_error(status=500),),) * 1, # + 1 / 1 = 10 / 1001 (0.999…%) + False, + ), + # 10, ≥1%: + ( + zyte_api_retrying, + ((mock_request_error(status=500),),) * 9 # 9 / 18 (50%) + + ((mock_good_response,),) * (981) # + 0 / 981 = 9 / 999 (0.9%) + + ((mock_request_error(status=500),),) * 1, # + 1 / 1 = 10 / 1000 (1%) + True, + ), + ), +) +@pytest.mark.asyncio +@patch("time.monotonic") +async def test_retry_stop_global_parallel( + monotonic_mock, retrying, outcome_sequences, exhausted +): + reset_totals() + monotonic_mock.return_value = 0 + last_outcome = outcome_sequences[-1][-1] + outcome_sequences = tuple(deque(outcomes) for outcomes in outcome_sequences) + + def wait(retry_state): + return 0.0 + + retrying = copy(retrying) + retrying.wait = wait + + async def run(outcomes): + while True: + try: + outcome = outcomes.popleft() + except IndexError: + return + else: + if isinstance(outcome, fast_forward): + monotonic_mock.return_value += outcome.time + continue + if outcome is mock_good_response: + continue + raise outcome + + run = retrying.wraps(run) + + try: + for outcomes in outcome_sequences: + await run(outcomes) + except Exception as exc: + assert exhausted, exc + assert isinstance(exc, TooManyUndocumentedErrors) + assert exc.outcome is last_outcome + else: + assert not exhausted diff --git a/zyte_api/__init__.py b/zyte_api/__init__.py index 1f97fd2..0e9b55b 100644 --- a/zyte_api/__init__.py +++ b/zyte_api/__init__.py @@ -4,7 +4,7 @@ from ._async import AsyncZyteAPI from ._errors import RequestError -from ._retry import AggressiveRetryFactory, RetryFactory +from ._retry import AggressiveRetryFactory, RetryFactory, TooManyUndocumentedErrors from ._retry import aggressive_retrying as _aggressive_retrying from ._retry import ( stop_after_uninterrupted_delay, diff --git a/zyte_api/__main__.py b/zyte_api/__main__.py index 776f9c1..94bab28 100644 --- a/zyte_api/__main__.py +++ b/zyte_api/__main__.py @@ -11,6 +11,7 @@ import tqdm from tenacity import retry_if_exception +from zyte_api import RequestError from zyte_api._async import AsyncZyteAPI from zyte_api._retry import RetryFactory, _is_throttling_error from zyte_api._utils import create_session @@ -71,13 +72,13 @@ def write_output(content): try: result = await fut except Exception as e: - if store_errors: - write_output(e.parsed.response_body.decode()) + if store_errors and isinstance(e, RequestError): + write_output(e.parsed.data) if stop_on_errors: raise - logger.error(str(e)) + logger.exception("Exception raised during response handling") else: write_output(result) finally: diff --git a/zyte_api/_async.py b/zyte_api/_async.py index afa20ff..f6f30e6 100644 --- a/zyte_api/_async.py +++ b/zyte_api/_async.py @@ -10,7 +10,7 @@ from tenacity import AsyncRetrying from ._errors import RequestError -from ._retry import zyte_api_retrying +from ._retry import TooManyUndocumentedErrors, zyte_api_retrying from ._utils import _AIO_API_TIMEOUT, create_session from .apikey import get_apikey from .constants import API_URL @@ -103,6 +103,7 @@ def __init__( self.retrying = retrying or zyte_api_retrying self.user_agent = user_agent or USER_AGENT self._semaphore = asyncio.Semaphore(n_conn) + self._disabling_exception: TooManyUndocumentedErrors | None = None async def get( self, @@ -114,6 +115,9 @@ async def get( retrying: Optional[AsyncRetrying] = None, ) -> _ResponseFuture: """Asynchronous equivalent to :meth:`ZyteAPI.get`.""" + if self._disabling_exception is not None: + raise self._disabling_exception + retrying = retrying or self.retrying post = _post_func(session) auth = aiohttp.BasicAuth(self.api_key) @@ -172,7 +176,9 @@ async def request(): # Try to make a request result = await request() self.agg_stats.n_success += 1 - except Exception: + except Exception as exc: + if isinstance(exc, TooManyUndocumentedErrors): + self._disabling_exception = exc self.agg_stats.n_fatal_errors += 1 raise diff --git a/zyte_api/_errors.py b/zyte_api/_errors.py index 6476c39..13e33be 100644 --- a/zyte_api/_errors.py +++ b/zyte_api/_errors.py @@ -31,11 +31,11 @@ def __init__(self, *args, **kwargs): @property def parsed(self): """Response as a :class:`ParsedError` object.""" - return ParsedError.from_body(self.response_content) + return ParsedError.from_body(self.response_content or b"") def __str__(self): return ( f"RequestError: {self.status}, message={self.message}, " - f"headers={self.headers}, body={self.response_content}, " + f"headers={self.headers}, body={self.response_content!r}, " f"request_id={self.request_id}" ) diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index 90bdcad..ad8156e 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -1,13 +1,17 @@ +from __future__ import annotations + import asyncio import logging from collections import Counter from datetime import timedelta from itertools import count -from typing import Union +from typing import TYPE_CHECKING, Any, Union from aiohttp import client_exceptions from tenacity import ( AsyncRetrying, + DoAttempt, + DoSleep, RetryCallState, after_log, before_log, @@ -23,6 +27,12 @@ from ._errors import RequestError +if TYPE_CHECKING: + from tenacity import RetryBaseT as SyncRetryBaseT + from tenacity.asyncio import RetryBaseT + from tenacity.stop import StopBaseT + from tenacity.wait import WaitBaseT + logger = logging.getLogger(__name__) _IDS = count() @@ -160,6 +170,66 @@ def _undocumented_error(exc: BaseException) -> bool: ) +class TooManyUndocumentedErrors(RuntimeError): + def __init__(self, outcome, errors, total): + msg = ( + f"Too many undocumented error responses received from Zyte API " + f"({errors} out of {total}, {errors / total:.2%}). This process " + f"will no longer be able to send Zyte API requests. Please, " + f"monitor https://status.zyte.com/ or contact support " + f"(https://support.zyte.com/support/tickets/new) before sending " + f"more requests like the ones causing these error responses.\n" + f"Last offending query: {outcome.query}\n" + f"Last offending response: {outcome}" + ) + self.outcome = outcome + super().__init__(msg) + + +class ZyteAsyncRetrying(AsyncRetrying): + _total_outcomes = 0 + _total_undocumented_errors = 0 + + def __init__( + self, + stop: "StopBaseT", + wait: "WaitBaseT", + retry: "SyncRetryBaseT | RetryBaseT", + reraise: bool, + **kwargs, + ): + kwargs.setdefault("before", before_log(logger, logging.DEBUG)) + kwargs.setdefault("after", after_log(logger, logging.DEBUG)) + kwargs.setdefault("before_sleep", before_sleep_log(logger, logging.DEBUG)) + super().__init__( + stop=stop, + wait=wait, + retry=retry, + reraise=reraise, + **kwargs, + ) + + async def iter(self, retry_state: RetryCallState) -> DoAttempt | DoSleep | Any: + do = await super().iter(retry_state) + retry_cls = retry_state.retry_object.__class__ + if retry_state.outcome is not None: + retry_cls._total_outcomes += 1 # type: ignore[attr-defined] + try: + retry_state.outcome.result() + except Exception as exc: + if _undocumented_error(exc): + retry_cls._total_undocumented_errors += 1 # type: ignore[attr-defined] + errors = retry_cls._total_undocumented_errors # type: ignore[attr-defined] + total = retry_cls._total_outcomes # type: ignore[attr-defined] + if errors >= 10 and errors / total >= 0.01: + raise TooManyUndocumentedErrors( + outcome=exc, + errors=errors, # type: ignore[attr-defined] + total=total, # type: ignore[attr-defined] + ) + return do + + class RetryFactory: """Factory class that builds the :class:`tenacity.AsyncRetrying` object that defines the :ref:`default retry policy `. @@ -250,14 +320,11 @@ def reraise(self) -> bool: return True def build(self) -> AsyncRetrying: - return AsyncRetrying( + return ZyteAsyncRetrying( wait=self.wait, retry=self.retry_condition, stop=self.stop, reraise=self.reraise(), - before=before_log(logger, logging.DEBUG), - after=after_log(logger, logging.DEBUG), - before_sleep=before_sleep_log(logger, logging.DEBUG), ) diff --git a/zyte_api/stats.py b/zyte_api/stats.py index 42c7b6a..6b23107 100644 --- a/zyte_api/stats.py +++ b/zyte_api/stats.py @@ -37,9 +37,9 @@ def __init__(self): self.n_429 = 0 # number of 429 (throttling) responses self.n_errors = 0 # number of errors, including errors which were retried - self.status_codes = Counter() - self.exception_types = Counter() - self.api_error_types = Counter() + self.status_codes: Counter = Counter() + self.exception_types: Counter = Counter() + self.api_error_types: Counter = Counter() def __str__(self): return "conn:{:0.2f}s, resp:{:0.2f}s, throttle:{:.1%}, err:{}+{}({:.1%}) | success:{}/{}({:.1%})".format( From 2d2c56f3d6e780c808f454ed41f11861b7b11a39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 3 Jan 2025 16:16:39 +0100 Subject: [PATCH 3/4] Remove unnecessary comments --- zyte_api/_retry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index ad8156e..5ccee03 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -224,8 +224,8 @@ async def iter(self, retry_state: RetryCallState) -> DoAttempt | DoSleep | Any: if errors >= 10 and errors / total >= 0.01: raise TooManyUndocumentedErrors( outcome=exc, - errors=errors, # type: ignore[attr-defined] - total=total, # type: ignore[attr-defined] + errors=errors, + total=total, ) return do From 6394d94db481146637e8220f5d4595b926ae54b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 10 Jan 2025 15:01:02 +0100 Subject: [PATCH 4/4] unquote type hints --- zyte_api/_retry.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index 5ccee03..d7cd5ce 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -192,9 +192,9 @@ class ZyteAsyncRetrying(AsyncRetrying): def __init__( self, - stop: "StopBaseT", - wait: "WaitBaseT", - retry: "SyncRetryBaseT | RetryBaseT", + stop: StopBaseT, + wait: WaitBaseT, + retry: SyncRetryBaseT | RetryBaseT, reraise: bool, **kwargs, ):