From 1307e8b2d4f7fafa5189ea88cb330375cfc2773e Mon Sep 17 00:00:00 2001 From: Rico Furtado Date: Mon, 25 May 2026 18:03:27 -0400 Subject: [PATCH 1/3] fix: enhance error handling for Google Drive file downloads with SSL retries --- src/connectors/google_drive/connector.py | 72 +++++++++++++++--------- 1 file changed, 44 insertions(+), 28 deletions(-) diff --git a/src/connectors/google_drive/connector.py b/src/connectors/google_drive/connector.py index a656730a1..944bd9192 100644 --- a/src/connectors/google_drive/connector.py +++ b/src/connectors/google_drive/connector.py @@ -1,6 +1,7 @@ import asyncio import io import os +import ssl import time from collections import deque from collections.abc import Iterable @@ -509,34 +510,49 @@ def _download_file_bytes(self, file_meta: dict[str, Any]) -> bytes: # Binary download (get_media also doesn't accept the Drive flags) request = self.service.files().get_media(fileId=file_id) - # Download the file with error handling for misclassified Google Docs - fh = io.BytesIO() - downloader = MediaIoBaseDownload(fh, request, chunksize=1024 * 1024) - done = False - - try: - while not done: - status, done = downloader.next_chunk() - # Optional: you can log progress via status.progress() - except HttpError as e: - # If download fails with "fileNotDownloadable", it's a Docs Editor file - # that wasn't properly detected. Retry with export_media. - if "fileNotDownloadable" in str(e) and mime_type not in exportable_types: - logger.warning( - f"Download failed for {file_id} ({mime_type}) with fileNotDownloadable error. " - f"Retrying with export_media (file might be a Google Doc)" - ) - export_mime = "application/pdf" - request = self.service.files().export_media( - fileId=file_id, mimeType=export_mime - ) - fh = io.BytesIO() - downloader = MediaIoBaseDownload(fh, request, chunksize=1024 * 1024) - done = False - while not done: - status, done = downloader.next_chunk() - else: - raise + # Download the file with error handling for misclassified Google Docs and + # transient SSL/network errors. Each SSL retry recreates the downloader + # from scratch to avoid partial-buffer state. + _max_retries = 3 + for _attempt in range(_max_retries): + fh = io.BytesIO() + downloader = MediaIoBaseDownload(fh, request, chunksize=1024 * 1024) + done = False + try: + try: + while not done: + status, done = downloader.next_chunk() + except HttpError as e: + # If download fails with "fileNotDownloadable", it's a Docs Editor file + # that wasn't properly detected. Retry with export_media. + if "fileNotDownloadable" in str(e) and mime_type not in exportable_types: + logger.warning( + f"Download failed for {file_id} ({mime_type}) with fileNotDownloadable error. " + f"Retrying with export_media (file might be a Google Doc)" + ) + export_mime = "application/pdf" + request = self.service.files().export_media( + fileId=file_id, mimeType=export_mime + ) + fh = io.BytesIO() + downloader = MediaIoBaseDownload(fh, request, chunksize=1024 * 1024) + done = False + while not done: + status, done = downloader.next_chunk() + else: + raise + break # download succeeded + except (ssl.SSLError, ConnectionResetError, TimeoutError) as e: + if _attempt < _max_retries - 1: + _delay = 2.0 ** _attempt + logger.warning( + "[GoogleDrive] Transient network error for %s (attempt %d/%d), " + "retrying in %.1fs: %s", + file_id, _attempt + 1, _max_retries, _delay, e, + ) + time.sleep(_delay) + else: + raise data = fh.getvalue() logger.debug("[GoogleDrive] _download_file_bytes: done, %d bytes", len(data)) From c8f254373770a30ed4aebc23eb048f24deeaeb2c Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Mon, 25 May 2026 22:07:24 +0000 Subject: [PATCH 2/3] style: ruff autofix (auto) --- src/connectors/google_drive/connector.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/connectors/google_drive/connector.py b/src/connectors/google_drive/connector.py index 944bd9192..9c7ab77ac 100644 --- a/src/connectors/google_drive/connector.py +++ b/src/connectors/google_drive/connector.py @@ -544,11 +544,15 @@ def _download_file_bytes(self, file_meta: dict[str, Any]) -> bytes: break # download succeeded except (ssl.SSLError, ConnectionResetError, TimeoutError) as e: if _attempt < _max_retries - 1: - _delay = 2.0 ** _attempt + _delay = 2.0**_attempt logger.warning( "[GoogleDrive] Transient network error for %s (attempt %d/%d), " "retrying in %.1fs: %s", - file_id, _attempt + 1, _max_retries, _delay, e, + file_id, + _attempt + 1, + _max_retries, + _delay, + e, ) time.sleep(_delay) else: From 8507b4a425b3d708dd53d1064b77d42b5c6aca22 Mon Sep 17 00:00:00 2001 From: Rico Furtado Date: Mon, 25 May 2026 23:17:35 -0400 Subject: [PATCH 3/3] fix: improve error handling and retry logic for Google Drive file downloads --- src/connectors/google_drive/connector.py | 91 +++++++++++++----------- 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/src/connectors/google_drive/connector.py b/src/connectors/google_drive/connector.py index 9c7ab77ac..50c57d932 100644 --- a/src/connectors/google_drive/connector.py +++ b/src/connectors/google_drive/connector.py @@ -510,53 +510,62 @@ def _download_file_bytes(self, file_meta: dict[str, Any]) -> bytes: # Binary download (get_media also doesn't accept the Drive flags) request = self.service.files().get_media(fileId=file_id) - # Download the file with error handling for misclassified Google Docs and - # transient SSL/network errors. Each SSL retry recreates the downloader - # from scratch to avoid partial-buffer state. - _max_retries = 3 - for _attempt in range(_max_retries): - fh = io.BytesIO() - downloader = MediaIoBaseDownload(fh, request, chunksize=1024 * 1024) - done = False + # Download the file with error handling for misclassified Google Docs and + # transient SSL/network errors. Each SSL retry recreates the downloader + # from scratch to avoid partial-buffer state. + _max_retries = 3 + for _attempt in range(_max_retries): + fh = io.BytesIO() + downloader = MediaIoBaseDownload(fh, request, chunksize=1024 * 1024) + done = False + _should_retry = False + _delay = 0.0 + _err: Exception | None = None + try: try: - try: - while not done: - status, done = downloader.next_chunk() - except HttpError as e: - # If download fails with "fileNotDownloadable", it's a Docs Editor file - # that wasn't properly detected. Retry with export_media. - if "fileNotDownloadable" in str(e) and mime_type not in exportable_types: - logger.warning( - f"Download failed for {file_id} ({mime_type}) with fileNotDownloadable error. " - f"Retrying with export_media (file might be a Google Doc)" - ) - export_mime = "application/pdf" + while not done: + status, done = downloader.next_chunk() + except HttpError as e: + # If download fails with "fileNotDownloadable", it's a Docs Editor file + # that wasn't properly detected. Retry with export_media. + if "fileNotDownloadable" in str(e) and mime_type not in exportable_types: + logger.warning( + f"Download failed for {file_id} ({mime_type}) with fileNotDownloadable error. " + f"Retrying with export_media (file might be a Google Doc)" + ) + export_mime = "application/pdf" + with self._lock: request = self.service.files().export_media( fileId=file_id, mimeType=export_mime ) - fh = io.BytesIO() - downloader = MediaIoBaseDownload(fh, request, chunksize=1024 * 1024) - done = False - while not done: - status, done = downloader.next_chunk() - else: - raise - break # download succeeded - except (ssl.SSLError, ConnectionResetError, TimeoutError) as e: - if _attempt < _max_retries - 1: - _delay = 2.0**_attempt - logger.warning( - "[GoogleDrive] Transient network error for %s (attempt %d/%d), " - "retrying in %.1fs: %s", - file_id, - _attempt + 1, - _max_retries, - _delay, - e, - ) - time.sleep(_delay) + fh = io.BytesIO() + downloader = MediaIoBaseDownload(fh, request, chunksize=1024 * 1024) + done = False + while not done: + status, done = downloader.next_chunk() else: raise + break # download succeeded + except (ssl.SSLError, ConnectionResetError, TimeoutError) as e: + _should_retry = _attempt < _max_retries - 1 + _delay = 2.0**_attempt + _err = e + + if _should_retry and _err is not None: + logger.warning( + "[GoogleDrive] Transient network error for %s (attempt %d/%d), " + "retrying in %.1fs: %s", + file_id, + _attempt + 1, + _max_retries, + _delay, + _err, + ) + time.sleep(_delay) + continue + + if _err is not None: + raise _err data = fh.getvalue() logger.debug("[GoogleDrive] _download_file_bytes: done, %d bytes", len(data))