Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 51 additions & 22 deletions src/connectors/google_drive/connector.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import asyncio
import io
import os
import ssl
import time
from collections import deque
from collections.abc import Iterable
Expand Down Expand Up @@ -509,34 +510,62 @@ def _download_file_bytes(self, file_meta: dict[str, Any]) -> bytes:
# Binary download (get_media also doesn't accept the Drive flags)
request = self.service.files().get_media(fileId=file_id)

# Download the file with error handling for misclassified Google Docs
# Download the file with error handling for misclassified Google Docs and
# transient SSL/network errors. Each SSL retry recreates the downloader
# from scratch to avoid partial-buffer state.
_max_retries = 3
for _attempt in range(_max_retries):
fh = io.BytesIO()
downloader = MediaIoBaseDownload(fh, request, chunksize=1024 * 1024)
done = False

_should_retry = False
_delay = 0.0
_err: Exception | None = None
try:
while not done:
status, done = downloader.next_chunk()
# Optional: you can log progress via status.progress()
except HttpError as e:
# If download fails with "fileNotDownloadable", it's a Docs Editor file
# that wasn't properly detected. Retry with export_media.
if "fileNotDownloadable" in str(e) and mime_type not in exportable_types:
logger.warning(
f"Download failed for {file_id} ({mime_type}) with fileNotDownloadable error. "
f"Retrying with export_media (file might be a Google Doc)"
)
export_mime = "application/pdf"
request = self.service.files().export_media(
fileId=file_id, mimeType=export_mime
)
fh = io.BytesIO()
downloader = MediaIoBaseDownload(fh, request, chunksize=1024 * 1024)
done = False
try:
while not done:
status, done = downloader.next_chunk()
else:
raise
except HttpError as e:
# If download fails with "fileNotDownloadable", it's a Docs Editor file
# that wasn't properly detected. Retry with export_media.
if "fileNotDownloadable" in str(e) and mime_type not in exportable_types:
logger.warning(
f"Download failed for {file_id} ({mime_type}) with fileNotDownloadable error. "
f"Retrying with export_media (file might be a Google Doc)"
)
export_mime = "application/pdf"
with self._lock:
request = self.service.files().export_media(
fileId=file_id, mimeType=export_mime
)
fh = io.BytesIO()
downloader = MediaIoBaseDownload(fh, request, chunksize=1024 * 1024)
done = False
while not done:
status, done = downloader.next_chunk()
else:
raise
break # download succeeded
except (ssl.SSLError, ConnectionResetError, TimeoutError) as e:
_should_retry = _attempt < _max_retries - 1
_delay = 2.0**_attempt
_err = e

if _should_retry and _err is not None:
logger.warning(
"[GoogleDrive] Transient network error for %s (attempt %d/%d), "
"retrying in %.1fs: %s",
file_id,
_attempt + 1,
_max_retries,
_delay,
_err,
)
time.sleep(_delay)
continue

if _err is not None:
raise _err

data = fh.getvalue()
logger.debug("[GoogleDrive] _download_file_bytes: done, %d bytes", len(data))
Expand Down
Loading