Compare commits

...

1 Commits

Author SHA1 Message Date
Evan Lohn
a17e5c3333 fix: downloads are never larger than 20mb (#5247)
* fix: downloads are never larger than 20mb

* JT comments

* import to fix integration tests
2025-08-25 13:45:00 -07:00
2 changed files with 23 additions and 13 deletions

View File

@@ -52,6 +52,7 @@ SMART_CHIP_CHAR = "\ue907"
WEB_VIEW_LINK_KEY = "webViewLink"
MAX_RETRIEVER_EMAILS = 20
CHUNK_SIZE_BUFFER = 64 # extra bytes past the limit to read
# Mapping of Google Drive mime types to export formats
GOOGLE_MIME_TYPES_TO_EXPORT = {
@@ -97,18 +98,31 @@ def is_gdrive_image_mime_type(mime_type: str) -> bool:
return is_valid_image_type(mime_type)
def download_request(service: GoogleDriveService, file_id: str) -> bytes:
def download_request(
service: GoogleDriveService, file_id: str, size_threshold: int
) -> bytes:
"""
Download the file from Google Drive.
"""
# For other file types, download the file
# Use the correct API call for downloading files
request = service.files().get_media(fileId=file_id)
return _download_request(request, file_id, size_threshold)
def _download_request(request: Any, file_id: str, size_threshold: int) -> bytes:
response_bytes = io.BytesIO()
downloader = MediaIoBaseDownload(response_bytes, request)
downloader = MediaIoBaseDownload(
response_bytes, request, chunksize=size_threshold + CHUNK_SIZE_BUFFER
)
done = False
while not done:
_, done = downloader.next_chunk()
download_progress, done = downloader.next_chunk()
if download_progress.resumable_progress > size_threshold:
logger.warning(
f"File {file_id} exceeds size threshold of {size_threshold}. Skipping2."
)
return bytes()
response = response_bytes.getvalue()
if not response:
@@ -121,6 +135,7 @@ def _download_and_extract_sections_basic(
file: dict[str, str],
service: GoogleDriveService,
allow_images: bool,
size_threshold: int,
) -> list[TextSection | ImageSection]:
"""Extract text and images from a Google Drive file."""
file_id = file["id"]
@@ -132,7 +147,7 @@ def _download_and_extract_sections_basic(
# Use the correct API call for downloading files
# lazy evaluation to only download the file if necessary
def response_call() -> bytes:
return download_request(service, file_id)
return download_request(service, file_id, size_threshold)
if is_gdrive_image_mime_type(mime_type):
# Skip images if not explicitly enabled
@@ -162,13 +177,7 @@ def _download_and_extract_sections_basic(
request = service.files().export_media(
fileId=file_id, mimeType=export_mime_type
)
response_bytes = io.BytesIO()
downloader = MediaIoBaseDownload(response_bytes, request)
done = False
while not done:
_, done = downloader.next_chunk()
response = response_bytes.getvalue()
response = _download_request(request, file_id, size_threshold)
if not response:
logger.warning(f"Failed to export {file_name} as {export_mime_type}")
return []
@@ -467,7 +476,7 @@ def _convert_drive_item_to_document(
" aligning with basic sections"
)
basic_sections = _download_and_extract_sections_basic(
file, _get_drive_service(), allow_images
file, _get_drive_service(), allow_images, size_threshold
)
sections = align_basic_advanced(basic_sections, doc_sections)
@@ -478,7 +487,7 @@ def _convert_drive_item_to_document(
# Not Google Doc, attempt basic extraction
else:
sections = _download_and_extract_sections_basic(
file, _get_drive_service(), allow_images
file, _get_drive_service(), allow_images, size_threshold
)
# If we still don't have any sections, skip this file

View File

@@ -40,6 +40,7 @@ langchainhub==0.1.21
langgraph==0.2.72
langgraph-checkpoint==2.0.13
langgraph-sdk==0.1.44
lazy_imports==1.0.1
litellm==1.72.2
lxml==5.3.0
lxml_html_clean==0.2.2