Compare commits

...

1 Commit

Author SHA1 Message Date
Evan Lohn
2a1c0e258a extension fix and logs 2025-05-09 16:12:31 -07:00
2 changed files with 10 additions and 0 deletions

View File

@@ -590,6 +590,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
logger.info(f"Getting files in folder '{folder_id}' as '{user_email}'")
yield from _yield_from_folder_crawl(folder_id, start)
logger.info(f"Done retrieving files for user {user_email}")
curr_stage.stage = DriveRetrievalStage.DONE
def _manage_service_account_retrieval(
@@ -652,6 +653,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
for user_email, stage_completion in checkpoint.completion_map.items()
if stage_completion.stage != DriveRetrievalStage.DONE
]
logger.info(f"{len(non_completed_org_emails)} users left to retrieve")
# don't process too many emails before returning a checkpoint. This is
# to resolve the case where there are a ton of emails that don't have access

View File

@@ -59,6 +59,10 @@ GOOGLE_MIME_TYPES = {
GDriveMimeType.PPT.value: "text/plain",
}
# Extensions that cause a Drive file to be skipped. Entries are written without
# the leading dot and are compared, case-sensitively, against the last
# "."-separated segment of the file's name.
DRIVE_IGNORE_EXTENSIONS: set[str] = {"dll"}
def _summarize_drive_image(
image_data: bytes, image_name: str, image_analysis_llm: LLM | None
@@ -422,6 +426,10 @@ def _convert_drive_item_to_document(
)
return None
if file.get("name", "").split(".")[-1] in DRIVE_IGNORE_EXTENSIONS:
logger.warning(f"Skipping file {file.get('name')} due to extension.")
return None
# If we don't have sections yet, use the basic extraction method
if not sections:
sections = _download_and_extract_sections_basic(