mirror of
https://github.com/onyx-dot-app/onyx.git
synced 2026-04-04 06:22:44 +00:00
Compare commits
3 Commits
v3.1.0-clo
...
feat/resol
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
043df22f43 | ||
|
|
cbb676e10c | ||
|
|
dc25903de1 |
@@ -42,6 +42,9 @@ from onyx.connectors.google_drive.file_retrieval import (
|
||||
get_all_files_in_my_drive_and_shared,
|
||||
)
|
||||
from onyx.connectors.google_drive.file_retrieval import get_external_access_for_folder
|
||||
from onyx.connectors.google_drive.file_retrieval import (
|
||||
get_files_by_web_view_links_batch,
|
||||
)
|
||||
from onyx.connectors.google_drive.file_retrieval import get_files_in_shared_drive
|
||||
from onyx.connectors.google_drive.file_retrieval import get_folder_metadata
|
||||
from onyx.connectors.google_drive.file_retrieval import get_root_folder_id
|
||||
@@ -70,11 +73,13 @@ from onyx.connectors.interfaces import CheckpointedConnectorWithPermSync
|
||||
from onyx.connectors.interfaces import CheckpointOutput
|
||||
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
|
||||
from onyx.connectors.interfaces import NormalizationResult
|
||||
from onyx.connectors.interfaces import Resolver
|
||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.interfaces import SlimConnectorWithPermSync
|
||||
from onyx.connectors.models import ConnectorFailure
|
||||
from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import DocumentFailure
|
||||
from onyx.connectors.models import EntityFailure
|
||||
from onyx.connectors.models import HierarchyNode
|
||||
from onyx.connectors.models import SlimDocument
|
||||
@@ -202,7 +207,9 @@ class DriveIdStatus(Enum):
|
||||
|
||||
|
||||
class GoogleDriveConnector(
|
||||
SlimConnectorWithPermSync, CheckpointedConnectorWithPermSync[GoogleDriveCheckpoint]
|
||||
SlimConnectorWithPermSync,
|
||||
CheckpointedConnectorWithPermSync[GoogleDriveCheckpoint],
|
||||
Resolver,
|
||||
):
|
||||
def __init__(
|
||||
self,
|
||||
@@ -1665,6 +1672,82 @@ class GoogleDriveConnector(
|
||||
start, end, checkpoint, include_permissions=True
|
||||
)
|
||||
|
||||
@override
|
||||
def resolve_errors(
|
||||
self,
|
||||
errors: list[ConnectorFailure],
|
||||
include_permissions: bool = False,
|
||||
) -> Generator[Document | ConnectorFailure | HierarchyNode, None, None]:
|
||||
if self._creds is None or self._primary_admin_email is None:
|
||||
raise RuntimeError(
|
||||
"Credentials missing, should not call this method before calling load_credentials"
|
||||
)
|
||||
|
||||
logger.info(f"Resolving {len(errors)} errors")
|
||||
doc_ids = [
|
||||
failure.failed_document.document_id
|
||||
for failure in errors
|
||||
if failure.failed_document
|
||||
]
|
||||
service = get_drive_service(self.creds, self.primary_admin_email)
|
||||
field_type = (
|
||||
DriveFileFieldType.WITH_PERMISSIONS
|
||||
if include_permissions or self.exclude_domain_link_only
|
||||
else DriveFileFieldType.STANDARD
|
||||
)
|
||||
batch_result = get_files_by_web_view_links_batch(service, doc_ids, field_type)
|
||||
|
||||
for doc_id, error in batch_result.errors.items():
|
||||
yield ConnectorFailure(
|
||||
failed_document=DocumentFailure(
|
||||
document_id=doc_id,
|
||||
document_link=doc_id,
|
||||
),
|
||||
failure_message=f"Failed to retrieve file during error resolution: {error}",
|
||||
exception=error,
|
||||
)
|
||||
|
||||
permission_sync_context = (
|
||||
PermissionSyncContext(
|
||||
primary_admin_email=self.primary_admin_email,
|
||||
google_domain=self.google_domain,
|
||||
)
|
||||
if include_permissions
|
||||
else None
|
||||
)
|
||||
|
||||
retrieved_files = [
|
||||
RetrievedDriveFile(
|
||||
drive_file=file,
|
||||
user_email=self.primary_admin_email,
|
||||
completion_stage=DriveRetrievalStage.DONE,
|
||||
)
|
||||
for file in batch_result.files.values()
|
||||
]
|
||||
|
||||
yield from self._get_new_ancestors_for_files(
|
||||
files=retrieved_files,
|
||||
seen_hierarchy_node_raw_ids=ThreadSafeSet(),
|
||||
fully_walked_hierarchy_node_raw_ids=ThreadSafeSet(),
|
||||
permission_sync_context=permission_sync_context,
|
||||
add_prefix=True,
|
||||
)
|
||||
|
||||
func_with_args = [
|
||||
(
|
||||
self._convert_retrieved_file_to_document,
|
||||
(rf, permission_sync_context),
|
||||
)
|
||||
for rf in retrieved_files
|
||||
]
|
||||
results = cast(
|
||||
list[Document | ConnectorFailure | None],
|
||||
run_functions_tuples_in_parallel(func_with_args, max_workers=8),
|
||||
)
|
||||
for result in results:
|
||||
if result is not None:
|
||||
yield result
|
||||
|
||||
def _extract_slim_docs_from_google_drive(
|
||||
self,
|
||||
checkpoint: GoogleDriveCheckpoint,
|
||||
|
||||
@@ -9,6 +9,7 @@ from urllib.parse import urlparse
|
||||
|
||||
from googleapiclient.discovery import Resource # type: ignore
|
||||
from googleapiclient.errors import HttpError # type: ignore
|
||||
from googleapiclient.http import BatchHttpRequest # type: ignore
|
||||
|
||||
from onyx.access.models import ExternalAccess
|
||||
from onyx.connectors.google_drive.constants import DRIVE_FOLDER_TYPE
|
||||
@@ -60,6 +61,8 @@ SLIM_FILE_FIELDS = (
|
||||
)
|
||||
FOLDER_FIELDS = "nextPageToken, files(id, name, permissions, modifiedTime, webViewLink, shortcutDetails)"
|
||||
|
||||
MAX_BATCH_SIZE = 100
|
||||
|
||||
HIERARCHY_FIELDS = "id, name, parents, webViewLink, mimeType, driveId"
|
||||
|
||||
HIERARCHY_FIELDS_WITH_PERMISSIONS = (
|
||||
@@ -216,7 +219,7 @@ def get_external_access_for_folder(
|
||||
|
||||
|
||||
def _get_fields_for_file_type(field_type: DriveFileFieldType) -> str:
|
||||
"""Get the appropriate fields string based on the field type enum"""
|
||||
"""Get the appropriate fields string for files().list() based on the field type enum."""
|
||||
if field_type == DriveFileFieldType.SLIM:
|
||||
return SLIM_FILE_FIELDS
|
||||
elif field_type == DriveFileFieldType.WITH_PERMISSIONS:
|
||||
@@ -225,6 +228,25 @@ def _get_fields_for_file_type(field_type: DriveFileFieldType) -> str:
|
||||
return FILE_FIELDS
|
||||
|
||||
|
||||
def _extract_single_file_fields(list_fields: str) -> str:
|
||||
"""Convert a files().list() fields string to one suitable for files().get().
|
||||
|
||||
List fields look like "nextPageToken, files(field1, field2, ...)"
|
||||
Single-file fields should be just "field1, field2, ..."
|
||||
"""
|
||||
start = list_fields.find("files(")
|
||||
if start == -1:
|
||||
return list_fields
|
||||
inner_start = start + len("files(")
|
||||
inner_end = list_fields.rfind(")")
|
||||
return list_fields[inner_start:inner_end]
|
||||
|
||||
|
||||
def _get_single_file_fields(field_type: DriveFileFieldType) -> str:
|
||||
"""Get the appropriate fields string for files().get() based on the field type enum."""
|
||||
return _extract_single_file_fields(_get_fields_for_file_type(field_type))
|
||||
|
||||
|
||||
def _get_files_in_parent(
|
||||
service: Resource,
|
||||
parent_id: str,
|
||||
@@ -536,3 +558,74 @@ def get_file_by_web_view_link(
|
||||
)
|
||||
.execute()
|
||||
)
|
||||
|
||||
|
||||
class BatchRetrievalResult:
|
||||
"""Result of a batch file retrieval, separating successes from errors."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.files: dict[str, GoogleDriveFileType] = {}
|
||||
self.errors: dict[str, Exception] = {}
|
||||
|
||||
|
||||
def get_files_by_web_view_links_batch(
|
||||
service: GoogleDriveService,
|
||||
web_view_links: list[str],
|
||||
field_type: DriveFileFieldType,
|
||||
) -> BatchRetrievalResult:
|
||||
"""Retrieve multiple Google Drive files by webViewLink using the batch API.
|
||||
|
||||
Returns a BatchRetrievalResult containing successful file retrievals
|
||||
and errors for any files that could not be fetched.
|
||||
Automatically splits into chunks of MAX_BATCH_SIZE.
|
||||
"""
|
||||
fields = _get_single_file_fields(field_type)
|
||||
if len(web_view_links) <= MAX_BATCH_SIZE:
|
||||
return _get_files_by_web_view_links_batch(service, web_view_links, fields)
|
||||
|
||||
combined = BatchRetrievalResult()
|
||||
for i in range(0, len(web_view_links), MAX_BATCH_SIZE):
|
||||
chunk = web_view_links[i : i + MAX_BATCH_SIZE]
|
||||
chunk_result = _get_files_by_web_view_links_batch(service, chunk, fields)
|
||||
combined.files.update(chunk_result.files)
|
||||
combined.errors.update(chunk_result.errors)
|
||||
return combined
|
||||
|
||||
|
||||
def _get_files_by_web_view_links_batch(
|
||||
service: GoogleDriveService,
|
||||
web_view_links: list[str],
|
||||
fields: str,
|
||||
) -> BatchRetrievalResult:
|
||||
"""Single-batch implementation."""
|
||||
|
||||
result = BatchRetrievalResult()
|
||||
|
||||
def callback(
|
||||
request_id: str,
|
||||
response: GoogleDriveFileType,
|
||||
exception: Exception | None,
|
||||
) -> None:
|
||||
if exception:
|
||||
logger.warning(f"Error retrieving file {request_id}: {exception}")
|
||||
result.errors[request_id] = exception
|
||||
else:
|
||||
result.files[request_id] = response
|
||||
|
||||
batch = cast(BatchHttpRequest, service.new_batch_http_request(callback=callback))
|
||||
|
||||
for web_view_link in web_view_links:
|
||||
try:
|
||||
file_id = _extract_file_id_from_web_view_link(web_view_link)
|
||||
request = service.files().get(
|
||||
fileId=file_id,
|
||||
supportsAllDrives=True,
|
||||
fields=fields,
|
||||
)
|
||||
batch.add(request, request_id=web_view_link)
|
||||
except ValueError as e:
|
||||
logger.warning(f"Failed to extract file ID from {web_view_link}: {e}")
|
||||
result.errors[web_view_link] = e
|
||||
|
||||
batch.execute()
|
||||
return result
|
||||
|
||||
@@ -298,6 +298,22 @@ class CheckpointedConnectorWithPermSync(CheckpointedConnector[CT]):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class Resolver(BaseConnector):
|
||||
@abc.abstractmethod
|
||||
def resolve_errors(
|
||||
self,
|
||||
errors: list[ConnectorFailure],
|
||||
include_permissions: bool = False,
|
||||
) -> Generator[Document | ConnectorFailure | HierarchyNode, None, None]:
|
||||
"""Attempts to yield back ALL the documents described by the errors, no checkpointing.
|
||||
|
||||
Caller's responsibility is to delete the old ConnectorFailures and replace with the new ones.
|
||||
If include_permissions is True, the documents will have permissions synced.
|
||||
May also yield HierarchyNode objects for ancestor folders of resolved documents.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class HierarchyConnector(BaseConnector):
|
||||
@abc.abstractmethod
|
||||
def load_hierarchy(
|
||||
|
||||
239
backend/tests/daily/connectors/google_drive/test_resolver.py
Normal file
239
backend/tests/daily/connectors/google_drive/test_resolver.py
Normal file
@@ -0,0 +1,239 @@
|
||||
"""Tests for GoogleDriveConnector.resolve_errors against real Google Drive."""
|
||||
|
||||
import json
|
||||
import os
|
||||
from collections.abc import Callable
|
||||
from unittest.mock import patch
|
||||
|
||||
from onyx.connectors.google_drive.connector import GoogleDriveConnector
|
||||
from onyx.connectors.models import ConnectorFailure
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import DocumentFailure
|
||||
from onyx.connectors.models import HierarchyNode
|
||||
from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_EMAIL
|
||||
from tests.daily.connectors.google_drive.consts_and_utils import (
|
||||
ALL_EXPECTED_HIERARCHY_NODES,
|
||||
)
|
||||
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_ID
|
||||
from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_1_ID
|
||||
|
||||
_DRIVE_ID_MAPPING_PATH = os.path.join(
|
||||
os.path.dirname(__file__), "drive_id_mapping.json"
|
||||
)
|
||||
|
||||
|
||||
def _load_web_view_links(file_ids: list[int]) -> list[str]:
|
||||
with open(_DRIVE_ID_MAPPING_PATH) as f:
|
||||
mapping: dict[str, str] = json.load(f)
|
||||
return [mapping[str(fid)] for fid in file_ids]
|
||||
|
||||
|
||||
def _build_failures(web_view_links: list[str]) -> list[ConnectorFailure]:
|
||||
return [
|
||||
ConnectorFailure(
|
||||
failed_document=DocumentFailure(
|
||||
document_id=link,
|
||||
document_link=link,
|
||||
),
|
||||
failure_message=f"Synthetic failure for {link}",
|
||||
)
|
||||
for link in web_view_links
|
||||
]
|
||||
|
||||
|
||||
@patch("onyx.file_processing.extract_file_text.get_unstructured_api_key")
|
||||
def test_resolve_single_file(
|
||||
mock_api_key: None, # noqa: ARG001
|
||||
google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector],
|
||||
) -> None:
|
||||
"""Resolve a single known file and verify we get back exactly one Document."""
|
||||
connector = google_drive_service_acct_connector_factory(
|
||||
primary_admin_email=ADMIN_EMAIL,
|
||||
include_shared_drives=True,
|
||||
shared_drive_urls=None,
|
||||
include_my_drives=True,
|
||||
my_drive_emails=None,
|
||||
shared_folder_urls=None,
|
||||
include_files_shared_with_me=False,
|
||||
)
|
||||
|
||||
web_view_links = _load_web_view_links([0])
|
||||
failures = _build_failures(web_view_links)
|
||||
|
||||
results = list(connector.resolve_errors(failures))
|
||||
|
||||
docs = [r for r in results if isinstance(r, Document)]
|
||||
new_failures = [r for r in results if isinstance(r, ConnectorFailure)]
|
||||
hierarchy_nodes = [r for r in results if isinstance(r, HierarchyNode)]
|
||||
|
||||
assert len(docs) == 1
|
||||
assert len(new_failures) == 0
|
||||
assert docs[0].semantic_identifier == "file_0.txt"
|
||||
|
||||
# Should yield at least one hierarchy node (the file's parent folder chain)
|
||||
assert len(hierarchy_nodes) > 0
|
||||
|
||||
|
||||
@patch("onyx.file_processing.extract_file_text.get_unstructured_api_key")
|
||||
def test_resolve_multiple_files(
|
||||
mock_api_key: None, # noqa: ARG001
|
||||
google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector],
|
||||
) -> None:
|
||||
"""Resolve multiple files across different folders via batch API."""
|
||||
connector = google_drive_service_acct_connector_factory(
|
||||
primary_admin_email=ADMIN_EMAIL,
|
||||
include_shared_drives=True,
|
||||
shared_drive_urls=None,
|
||||
include_my_drives=True,
|
||||
my_drive_emails=None,
|
||||
shared_folder_urls=None,
|
||||
include_files_shared_with_me=False,
|
||||
)
|
||||
|
||||
# Pick files from different folders: admin files (0-4), shared drive 1 (20-24), folder_2 (45-49)
|
||||
file_ids = [0, 1, 20, 21, 45]
|
||||
web_view_links = _load_web_view_links(file_ids)
|
||||
failures = _build_failures(web_view_links)
|
||||
|
||||
results = list(connector.resolve_errors(failures))
|
||||
|
||||
docs = [r for r in results if isinstance(r, Document)]
|
||||
new_failures = [r for r in results if isinstance(r, ConnectorFailure)]
|
||||
hierarchy_nodes = [r for r in results if isinstance(r, HierarchyNode)]
|
||||
|
||||
assert len(new_failures) == 0
|
||||
retrieved_names = {doc.semantic_identifier for doc in docs}
|
||||
expected_names = {f"file_{fid}.txt" for fid in file_ids}
|
||||
assert expected_names == retrieved_names
|
||||
|
||||
# Files span multiple folders, so we should get hierarchy nodes
|
||||
assert len(hierarchy_nodes) > 0
|
||||
|
||||
|
||||
@patch("onyx.file_processing.extract_file_text.get_unstructured_api_key")
|
||||
def test_resolve_hierarchy_nodes_are_valid(
|
||||
mock_api_key: None, # noqa: ARG001
|
||||
google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector],
|
||||
) -> None:
|
||||
"""Verify that hierarchy nodes from resolve_errors match expected structure."""
|
||||
connector = google_drive_service_acct_connector_factory(
|
||||
primary_admin_email=ADMIN_EMAIL,
|
||||
include_shared_drives=True,
|
||||
shared_drive_urls=None,
|
||||
include_my_drives=True,
|
||||
my_drive_emails=None,
|
||||
shared_folder_urls=None,
|
||||
include_files_shared_with_me=False,
|
||||
)
|
||||
|
||||
# File in folder_1 (inside shared_drive_1) — should walk up to shared_drive_1 root
|
||||
web_view_links = _load_web_view_links([25])
|
||||
failures = _build_failures(web_view_links)
|
||||
|
||||
results = list(connector.resolve_errors(failures))
|
||||
|
||||
hierarchy_nodes = [r for r in results if isinstance(r, HierarchyNode)]
|
||||
node_ids = {node.raw_node_id for node in hierarchy_nodes}
|
||||
|
||||
# File 25 is in folder_1 which is inside shared_drive_1.
|
||||
# The parent walk must yield at least these two ancestors.
|
||||
assert (
|
||||
FOLDER_1_ID in node_ids
|
||||
), f"Expected folder_1 ({FOLDER_1_ID}) in hierarchy nodes, got: {node_ids}"
|
||||
assert (
|
||||
SHARED_DRIVE_1_ID in node_ids
|
||||
), f"Expected shared_drive_1 ({SHARED_DRIVE_1_ID}) in hierarchy nodes, got: {node_ids}"
|
||||
|
||||
for node in hierarchy_nodes:
|
||||
if node.raw_node_id not in ALL_EXPECTED_HIERARCHY_NODES:
|
||||
continue
|
||||
expected = ALL_EXPECTED_HIERARCHY_NODES[node.raw_node_id]
|
||||
assert node.display_name == expected.display_name, (
|
||||
f"Display name mismatch for {node.raw_node_id}: "
|
||||
f"expected '{expected.display_name}', got '{node.display_name}'"
|
||||
)
|
||||
assert node.node_type == expected.node_type, (
|
||||
f"Node type mismatch for {node.raw_node_id}: "
|
||||
f"expected '{expected.node_type}', got '{node.node_type}'"
|
||||
)
|
||||
|
||||
|
||||
@patch("onyx.file_processing.extract_file_text.get_unstructured_api_key")
|
||||
def test_resolve_with_invalid_link(
|
||||
mock_api_key: None, # noqa: ARG001
|
||||
google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector],
|
||||
) -> None:
|
||||
"""Resolve with a mix of valid and invalid links — invalid ones yield ConnectorFailure."""
|
||||
connector = google_drive_service_acct_connector_factory(
|
||||
primary_admin_email=ADMIN_EMAIL,
|
||||
include_shared_drives=True,
|
||||
shared_drive_urls=None,
|
||||
include_my_drives=True,
|
||||
my_drive_emails=None,
|
||||
shared_folder_urls=None,
|
||||
include_files_shared_with_me=False,
|
||||
)
|
||||
|
||||
valid_links = _load_web_view_links([0])
|
||||
invalid_link = "https://drive.google.com/file/d/NONEXISTENT_FILE_ID_12345"
|
||||
failures = _build_failures(valid_links + [invalid_link])
|
||||
|
||||
results = list(connector.resolve_errors(failures))
|
||||
|
||||
docs = [r for r in results if isinstance(r, Document)]
|
||||
new_failures = [r for r in results if isinstance(r, ConnectorFailure)]
|
||||
|
||||
assert len(docs) == 1
|
||||
assert docs[0].semantic_identifier == "file_0.txt"
|
||||
assert len(new_failures) == 1
|
||||
assert new_failures[0].failed_document is not None
|
||||
assert new_failures[0].failed_document.document_id == invalid_link
|
||||
|
||||
|
||||
@patch("onyx.file_processing.extract_file_text.get_unstructured_api_key")
|
||||
def test_resolve_empty_errors(
|
||||
mock_api_key: None, # noqa: ARG001
|
||||
google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector],
|
||||
) -> None:
|
||||
"""Resolving an empty error list should yield nothing."""
|
||||
connector = google_drive_service_acct_connector_factory(
|
||||
primary_admin_email=ADMIN_EMAIL,
|
||||
include_shared_drives=True,
|
||||
shared_drive_urls=None,
|
||||
include_my_drives=True,
|
||||
my_drive_emails=None,
|
||||
shared_folder_urls=None,
|
||||
include_files_shared_with_me=False,
|
||||
)
|
||||
|
||||
results = list(connector.resolve_errors([]))
|
||||
|
||||
assert len(results) == 0
|
||||
|
||||
|
||||
@patch("onyx.file_processing.extract_file_text.get_unstructured_api_key")
|
||||
def test_resolve_entity_failures_are_skipped(
|
||||
mock_api_key: None, # noqa: ARG001
|
||||
google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector],
|
||||
) -> None:
|
||||
"""Entity failures (not document failures) should be skipped by resolve_errors."""
|
||||
from onyx.connectors.models import EntityFailure
|
||||
|
||||
connector = google_drive_service_acct_connector_factory(
|
||||
primary_admin_email=ADMIN_EMAIL,
|
||||
include_shared_drives=True,
|
||||
shared_drive_urls=None,
|
||||
include_my_drives=True,
|
||||
my_drive_emails=None,
|
||||
shared_folder_urls=None,
|
||||
include_files_shared_with_me=False,
|
||||
)
|
||||
|
||||
entity_failure = ConnectorFailure(
|
||||
failed_entity=EntityFailure(entity_id="some_stage"),
|
||||
failure_message="retrieval failure",
|
||||
)
|
||||
|
||||
results = list(connector.resolve_errors([entity_failure]))
|
||||
|
||||
assert len(results) == 0
|
||||
Reference in New Issue
Block a user