Compare commits

...

10 Commits

Author SHA1 Message Date
Weves
60fb21ddca Improve comment 2025-05-07 19:38:46 -07:00
Weves
fc89d745c1 Add ondelete=CASCADE 2025-05-07 19:38:46 -07:00
Weves
a748bb28a7 Address EL comments 2025-05-07 19:38:46 -07:00
Weves
2dd304739d Add missing file 2025-05-07 19:38:46 -07:00
Weves
778a7eeb5a Speed up 2025-05-07 19:38:46 -07:00
Weves
0b39a45ae7 Fix 2025-05-07 19:38:46 -07:00
Weves
a540038660 fixes 2025-05-07 19:38:46 -07:00
Weves
1c4ceb36be more stuff 2025-05-07 19:38:46 -07:00
Weves
f3e2ab25f8 add tests 2025-05-07 19:38:46 -07:00
Weves
9b91babed1 Enhance drive perm sync 2025-05-07 19:38:46 -07:00
15 changed files with 882 additions and 300 deletions

View File

@@ -0,0 +1,32 @@
"""Add public_external_user_group table
Revision ID: a7688ab35c45
Revises: 5c448911b12f
Create Date: 2025-05-06 20:55:12.747875
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "a7688ab35c45"
down_revision = "5c448911b12f"
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Create the `public_external_user_group` table.

    Each row marks an external group (scoped to a connector-credential
    pair) as granting public access. Rows are removed automatically when
    the parent `connector_credential_pair` is deleted (ON DELETE CASCADE).
    """
    op.create_table(
        "public_external_user_group",
        # ID of the external group as represented within Onyx.
        sa.Column("external_user_group_id", sa.String(), nullable=False),
        sa.Column("cc_pair_id", sa.Integer(), nullable=False),
        # The same external group ID may appear under multiple cc_pairs,
        # so both columns together form the primary key.
        sa.PrimaryKeyConstraint("external_user_group_id", "cc_pair_id"),
        sa.ForeignKeyConstraint(
            ["cc_pair_id"], ["connector_credential_pair.id"], ondelete="CASCADE"
        ),
    )
def downgrade() -> None:
    """Drop the `public_external_user_group` table added by this revision."""
    op.drop_table("public_external_user_group")

View File

@@ -1,6 +1,7 @@
from sqlalchemy.orm import Session
from ee.onyx.db.external_perm import fetch_external_groups_for_user
from ee.onyx.db.external_perm import fetch_public_external_group_ids
from ee.onyx.db.user_group import fetch_user_groups_for_documents
from ee.onyx.db.user_group import fetch_user_groups_for_user
from ee.onyx.external_permissions.post_query_censoring import (
@@ -63,6 +64,8 @@ def _get_access_for_documents(
document_ids=document_ids,
)
all_public_ext_u_group_ids = set(fetch_public_external_group_ids(db_session))
access_map = {}
for document_id, non_ee_access in non_ee_access_dict.items():
document = doc_id_map[document_id]
@@ -89,7 +92,10 @@ def _get_access_for_documents(
# If its censored, then it's public anywhere during the search and then permissions are
# applied after the search
is_public_anywhere = (
document.is_public or non_ee_access.is_public or is_only_censored
document.is_public
or non_ee_access.is_public
or is_only_censored
or any(u_group in all_public_ext_u_group_ids for u_group in ext_u_groups)
)
# To avoid collisions of group namings between connectors, they need to be prefixed

View File

@@ -8,6 +8,7 @@ from sqlalchemy.orm import Session
from onyx.access.utils import build_ext_group_name_for_onyx
from onyx.configs.constants import DocumentSource
from onyx.db.models import PublicExternalUserGroup
from onyx.db.models import User
from onyx.db.models import User__ExternalUserGroupId
from onyx.db.users import batch_add_ext_perm_user_if_not_exists
@@ -20,6 +21,12 @@ logger = setup_logger()
class ExternalUserGroup(BaseModel):
    """A user group as defined by an external source (e.g. Google Drive),
    used during permission/group syncing."""

    # ID of the group within Onyx.
    id: str
    # Emails of the group's members.
    user_emails: list[str]

    # `True` for cases like a Folder in Google Drive that give domain-wide
    # or "Anyone with link" access to all files in the folder.
    # If this is set, `user_emails` don't really matter.
    # When this is `True`, this `ExternalUserGroup` object doesn't really represent
    # an actual "group" in the source.
    gives_anyone_access: bool = False
def delete_user__ext_group_for_user__no_commit(
@@ -44,6 +51,17 @@ def delete_user__ext_group_for_cc_pair__no_commit(
)
def delete_public_external_group_for_cc_pair__no_commit(
    db_session: Session,
    cc_pair_id: int,
) -> None:
    """Delete every `PublicExternalUserGroup` row tied to the given cc_pair.

    Does not commit; the caller is responsible for committing the session.
    """
    deletion_stmt = delete(PublicExternalUserGroup).where(
        PublicExternalUserGroup.cc_pair_id == cc_pair_id
    )
    db_session.execute(deletion_stmt)
def replace_user__ext_group_for_cc_pair(
db_session: Session,
cc_pair_id: int,
@@ -72,13 +90,22 @@ def replace_user__ext_group_for_cc_pair(
db_session=db_session,
cc_pair_id=cc_pair_id,
)
delete_public_external_group_for_cc_pair__no_commit(
db_session=db_session,
cc_pair_id=cc_pair_id,
)
# map emails to ids
email_id_map = {user.email: user.id for user in all_group_members}
# use these ids to create new external user group relations relating group_id to user_ids
new_external_permissions = []
new_external_permissions: list[User__ExternalUserGroupId] = []
new_public_external_groups: list[PublicExternalUserGroup] = []
for external_group in group_defs:
external_group_id = build_ext_group_name_for_onyx(
ext_group_name=external_group.id,
source=source,
)
for user_email in external_group.user_emails:
user_id = email_id_map.get(user_email.lower())
if user_id is None:
@@ -87,10 +114,6 @@ def replace_user__ext_group_for_cc_pair(
f" with email {user_email} not found"
)
continue
external_group_id = build_ext_group_name_for_onyx(
ext_group_name=external_group.id,
source=source,
)
new_external_permissions.append(
User__ExternalUserGroupId(
user_id=user_id,
@@ -99,7 +122,16 @@ def replace_user__ext_group_for_cc_pair(
)
)
if external_group.gives_anyone_access:
new_public_external_groups.append(
PublicExternalUserGroup(
external_user_group_id=external_group_id,
cc_pair_id=cc_pair_id,
)
)
db_session.add_all(new_external_permissions)
db_session.add_all(new_public_external_groups)
db_session.commit()
@@ -130,3 +162,11 @@ def fetch_external_groups_for_user_email_and_group_ids(
)
).all()
return list(user_ext_groups)
def fetch_public_external_group_ids(
    db_session: Session,
) -> list[str]:
    """Return the IDs of all external groups flagged as public."""
    id_select = select(PublicExternalUserGroup.external_user_group_id)
    result = db_session.scalars(id_select)
    return list(result)

View File

@@ -3,11 +3,15 @@ from datetime import datetime
from datetime import timezone
from typing import Any
from ee.onyx.external_permissions.google_drive.models import GoogleDrivePermission
from ee.onyx.external_permissions.google_drive.models import PermissionType
from ee.onyx.external_permissions.google_drive.permission_retrieval import (
get_permissions_by_ids,
)
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
from onyx.access.models import DocExternalAccess
from onyx.access.models import ExternalAccess
from onyx.connectors.google_drive.connector import GoogleDriveConnector
from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
from onyx.connectors.google_utils.resources import get_drive_service
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
from onyx.connectors.models import SlimDocument
@@ -17,8 +21,6 @@ from onyx.utils.logger import setup_logger
logger = setup_logger()
_PERMISSION_ID_PERMISSION_MAP: dict[str, dict[str, Any]] = {}
def _get_slim_doc_generator(
cc_pair: ConnectorCredentialPair,
@@ -41,46 +43,28 @@ def _get_slim_doc_generator(
def _fetch_permissions_for_permission_ids(
google_drive_connector: GoogleDriveConnector,
permission_ids: list[str],
permission_info: dict[str, Any],
) -> list[dict[str, Any]]:
) -> list[GoogleDrivePermission]:
doc_id = permission_info.get("doc_id")
if not permission_info or not doc_id:
return []
permissions = [
_PERMISSION_ID_PERMISSION_MAP[pid]
for pid in permission_ids
if pid in _PERMISSION_ID_PERMISSION_MAP
]
if len(permissions) == len(permission_ids):
return permissions
owner_email = permission_info.get("owner_email")
permission_ids = permission_info.get("permission_ids", [])
if not permission_ids:
return []
drive_service = get_drive_service(
creds=google_drive_connector.creds,
user_email=(owner_email or google_drive_connector.primary_admin_email),
)
# We continue on 404 or 403 because the document may not exist or the user may not have access to it
fetched_permissions = execute_paginated_retrieval(
retrieval_function=drive_service.permissions().list,
list_key="permissions",
fileId=doc_id,
fields="permissions(id, emailAddress, type, domain),nextPageToken",
supportsAllDrives=True,
continue_on_404_or_403=True,
return get_permissions_by_ids(
drive_service=drive_service,
doc_id=doc_id,
permission_ids=permission_ids,
)
permissions_for_doc_id = []
for permission in fetched_permissions:
permissions_for_doc_id.append(permission)
_PERMISSION_ID_PERMISSION_MAP[permission["id"]] = permission
return permissions_for_doc_id
def _get_permissions_from_slim_doc(
google_drive_connector: GoogleDriveConnector,
@@ -88,14 +72,13 @@ def _get_permissions_from_slim_doc(
) -> ExternalAccess:
permission_info = slim_doc.perm_sync_data or {}
permissions_list = permission_info.get("permissions", [])
if not permissions_list:
if permission_ids := permission_info.get("permission_ids"):
permissions_list = _fetch_permissions_for_permission_ids(
google_drive_connector=google_drive_connector,
permission_ids=permission_ids,
permission_info=permission_info,
)
permissions_list: list[GoogleDrivePermission] = []
raw_permissions_list = permission_info.get("permissions", [])
if not raw_permissions_list:
permissions_list = _fetch_permissions_for_permission_ids(
google_drive_connector=google_drive_connector,
permission_info=permission_info,
)
if not permissions_list:
logger.warning(f"No permissions found for document {slim_doc.id}")
return ExternalAccess(
@@ -103,41 +86,71 @@ def _get_permissions_from_slim_doc(
external_user_group_ids=set(),
is_public=False,
)
else:
permissions_list = [
GoogleDrivePermission.from_drive_permission(p) for p in raw_permissions_list
]
company_domain = google_drive_connector.google_domain
folder_ids_to_inherit_permissions_from: set[str] = set()
user_emails: set[str] = set()
group_emails: set[str] = set()
public = False
skipped_permissions = 0
for permission in permissions_list:
if not permission:
skipped_permissions += 1
continue
# if the permission is inherited, do not add it directly to the file
# instead, add the folder ID as a group that has access to the file
# we will then handle mapping that folder to the list of Onyx users
# in the group sync job
# NOTE: this doesn't handle the case where a folder initially has no
# permissioning, but then later that folder is shared with a user or group.
# We could fetch all ancestors of the file to get the list of folders that
# might affect the permissions of the file, but this will get replaced with
# an audit-log based approach in the future so not doing it now.
if (
permission.permission_details
and permission.permission_details.inherited_from
):
folder_ids_to_inherit_permissions_from.add(
permission.permission_details.inherited_from
)
permission_type = permission["type"]
if permission_type == "user":
user_emails.add(permission["emailAddress"])
elif permission_type == "group":
group_emails.add(permission["emailAddress"])
elif permission_type == "domain" and company_domain:
if permission.get("domain") == company_domain:
if permission.type == PermissionType.USER:
if permission.email_address:
user_emails.add(permission.email_address)
else:
logger.error(
"Permission is type `user` but no email address is "
f"provided for document {slim_doc.id}"
f"\n {permission}"
)
elif permission.type == PermissionType.GROUP:
# groups are represented as email addresses within Drive
if permission.email_address:
group_emails.add(permission.email_address)
else:
logger.error(
"Permission is type `group` but no email address is "
f"provided for document {slim_doc.id}"
f"\n {permission}"
)
elif permission.type == PermissionType.DOMAIN and company_domain:
if permission.domain == company_domain:
public = True
else:
logger.warning(
"Permission is type domain but does not match company domain:"
f"\n {permission}"
)
elif permission_type == "anyone":
elif permission.type == PermissionType.ANYONE:
public = True
if skipped_permissions > 0:
logger.warning(
f"Skipped {skipped_permissions} permissions of {len(permissions_list)} for document {slim_doc.id}"
)
drive_id = permission_info.get("drive_id")
group_ids = group_emails | ({drive_id} if drive_id is not None else set())
group_ids = (
group_emails
| folder_ids_to_inherit_permissions_from
| ({drive_id} if drive_id is not None else set())
)
return ExternalAccess(
external_user_emails=user_emails,

View File

@@ -0,0 +1,84 @@
from collections.abc import Iterator
from googleapiclient.discovery import Resource # type: ignore
from ee.onyx.external_permissions.google_drive.models import GoogleDrivePermission
from ee.onyx.external_permissions.google_drive.permission_retrieval import (
get_permissions_by_ids,
)
from onyx.connectors.google_drive.constants import DRIVE_FOLDER_TYPE
from onyx.connectors.google_drive.file_retrieval import generate_time_range_filter
from onyx.connectors.google_drive.models import GoogleDriveFileType
from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.utils.logger import setup_logger
logger = setup_logger()
# Only include fields we need - folder ID and permissions
# IMPORTANT: must fetch permissionIds, since sometimes the drive API
# seems to miss permissions when requesting them directly
FOLDER_PERMISSION_FIELDS = (
"nextPageToken, files(id, name, permissionIds, "
"permissions(id, emailAddress, type, domain, permissionDetails))"
)
def get_folder_permissions_by_ids(
    service: Resource,
    folder_id: str,
    permission_ids: list[str],
) -> list[GoogleDrivePermission]:
    """Fetch a folder's permissions, keeping only those with the given IDs.

    Folders are regular Drive files as far as the permissions API is
    concerned, so this simply delegates to `get_permissions_by_ids`.

    Args:
        service: The Google Drive service instance
        folder_id: The ID of the folder to fetch permissions for
        permission_ids: A list of permission IDs to filter by

    Returns:
        The folder's permissions whose IDs appear in `permission_ids`
    """
    return get_permissions_by_ids(
        drive_service=service,
        doc_id=folder_id,
        permission_ids=permission_ids,
    )
def get_modified_folders(
    service: Resource,
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
) -> Iterator[GoogleDriveFileType]:
    """Yield every non-trashed folder modified within the given time window.

    Only the fields listed in FOLDER_PERMISSION_FIELDS (folder ID, name, and
    permission data) are requested; contained files are not returned.

    Args:
        service: The Google Drive service instance
        start: The start time as seconds since Unix epoch (inclusive)
        end: The end time as seconds since Unix epoch (inclusive)

    Yields:
        Folder metadata dicts including ID and permission information
    """
    # Restrict to live (non-trashed) folders within the requested range.
    folder_query = (
        f"mimeType = '{DRIVE_FOLDER_TYPE}'"
        " and trashed = false" + generate_time_range_filter(start, end)
    )

    yield from execute_paginated_retrieval(
        retrieval_function=service.files().list,
        list_key="files",
        continue_on_404_or_403=True,
        corpora="allDrives",
        supportsAllDrives=True,
        includeItemsFromAllDrives=True,
        includePermissionsForView="published",
        fields=FOLDER_PERMISSION_FIELDS,
        q=folder_query,
    )

View File

@@ -1,6 +1,15 @@
from googleapiclient.errors import HttpError # type: ignore
from pydantic import BaseModel
from ee.onyx.db.external_perm import ExternalUserGroup
from ee.onyx.external_permissions.google_drive.folder_retrieval import (
get_folder_permissions_by_ids,
)
from ee.onyx.external_permissions.google_drive.folder_retrieval import (
get_modified_folders,
)
from ee.onyx.external_permissions.google_drive.models import GoogleDrivePermission
from ee.onyx.external_permissions.google_drive.models import PermissionType
from onyx.connectors.google_drive.connector import GoogleDriveConnector
from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
from onyx.connectors.google_utils.resources import AdminService
@@ -12,6 +21,72 @@ from onyx.utils.logger import setup_logger
logger = setup_logger()
"""
Folder Permission Sync.
Each folder is treated as a group. Each file has all ancestor folders
as groups.
"""
class FolderInfo(BaseModel):
    """Permissions of a single Drive folder (each folder is treated as a group)."""

    # Google Drive ID of the folder.
    id: str
    # All permissions attached to the folder.
    permissions: list[GoogleDrivePermission]
def _get_all_folders(google_drive_connector: GoogleDriveConnector) -> list[FolderInfo]:
    """Have to get all folders since the group syncing system assumes all groups
    are returned every time.

    TODO: tweak things so we can fetch deltas.
    """
    # Keyed by folder ID so each folder is recorded only once, in
    # first-seen order.
    folders_by_id: dict[str, FolderInfo] = {}

    # Build a Drive service per user email; folders already recorded under
    # one user are skipped for subsequent users.
    for user_email in google_drive_connector._get_all_user_emails():
        drive_service = get_drive_service(
            google_drive_connector.creds,
            user_email,
        )
        for folder in get_modified_folders(
            service=drive_service,
        ):
            folder_id = folder["id"]
            if folder_id in folders_by_id:
                logger.debug(f"Folder {folder_id} has already been seen. Skipping.")
                continue

            raw_permissions = folder.get("permissions", [])
            permission_ids = folder.get("permissionIds", [])
            if raw_permissions:
                permissions = [
                    GoogleDrivePermission.from_drive_permission(raw_permission)
                    for raw_permission in raw_permissions
                ]
            elif permission_ids:
                # The listing gave us permission IDs but no permission bodies;
                # fetch the full permissions by their IDs.
                permissions = get_folder_permissions_by_ids(
                    drive_service, folder_id, permission_ids
                )
            else:
                permissions = []

            folders_by_id[folder_id] = FolderInfo(
                id=folder_id,
                permissions=permissions,
            )

    return list(folders_by_id.values())
"""Individual Shared Drive / My Drive Permission Sync"""
def _get_drive_members(
google_drive_connector: GoogleDriveConnector,
admin_service: AdminService,
@@ -57,9 +132,11 @@ def _get_drive_members(
# is an admin
useDomainAdminAccess=is_admin,
):
if permission["type"] == "group":
# NOTE: don't need to check for PermissionType.ANYONE since
# you can't share a drive with the internet
if permission["type"] == PermissionType.GROUP:
group_emails.add(permission["emailAddress"])
elif permission["type"] == "user":
elif permission["type"] == PermissionType.USER:
user_emails.add(permission["emailAddress"])
except HttpError as e:
if e.status_code == 404:
@@ -118,6 +195,7 @@ def _map_group_email_to_member_emails(
def _build_onyx_groups(
drive_id_to_members_map: dict[str, tuple[set[str], set[str]]],
group_email_to_member_emails_map: dict[str, set[str]],
folder_info: list[FolderInfo],
) -> list[ExternalUserGroup]:
onyx_groups: list[ExternalUserGroup] = []
@@ -125,18 +203,52 @@ def _build_onyx_groups(
# This is because having drive level access means you have
# irrevocable access to all the files in the drive.
for drive_id, (group_emails, user_emails) in drive_id_to_members_map.items():
all_member_emails: set[str] = user_emails
drive_member_emails: set[str] = user_emails
for group_email in group_emails:
if group_email not in group_email_to_member_emails_map:
logger.warning(
f"Group email {group_email} not found in group_email_to_member_emails_map"
f"Group email {group_email} for drive {drive_id} not found in "
"group_email_to_member_emails_map"
)
continue
all_member_emails.update(group_email_to_member_emails_map[group_email])
drive_member_emails.update(group_email_to_member_emails_map[group_email])
onyx_groups.append(
ExternalUserGroup(
id=drive_id,
user_emails=list(all_member_emails),
user_emails=list(drive_member_emails),
)
)
# Convert all folder permissions to onyx groups
for folder in folder_info:
anyone_can_access = False
folder_member_emails: set[str] = set()
for permission in folder.permissions:
if permission.type == PermissionType.USER:
if permission.email_address is None:
logger.warning(
f"User email is None for folder {folder.id} permission {permission}"
)
continue
folder_member_emails.add(permission.email_address)
elif permission.type == PermissionType.GROUP:
if permission.email_address not in group_email_to_member_emails_map:
logger.warning(
f"Group email {permission.email_address} for folder {folder.id} "
"not found in group_email_to_member_emails_map"
)
continue
folder_member_emails.update(
group_email_to_member_emails_map[permission.email_address]
)
elif permission.type == PermissionType.ANYONE:
anyone_can_access = True
onyx_groups.append(
ExternalUserGroup(
id=folder.id,
user_emails=list(folder_member_emails),
gives_anyone_access=anyone_can_access,
)
)
@@ -173,6 +285,9 @@ def gdrive_group_sync(
admin_service, google_drive_connector.google_domain
)
# Get all folder permissions
folder_info = _get_all_folders(google_drive_connector)
# Map group emails to their members
group_email_to_member_emails_map = _map_group_email_to_member_emails(
admin_service, all_group_emails
@@ -182,6 +297,7 @@ def gdrive_group_sync(
onyx_groups = _build_onyx_groups(
drive_id_to_members_map=drive_id_to_members_map,
group_email_to_member_emails_map=group_email_to_member_emails_map,
folder_info=folder_info,
)
return onyx_groups

View File

@@ -0,0 +1,59 @@
from enum import Enum
from typing import Any
from pydantic import BaseModel
class PermissionType(str, Enum):
    """The `type` field of a Google Drive permission entry."""

    USER = "user"
    GROUP = "group"
    DOMAIN = "domain"
    ANYONE = "anyone"
class GoogleDrivePermissionDetails(BaseModel):
    """Extra detail attached to a Drive permission entry."""

    # this is "file", "member", etc.
    # different from the `type` field within `GoogleDrivePermission`
    # Sometimes can be None, although not sure why...
    permission_type: str | None
    # this is "reader", "writer", "owner", etc.
    role: str
    # this is the id of the parent permission
    inherited_from: str | None
class GoogleDrivePermission(BaseModel):
    """A single permission entry on a Google Drive file or folder."""

    id: str
    # groups are also represented as email addresses within Drive
    # will be None for domain/global permissions
    email_address: str | None
    type: PermissionType
    domain: str | None  # only applies to domain permissions
    permission_details: GoogleDrivePermissionDetails | None

    @classmethod
    def from_drive_permission(
        cls, drive_permission: dict[str, Any]
    ) -> "GoogleDrivePermission":
        """Build a `GoogleDrivePermission` from a raw Drive API permission dict."""
        # we seem to only get details for permissions that are inherited
        # we can get multiple details if a permission is inherited from multiple
        # parents; only the first entry is kept
        details_list = drive_permission.get("permissionDetails", [])
        first_details: dict[str, Any] | None = None
        if details_list:
            first_details = details_list[0]

        parsed_details = None
        if first_details:
            parsed_details = GoogleDrivePermissionDetails(
                permission_type=first_details.get("type"),
                role=first_details.get("role", ""),
                inherited_from=first_details.get("inheritedFrom"),
            )

        return cls(
            id=drive_permission["id"],
            email_address=drive_permission.get("emailAddress"),
            type=PermissionType(drive_permission["type"]),
            domain=drive_permission.get("domain"),
            permission_details=parsed_details,
        )

View File

@@ -0,0 +1,60 @@
from googleapiclient.discovery import Resource # type: ignore
from ee.onyx.external_permissions.google_drive.models import GoogleDrivePermission
from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
from onyx.utils.logger import setup_logger
logger = setup_logger()
def get_permissions_by_ids(
    drive_service: Resource,
    doc_id: str,
    permission_ids: list[str],
) -> list[GoogleDrivePermission]:
    """
    Fetches permissions for a document based on a list of permission IDs.

    Args:
        drive_service: The Google Drive service instance
        doc_id: The ID of the document to fetch permissions for
        permission_ids: A list of permission IDs to filter by

    Returns:
        A list of GoogleDrivePermission objects matching the provided permission IDs
    """
    if not permission_ids:
        return []

    # Set for O(1) membership tests while filtering the paginated results.
    wanted_ids = set(permission_ids)

    # Pull every permission on the document; 404/403 are tolerated since the
    # document may not exist or may be inaccessible.
    all_permissions = execute_paginated_retrieval(
        retrieval_function=drive_service.permissions().list,
        list_key="permissions",
        fileId=doc_id,
        fields="permissions(id, emailAddress, type, domain, permissionDetails),nextPageToken",
        supportsAllDrives=True,
        continue_on_404_or_403=True,
    )

    matching_permissions = [
        GoogleDrivePermission.from_drive_permission(raw_permission)
        for raw_permission in all_permissions
        if raw_permission.get("id") in wanted_ids
    ]

    # Surface any requested IDs that never showed up in the response.
    if len(matching_permissions) < len(permission_ids):
        missing_ids = wanted_ids - {p.id for p in matching_permissions if p.id}
        logger.warning(
            f"Could not find all requested permission IDs for document {doc_id}. "
            f"Missing IDs: {missing_ids}"
        )

    return matching_permissions

View File

@@ -21,18 +21,21 @@ from onyx.utils.logger import setup_logger
logger = setup_logger()
PERMISSION_FULL_DESCRIPTION = (
"permissions(id, emailAddress, type, domain, permissionDetails)"
)
FILE_FIELDS = (
"nextPageToken, files(mimeType, id, name, permissions, modifiedTime, webViewLink, "
"shortcutDetails, owners(emailAddress), size)"
)
SLIM_FILE_FIELDS = (
"nextPageToken, files(mimeType, driveId, id, name, permissions(emailAddress, type, domain), "
f"nextPageToken, files(mimeType, driveId, id, name, {PERMISSION_FULL_DESCRIPTION}, "
"permissionIds, webViewLink, owners(emailAddress))"
)
FOLDER_FIELDS = "nextPageToken, files(id, name, permissions, modifiedTime, webViewLink, shortcutDetails)"
def _generate_time_range_filter(
def generate_time_range_filter(
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> str:
@@ -81,7 +84,7 @@ def _get_files_in_parent(
) -> Iterator[GoogleDriveFileType]:
query = f"mimeType != '{DRIVE_FOLDER_TYPE}' and '{parent_id}' in parents"
query += " and trashed = false"
query += _generate_time_range_filter(start, end)
query += generate_time_range_filter(start, end)
for file in execute_paginated_retrieval(
retrieval_function=service.files().list,
@@ -204,7 +207,7 @@ def get_files_in_shared_drive(
# Get all files in the shared drive
file_query = f"mimeType != '{DRIVE_FOLDER_TYPE}'"
file_query += " and trashed = false"
file_query += _generate_time_range_filter(start, end)
file_query += generate_time_range_filter(start, end)
for file in execute_paginated_retrieval(
retrieval_function=service.files().list,
@@ -264,7 +267,7 @@ def get_all_files_in_my_drive_and_shared(
file_query += " and trashed = false"
if not include_shared_with_me:
file_query += " and 'me' in owners"
file_query += _generate_time_range_filter(start, end)
file_query += generate_time_range_filter(start, end)
yield from execute_paginated_retrieval(
retrieval_function=service.files().list,
list_key="files",
@@ -297,7 +300,7 @@ def get_all_files_for_oauth(
file_query = f"mimeType != '{DRIVE_FOLDER_TYPE}'"
file_query += " and trashed = false"
file_query += _generate_time_range_filter(start, end)
file_query += generate_time_range_filter(start, end)
if not should_get_all:
if include_files_shared_with_me and not include_my_drives:

View File

@@ -2368,6 +2368,21 @@ class User__ExternalUserGroupId(Base):
)
class PublicExternalUserGroup(Base):
    """Stores all public external user "groups".

    For example, things like Google Drive folders that are marked
    as `Anyone with the link` or `Anyone in the domain`
    """

    __tablename__ = "public_external_user_group"

    # ID of the external group as represented within Onyx.
    external_user_group_id: Mapped[str] = mapped_column(String, primary_key=True)
    # Rows are removed automatically when the cc_pair is deleted (CASCADE).
    cc_pair_id: Mapped[int] = mapped_column(
        ForeignKey("connector_credential_pair.id", ondelete="CASCADE"), primary_key=True
    )
class UsageReport(Base):
"""This stores metadata about usage reports generated by admin including user who generated
them as well las the period they cover. The actual zip file of the report is stored as a lo

View File

@@ -62,6 +62,28 @@ def parse_credentials(env_str: str) -> dict:
return json.loads(unescaped)
def get_credentials_from_env(email: str, oauth: bool) -> dict:
    """Load Google credentials for `email` from environment variables.

    Args:
        email: Account whose credentials should be loaded.
        oauth: If True, read OAuth credentials; otherwise service-account ones.

    Returns:
        A credentials dict suitable for `load_credentials`.
    """
    env_var_map = (
        _USER_TO_OAUTH_CREDENTIALS_MAP
        if oauth
        else _USER_TO_SERVICE_ACCOUNT_CREDENTIALS_MAP
    )
    raw_credential_string = os.environ[env_var_map[email]]

    # Parse and re-serialize so the stored value is normalized JSON.
    refried_credential_string = json.dumps(parse_credentials(raw_credential_string))

    if oauth:
        cred_key = DB_CREDENTIALS_DICT_TOKEN_KEY
    else:
        cred_key = DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY

    return {
        cred_key: refried_credential_string,
        DB_CREDENTIALS_PRIMARY_ADMIN_KEY: email,
        DB_CREDENTIALS_AUTHENTICATION_METHOD: GoogleOAuthAuthenticationMethod.UPLOADED.value,
    }
@pytest.fixture
def google_drive_oauth_uploaded_connector_factory() -> (
Callable[..., GoogleDriveConnector]
@@ -85,13 +107,7 @@ def google_drive_oauth_uploaded_connector_factory() -> (
shared_folder_urls=shared_folder_urls,
)
json_string = os.environ[_USER_TO_OAUTH_CREDENTIALS_MAP[primary_admin_email]]
refried_json_string = json.dumps(parse_credentials(json_string))
credentials_json = {
DB_CREDENTIALS_DICT_TOKEN_KEY: refried_json_string,
DB_CREDENTIALS_PRIMARY_ADMIN_KEY: primary_admin_email,
DB_CREDENTIALS_AUTHENTICATION_METHOD: GoogleOAuthAuthenticationMethod.UPLOADED.value,
}
credentials_json = get_credentials_from_env(primary_admin_email, oauth=True)
connector.load_credentials(credentials_json)
return connector
@@ -123,19 +139,11 @@ def google_drive_service_acct_connector_factory() -> (
specific_user_emails=specific_user_emails,
)
json_string = os.environ[
_USER_TO_SERVICE_ACCOUNT_CREDENTIALS_MAP[primary_admin_email]
]
refried_json_string = json.dumps(parse_credentials(json_string))
# Load Service Account Credentials
connector.load_credentials(
{
DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY: refried_json_string,
DB_CREDENTIALS_PRIMARY_ADMIN_KEY: primary_admin_email,
DB_CREDENTIALS_AUTHENTICATION_METHOD: GoogleOAuthAuthenticationMethod.UPLOADED.value,
}
credentials_json = get_credentials_from_env(
email=primary_admin_email, oauth=False
)
connector.load_credentials(credentials_json)
return connector
return _connector_factory

View File

@@ -0,0 +1,68 @@
{
"12": "https://drive.google.com/file/d/1u7nynrG4WuFZeuZs8yyhqJF_lbo-op-m/view?usp=drivesdk",
"10": "https://drive.google.com/file/d/1LFcVuXuXIdNJ7hkL0C40eYn_cQtryUVQ/view?usp=drivesdk",
"13": "https://drive.google.com/file/d/1muQMyYAJe0_F-HiDFIfFMt-4qsgMlREM/view?usp=drivesdk",
"11": "https://drive.google.com/file/d/1oHNtlsdJJtk7dE10NgH83Kn5_f2L-Su1/view?usp=drivesdk",
"14": "https://drive.google.com/file/d/1sAw-DrsqpnqLF5A8P59BZwIpt9-LrlaL/view?usp=drivesdk",
"18": "https://drive.google.com/file/d/1qqKH3esasdqV6ryEhdoSQezDPlKj11At/view?usp=drivesdk",
"17": "https://drive.google.com/file/d/1z08VsrCUTozpc5Quzb7mEDUwNkXU3foT/view?usp=drivesdk",
"15": "https://drive.google.com/file/d/1QQ6ZGyYP49IJNeGKNmqZISyVLzTOtK4v/view?usp=drivesdk",
"19": "https://drive.google.com/file/d/172as_pb7E15bXUd63mIIBRotk_tT7h56/view?usp=drivesdk",
"16": "https://drive.google.com/file/d/1552S6HEjJ81q8JXr46BtixQiVq9xlW_I/view?usp=drivesdk",
"5": "https://drive.google.com/file/d/1sv9epxLcNlgM6C-oPDeD_heFw7AIZMgp/view?usp=drivesdk",
"7": "https://drive.google.com/file/d/1S_S0LpQW90EUPPPjJX4jfu5p9gOQjiQF/view?usp=drivesdk",
"9": "https://drive.google.com/file/d/1wH2dBrWzmiGJ88ySHWu6srb7Jsj7qYbA/view?usp=drivesdk",
"8": "https://drive.google.com/file/d/14URUm6RKSZziH1lUtT6gs-xnCTWkXpSn/view?usp=drivesdk",
"6": "https://drive.google.com/file/d/1LBKBuTMRSss-kVw8ut3rMk51wSbTM95j/view?usp=drivesdk",
"3": "https://drive.google.com/file/d/1nNazkPrkuRXHFOl8gdA68pU2g8cy-h6n/view?usp=drivesdk",
"2": "https://drive.google.com/file/d/1miG_QpqXe2QIMApcrlNzaB6fsXW5WMFX/view?usp=drivesdk",
"4": "https://drive.google.com/file/d/1o-i8can6ciL1XXzy2pVUPHZEXEjBJi6C/view?usp=drivesdk",
"0": "https://drive.google.com/file/d/1d3Y59Sns8I0FIW9CtOAjVVLE2MEe_3nP/view?usp=drivesdk",
"1": "https://drive.google.com/file/d/1ipSqxJajs_NkfSKFxgltIMNc0ffdt-NX/view?usp=drivesdk",
"68": "https://drive.google.com/file/d/1rCBZsbhQ-ULWGztiKB0JYhFth9EChiSZ/view?usp=drivesdk",
"66": "https://drive.google.com/file/d/1WVAlbWcu9-Braa0aG6w3cShrY5dbIYcY/view?usp=drivesdk",
"67": "https://drive.google.com/file/d/1p44poOCdNLnVYMxTL9b3h-BXsOQ2RDgM/view?usp=drivesdk",
"69": "https://drive.google.com/file/d/1HFYsaqC14aE-EaobQdwkw0FOlAYMYqkV/view?usp=drivesdk",
"65": "https://drive.google.com/file/d/1RyE07CpTIDYMO3b-atwjWH6ZHFDjyoCl/view?usp=drivesdk",
"32": "https://drive.google.com/file/d/17egJ5W-0bvS2akLBqvxylTIViN0d9nG7/view?usp=drivesdk",
"28": "https://drive.google.com/file/d/1HNqSM2XGqgHnyNYT5wp8hyski18HMcfO/view?usp=drivesdk",
"37": "https://drive.google.com/file/d/16Tdu3gveWkFL0VBUzYSzKxFO4ffv-8h7/view?usp=drivesdk",
"30": "https://drive.google.com/file/d/1uj69jGyYnNOXXqKmLNIp-4KKrVC1qaPy/view?usp=drivesdk",
"25": "https://drive.google.com/file/d/1bw6NFlR4ZxOV6reQK1Oqeq_UaYFVpNV6/view?usp=drivesdk",
"33": "https://drive.google.com/file/d/1FkmXBkt__lOFXg_uhxLI0QIuxWbIGySL/view?usp=drivesdk",
"20": "https://drive.google.com/file/d/1r77uBVOHkuiDQFa9iz9FU8QbfjImOAjF/view?usp=drivesdk",
"24": "https://drive.google.com/file/d/1kwLrdhTgCdjNrOcSwRI14K3gXnS48xne/view?usp=drivesdk",
"39": "https://drive.google.com/file/d/1V3av9F47t44Nf3jcO12U6OIsjsX-B7L1/view?usp=drivesdk",
"29": "https://drive.google.com/file/d/172dCAUNaaoZX0RHqEi7Ev12eV930LtTa/view?usp=drivesdk",
"31": "https://drive.google.com/file/d/17zzfgMSWBVebWGnpSHKd6g1LFN4vn-YP/view?usp=drivesdk",
"38": "https://drive.google.com/file/d/1xOQvIBlBJ2swTGp78WkCZJUQ-d1F8pVu/view?usp=drivesdk",
"23": "https://drive.google.com/file/d/1X89y_CoTWWjh3BWq0ZgeGydCvg3gMZeJ/view?usp=drivesdk",
"34": "https://drive.google.com/file/d/1VNDhcbA_-Ckjp084hKyl9bwP4E3l9K_2/view?usp=drivesdk",
"47": "https://drive.google.com/file/d/1O8E7haA8WcJIma0iKcvebd4_dlC5Zr7S/view?usp=drivesdk",
"52": "https://drive.google.com/file/d/1o-ateliXHj4TyugOxb9zYYXwrkhFl4FX/view?usp=drivesdk",
"27": "https://drive.google.com/file/d/1aZ1CwNVWJt_OtIBVO-9zv1UUqXTDlM1F/view?usp=drivesdk",
"26": "https://drive.google.com/file/d/1qegrc27hYeECs0KexnEuuG0WQm-8Y9oZ/view?usp=drivesdk",
"59": "https://drive.google.com/file/d/1L9oWKHMTjQreGW_k8rNy7kBQ7c0FuXFm/view?usp=drivesdk",
"35": "https://drive.google.com/file/d/1NewjF092B9KKDBs-dpnZ9dzVl2GAs2LW/view?usp=drivesdk",
"49": "https://drive.google.com/file/d/1TsUrBlr2nxJtH122nKQ_GzdMc0DFFERB/view?usp=drivesdk",
"41": "https://drive.google.com/file/d/1gc2Vo3HZF-Bm_WhZ0zyFedWNfVL2BEol/view?usp=drivesdk",
"22": "https://drive.google.com/file/d/1iPfQeganYriuqHO2e5npUPeuX5VIbhG3/view?usp=drivesdk",
"36": "https://drive.google.com/file/d/1KyNoHRTfGMNR15dCRpcVW74l2z-wVm0V/view?usp=drivesdk",
"44": "https://drive.google.com/file/d/1PDuxwmrD20s54FHQIhXn3ucdFmXSX5kS/view?usp=drivesdk",
"21": "https://drive.google.com/file/d/1ZwO5cCfBJgGpZTIpoi8p2js8zuHT_qxe/view?usp=drivesdk",
"53": "https://drive.google.com/file/d/140NZAuAOoiqrNVqWmF4TPNv6njd_guwE/view?usp=drivesdk",
"50": "https://drive.google.com/file/d/1MBmy7nQi7pMwwIPZHJjB_iuQeO07QWsN/view?usp=drivesdk",
"54": "https://drive.google.com/file/d/1TtIJ-ULYWyv0yUvUVdfTPuBNlBt_j1Yd/view?usp=drivesdk",
"57": "https://drive.google.com/file/d/19V5d3NcR029AhGiRibk2nlTmFNCVGBgO/view?usp=drivesdk",
"43": "https://drive.google.com/file/d/1kLChcxIWZS_kHLEHThLcm7ekcgwYP0jF/view?usp=drivesdk",
"42": "https://drive.google.com/file/d/1HKW3C1B5vFYUuXmFieMKYAfq4CwtnEZ_/view?usp=drivesdk",
"48": "https://drive.google.com/file/d/1EJGd47XpWZDXJKWU0CGp84Hm7K47GNVt/view?usp=drivesdk",
"40": "https://drive.google.com/file/d/1Fr4dVKdOvth_O-Td8PTwgNGzZz8ridAl/view?usp=drivesdk",
"58": "https://drive.google.com/file/d/1lUFpiwE7ISzLbowHvCtEUj4sfG4w0Gst/view?usp=drivesdk",
"51": "https://drive.google.com/file/d/1V6fOoKgA8QSTJYWPP5GVHz8WFAQIRLNB/view?usp=drivesdk",
"45": "https://drive.google.com/file/d/1hSrPOwyxFEth4GWWN1e4BjBftmnKa8px/view?usp=drivesdk",
"46": "https://drive.google.com/file/d/1jCynzDt1r0EISpwcrFuk3RlKWHM9u7Mj/view?usp=drivesdk",
"55": "https://drive.google.com/file/d/1Db01f4I_Xn8Bs9piQgZU59ZWAeC2MaQm/view?usp=drivesdk",
"56": "https://drive.google.com/file/d/1NxVfwIxm6FVVR1XnxQNMWWbQEVX66cQm/view?usp=drivesdk",
"61": "https://docs.google.com/document/d/1eAaZJAqjXMZ2VvG_r04EGtn6EGcYycofdNUkDHEA8vY/edit?usp=drivesdk"
}

View File

@@ -0,0 +1,157 @@
import copy
import json
import os
from collections import defaultdict
from collections.abc import Callable
from unittest.mock import MagicMock
from unittest.mock import patch
from ee.onyx.external_permissions.google_drive.doc_sync import gdrive_doc_sync
from ee.onyx.external_permissions.google_drive.group_sync import gdrive_group_sync
from onyx.connectors.google_drive.connector import GoogleDriveConnector
from onyx.db.models import ConnectorCredentialPair
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from tests.daily.connectors.google_drive.consts_and_utils import ACCESS_MAPPING
from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_EMAIL
from tests.daily.connectors.google_drive.consts_and_utils import PUBLIC_RANGE
def _build_connector(
    google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector],
) -> GoogleDriveConnector:
    """Create a service-account Drive connector configured for perm-sync tests."""
    drive_connector = google_drive_service_acct_connector_factory(
        primary_admin_email=ADMIN_EMAIL,
        include_shared_drives=True,
        include_my_drives=True,
        include_files_shared_with_me=False,
        shared_folder_urls=None,
        shared_drive_urls=None,
        my_drive_emails=None,
    )
    # Credentials were already loaded by the factory; stub the method out so
    # any later call in the code under test becomes a harmless no-op.
    drive_connector.load_credentials = MagicMock()  # type: ignore
    return drive_connector
def test_gdrive_perm_sync_with_real_data(
    google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector],
) -> None:
    """
    Test gdrive_doc_sync and gdrive_group_sync with real data from the test drive.
    This test uses the real connector to make actual API calls to Google Drive
    and verifies the permission structure returned.
    """
    # Create a mock cc_pair that will use our real connector.
    # Only the attributes that doc_sync/group_sync actually read are populated.
    mock_cc_pair = MagicMock(spec=ConnectorCredentialPair)
    mock_cc_pair.connector = MagicMock()
    mock_cc_pair.connector.connector_specific_config = {}
    mock_cc_pair.credential_id = 1
    mock_cc_pair.credential.credential_json = {}
    mock_cc_pair.last_time_perm_sync = None
    mock_cc_pair.last_time_external_group_sync = None
    # Create a mock heartbeat that never requests early termination
    mock_heartbeat = MagicMock(spec=IndexingHeartbeatInterface)
    mock_heartbeat.should_stop.return_value = False
    # Load drive_id_mapping.json (test file number -> Drive URL, generated by
    # the manual mapping test)
    with open(
        os.path.join(os.path.dirname(__file__), "drive_id_mapping.json"), "r"
    ) as f:
        drive_id_mapping = json.load(f)
    # Invert the mapping to get URL -> ID
    url_to_id_mapping = {url: int(id) for id, url in drive_id_mapping.items()}
    # Use the connector directly without mocking Google Drive API calls;
    # only the connector *construction* inside doc_sync is patched.
    with patch(
        "ee.onyx.external_permissions.google_drive.doc_sync.GoogleDriveConnector",
        return_value=_build_connector(google_drive_service_acct_connector_factory),
    ):
        # Call the function under test
        doc_access_generator = gdrive_doc_sync(mock_cc_pair, lambda: [], mock_heartbeat)
        doc_access_list = list(doc_access_generator)
    # create new connector (the previous one may hold exhausted state)
    with patch(
        "ee.onyx.external_permissions.google_drive.group_sync.GoogleDriveConnector",
        return_value=_build_connector(google_drive_service_acct_connector_factory),
    ):
        external_user_groups = gdrive_group_sync("test_tenant", mock_cc_pair)
    # Verify we got some results
    assert len(doc_access_list) > 0
    print(f"Found {len(doc_access_list)} documents with permissions")
    # map group ids to emails
    group_id_to_email_mapping: dict[str, set[str]] = defaultdict(set)
    groups_with_anyone_access: set[str] = set()
    for group in external_user_groups:
        for email in group.user_emails:
            group_id_to_email_mapping[group.id].add(email)
        if group.gives_anyone_access:
            groups_with_anyone_access.add(group.id)
    # Map documents to their permissions (flattening groups)
    doc_to_email_mapping: dict[str, set[str]] = {}
    # raw (unflattened) emails + group ids, kept only for assertion messages
    doc_to_raw_result_mapping: dict[str, set[str]] = {}
    public_doc_ids: set[str] = set()
    for doc_access in doc_access_list:
        doc_id = doc_access.doc_id
        # make sure they are new sets to avoid mutating the original
        doc_to_email_mapping[doc_id] = copy.deepcopy(
            doc_access.external_access.external_user_emails
        )
        doc_to_raw_result_mapping[doc_id] = copy.deepcopy(
            doc_access.external_access.external_user_emails
        )
        for group_id in doc_access.external_access.external_user_group_ids:
            doc_to_email_mapping[doc_id].update(group_id_to_email_mapping[group_id])
            doc_to_raw_result_mapping[doc_id].add(group_id)
        if doc_access.external_access.is_public:
            public_doc_ids.add(doc_id)
        # a doc is also effectively public if any of its groups grants
        # "anyone" access
        if any(
            group_id in groups_with_anyone_access
            for group_id in doc_access.external_access.external_user_group_ids
        ):
            public_doc_ids.add(doc_id)
    # Check permissions based on drive_id_mapping.json and ACCESS_MAPPING
    # For each document URL that exists in our mapping
    checked_files = 0
    for doc_id, emails_with_access in doc_to_email_mapping.items():
        # Skip URLs that aren't in our mapping, we don't want new stuff to interfere
        # with the test.
        if doc_id not in url_to_id_mapping:
            continue
        file_numeric_id = url_to_id_mapping.get(doc_id)
        if file_numeric_id is None:
            raise ValueError(f"File {doc_id} not found in drive_id_mapping.json")
        checked_files += 1
        # Check which users should have access to this file according to ACCESS_MAPPING
        expected_users = set()
        for user_email, file_ids in ACCESS_MAPPING.items():
            if file_numeric_id in file_ids:
                expected_users.add(user_email)
        # Verify the permissions match
        if file_numeric_id in PUBLIC_RANGE:
            assert (
                doc_id in public_doc_ids
            ), f"File {doc_id} (ID: {file_numeric_id}) should be public but is not in the public_doc_ids set"
        else:
            assert expected_users == emails_with_access, (
                f"File {doc_id} (ID: {file_numeric_id}) should be accessible to users {expected_users} "
                f"but is accessible to {emails_with_access}. Raw result: {doc_to_raw_result_mapping[doc_id]} "
            )
    print(f"Checked permissions for {checked_files} files from drive_id_mapping.json")

View File

@@ -0,0 +1,130 @@
#!/usr/bin/env python
import json
import os
import pytest
from onyx.connectors.google_drive.connector import GoogleDriveConnector
from tests.daily.connectors.google_drive.conftest import get_credentials_from_env
from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_EMAIL
from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import file_name_template
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_2_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_1_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_2_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_3_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import load_all_docs
from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_1_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_2_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_1_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_2_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_3_FILE_IDS
def generate_test_id_to_drive_id_mapping() -> dict[int, str]:
    """
    Generate a mapping from test file IDs to actual Google Drive file IDs.
    This is useful for writing tests that need to verify specific files
    are accessible to specific users.
    Returns:
        dict: Mapping from test file ID (int) to Google Drive file ID (str)
    """
    # Set up the connector with real service-account credentials
    connector = GoogleDriveConnector(
        include_shared_drives=True,
        include_my_drives=True,
        include_files_shared_with_me=False,
    )
    connector.load_credentials(get_credentials_from_env(email=ADMIN_EMAIL, oauth=False))

    # Pull every document visible to the connector
    docs = load_all_docs(connector)

    # Test files follow file_name_template ("file_<N>.txt"); everything before
    # the first underscore is the shared prefix used to recognize them.
    expected_prefix = file_name_template.format("").split("_")[0]

    test_id_to_drive_id: dict[int, str] = {}
    for doc in docs:
        if not doc.semantic_identifier.startswith(expected_prefix):
            continue
        try:
            # "file_X.txt" -> "X" -> int; doc.id is the Drive-side identifier
            numeric_part = doc.semantic_identifier.split("_")[1].split(".")[0]
            test_id_to_drive_id[int(numeric_part)] = doc.id
        except (ValueError, IndexError):
            # Skip files that don't follow our naming convention
            continue

    # Print the mapping for all defined test file ID ranges
    all_test_ranges = {
        "ADMIN_FILE_IDS": ADMIN_FILE_IDS,
        "TEST_USER_1_FILE_IDS": TEST_USER_1_FILE_IDS,
        "TEST_USER_2_FILE_IDS": TEST_USER_2_FILE_IDS,
        "TEST_USER_3_FILE_IDS": TEST_USER_3_FILE_IDS,
        "SHARED_DRIVE_1_FILE_IDS": SHARED_DRIVE_1_FILE_IDS,
        "SHARED_DRIVE_2_FILE_IDS": SHARED_DRIVE_2_FILE_IDS,
        "FOLDER_1_FILE_IDS": FOLDER_1_FILE_IDS,
        "FOLDER_1_1_FILE_IDS": FOLDER_1_1_FILE_IDS,
        "FOLDER_1_2_FILE_IDS": FOLDER_1_2_FILE_IDS,
        "FOLDER_2_FILE_IDS": FOLDER_2_FILE_IDS,
        "FOLDER_2_1_FILE_IDS": FOLDER_2_1_FILE_IDS,
        "FOLDER_2_2_FILE_IDS": FOLDER_2_2_FILE_IDS,
        "FOLDER_3_FILE_IDS": FOLDER_3_FILE_IDS,
    }
    for range_name, file_ids in all_test_ranges.items():
        print(f"\n{range_name}:")
        for test_id in file_ids:
            drive_id = test_id_to_drive_id.get(test_id, "NOT_FOUND")
            print(f"  {test_id} -> {drive_id}")

    return test_id_to_drive_id
@pytest.mark.skipif(
    not os.getenv("RUN_MANUAL_TESTS"),
    reason="This test maps test IDs to actual Google Drive IDs. Set RUN_MANUAL_TESTS=1 to run.",
)
def test_generate_drive_id_mapping() -> None:
    """Test to generate mapping from test IDs to actual Google Drive IDs.

    This test is skipped by default as it requires real Google Drive credentials
    and is primarily used to generate mappings for other tests.

    Run with (any non-empty RUN_MANUAL_TESTS value enables it):
    RUN_MANUAL_TESTS=true pytest -xvs tests/daily/connectors/google_drive/test_map_test_ids.py::test_generate_drive_id_mapping
    """
    mapping = generate_test_id_to_drive_id_mapping()
    assert mapping, "Failed to generate any test ID to drive ID mappings"
    # Write the mapping to a JSON file next to this test module
    output_dir = os.path.dirname(os.path.abspath(__file__))
    mapping_file = os.path.join(output_dir, "drive_id_mapping.json")
    # Convert int keys to strings for JSON compatibility
    json_mapping = {str(k): v for k, v in mapping.items()}
    # Write the mapping to a JSON file
    with open(mapping_file, "w") as f:
        json.dump(json_mapping, f, indent=2)
    print(f"\nMapping written to: {mapping_file}")
    # NOTE(review): raising appears deliberate — it forces pytest to surface
    # this manual run so the refreshed mapping file is not silently ignored.
    raise RuntimeError("Mapping written to file, test complete")

View File

@@ -1,209 +0,0 @@
import time
from collections.abc import Callable
from unittest.mock import MagicMock
from unittest.mock import patch
from ee.onyx.external_permissions.google_drive.doc_sync import (
_get_permissions_from_slim_doc,
)
from onyx.access.models import ExternalAccess
from onyx.connectors.google_drive.connector import GoogleDriveConnector
from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
from onyx.connectors.google_utils.resources import get_admin_service
from tests.daily.connectors.google_drive.consts_and_utils import ACCESS_MAPPING
from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_EMAIL
from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_FOLDER_3_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import file_name_template
from tests.daily.connectors.google_drive.consts_and_utils import filter_invalid_prefixes
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_2_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_1_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_2_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import print_discrepancies
from tests.daily.connectors.google_drive.consts_and_utils import PUBLIC_RANGE
from tests.daily.connectors.google_drive.consts_and_utils import SECTIONS_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_1_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_2_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_1_EMAIL
from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_1_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_2_EMAIL
from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_2_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_3_EMAIL
from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_3_FILE_IDS
def get_keys_available_to_user_from_access_map(
    user_email: str,
    group_map: dict[str, list[str]],
    access_map: dict[str, ExternalAccess],
) -> list[str]:
    """
    Extracts the names of the files available to the user from the access map
    through their own email or group memberships or public access
    """
    # Groups the user belongs to (group ids double as group emails here)
    user_groups = {
        group_email
        for group_email, member_emails in group_map.items()
        if user_email in member_emails
    }

    accessible_file_names: list[str] = []
    for file_name, access in access_map.items():
        directly_shared = user_email in access.external_user_emails
        shared_via_group = any(
            group_email in access.external_user_group_ids
            for group_email in user_groups
        )
        if access.is_public or directly_shared or shared_via_group:
            accessible_file_names.append(file_name)
    return accessible_file_names
def assert_correct_access_for_user(
    user_email: str,
    expected_access_ids: list[int],
    group_map: dict[str, list[str]],
    retrieved_access_map: dict[str, ExternalAccess],
) -> None:
    """
    compares the expected access range of the user to the keys available to the user
    retrieved from the source
    """
    retrieved_file_names = set(
        get_keys_available_to_user_from_access_map(
            user_email, group_map, retrieved_access_map
        )
    )

    # Every file in PUBLIC_RANGE is visible to every user, so fold it into
    # the user-specific expectations.
    expected_file_names = {
        file_name_template.format(file_id)
        for file_id in expected_access_ids + PUBLIC_RANGE
    }

    filtered_retrieved_file_names = filter_invalid_prefixes(retrieved_file_names)
    print_discrepancies(expected_file_names, filtered_retrieved_file_names)
    assert expected_file_names == filtered_retrieved_file_names
# This function is supposed to map to the group_sync.py file for the google drive connector
# TODO: Call it directly
def get_group_map(google_drive_connector: GoogleDriveConnector) -> dict[str, list[str]]:
    """Return a mapping of group email -> member emails for the connector's domain."""
    admin_service = get_admin_service(
        creds=google_drive_connector.creds,
        user_email=google_drive_connector.primary_admin_email,
    )

    group_map: dict[str, list[str]] = {}
    for group in execute_paginated_retrieval(
        admin_service.groups().list,
        list_key="groups",
        domain=google_drive_connector.google_domain,
        fields="groups(email),nextPageToken",
    ):
        # The id is the group email
        group_email = group["email"]
        # Gather group member emails via a second paginated listing
        group_map[group_email] = [
            member["email"]
            for member in execute_paginated_retrieval(
                admin_service.members().list,
                list_key="members",
                groupKey=group_email,
                fields="members(email),nextPageToken",
            )
        ]
    return group_map
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_all_permissions(
    mock_get_api_key: MagicMock,
    google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector],
) -> None:
    """End-to-end check of per-file permissions retrieved from the test drive.

    Crawls all slim documents with a service-account connector, resolves each
    file's ExternalAccess, then verifies both the full file inventory and the
    effective access for each known test user (direct, group, and public).
    """
    google_drive_connector = google_drive_service_acct_connector_factory(
        primary_admin_email=ADMIN_EMAIL,
        include_shared_drives=True,
        include_my_drives=True,
        include_files_shared_with_me=False,
        shared_folder_urls=None,
        shared_drive_urls=None,
        my_drive_emails=None,
    )

    # file name -> resolved external access for that file
    access_map: dict[str, ExternalAccess] = {}
    found_file_names = set()
    for slim_doc_batch in google_drive_connector.retrieve_all_slim_documents(
        0, time.time()
    ):
        for slim_doc in slim_doc_batch:
            # the connector stashes the file name in perm_sync_data
            name = (slim_doc.perm_sync_data or {})["name"]
            access_map[name] = _get_permissions_from_slim_doc(
                google_drive_connector=google_drive_connector,
                slim_doc=slim_doc,
            )
            found_file_names.add(name)

    for file_name, external_access in access_map.items():
        print(file_name, external_access)

    expected_file_range = (
        ADMIN_FILE_IDS  # Admin's My Drive
        + ADMIN_FOLDER_3_FILE_IDS  # Admin's Folder 3
        + TEST_USER_1_FILE_IDS  # TEST_USER_1's My Drive
        + TEST_USER_2_FILE_IDS  # TEST_USER_2's My Drive
        + TEST_USER_3_FILE_IDS  # TEST_USER_3's My Drive
        + SHARED_DRIVE_1_FILE_IDS  # Shared Drive 1
        + FOLDER_1_FILE_IDS  # Folder 1
        + FOLDER_1_1_FILE_IDS  # Folder 1_1
        + FOLDER_1_2_FILE_IDS  # Folder 1_2
        + SHARED_DRIVE_2_FILE_IDS  # Shared Drive 2
        + FOLDER_2_FILE_IDS  # Folder 2
        + FOLDER_2_1_FILE_IDS  # Folder 2_1
        + FOLDER_2_2_FILE_IDS  # Folder 2_2
        + SECTIONS_FILE_IDS  # Sections
    )
    expected_file_names = {
        file_name_template.format(file_id) for file_id in expected_file_range
    }

    # Should get everything
    filtered_retrieved_file_names = filter_invalid_prefixes(found_file_names)
    print_discrepancies(expected_file_names, filtered_retrieved_file_names)
    assert expected_file_names == filtered_retrieved_file_names

    group_map = get_group_map(google_drive_connector)
    print("groups:\n", group_map)

    # Per-user access checks: direct shares + group membership + public files
    assert_correct_access_for_user(
        user_email=ADMIN_EMAIL,
        expected_access_ids=ACCESS_MAPPING[ADMIN_EMAIL],
        group_map=group_map,
        retrieved_access_map=access_map,
    )
    assert_correct_access_for_user(
        user_email=TEST_USER_1_EMAIL,
        expected_access_ids=ACCESS_MAPPING[TEST_USER_1_EMAIL],
        group_map=group_map,
        retrieved_access_map=access_map,
    )
    assert_correct_access_for_user(
        user_email=TEST_USER_2_EMAIL,
        expected_access_ids=ACCESS_MAPPING[TEST_USER_2_EMAIL],
        group_map=group_map,
        retrieved_access_map=access_map,
    )
    assert_correct_access_for_user(
        user_email=TEST_USER_3_EMAIL,
        expected_access_ids=ACCESS_MAPPING[TEST_USER_3_EMAIL],
        group_map=group_map,
        retrieved_access_map=access_map,
    )