mirror of
https://github.com/onyx-dot-app/onyx.git
synced 2026-04-09 00:42:47 +00:00
Compare commits
6 Commits
cli/v0.2.1
...
temp/pr-73
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7d92362c15 | ||
|
|
3dc6389190 | ||
|
|
a0c1438200 | ||
|
|
c65d3741c2 | ||
|
|
1748c0c0e4 | ||
|
|
9230997069 |
@@ -100,6 +100,15 @@ SHAREPOINT_PERMISSION_GROUP_SYNC_FREQUENCY = int(
|
||||
)
|
||||
|
||||
|
||||
#####
|
||||
# Box
|
||||
#####
|
||||
# In seconds, default is 30 minutes
|
||||
BOX_PERMISSION_GROUP_SYNC_FREQUENCY = int(
|
||||
os.environ.get("BOX_PERMISSION_GROUP_SYNC_FREQUENCY") or 30 * 60
|
||||
)
|
||||
|
||||
|
||||
####
|
||||
# Celery Job Frequency
|
||||
####
|
||||
|
||||
1
backend/ee/onyx/external_permissions/box/__init__.py
Normal file
1
backend/ee/onyx/external_permissions/box/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Box external permissions module for syncing document permissions."""
|
||||
170
backend/ee/onyx/external_permissions/box/doc_sync.py
Normal file
170
backend/ee/onyx/external_permissions/box/doc_sync.py
Normal file
@@ -0,0 +1,170 @@
|
||||
from collections.abc import Generator
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
|
||||
from box_sdk_gen.client import BoxClient
|
||||
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsIdsFunction
|
||||
from onyx.access.models import DocExternalAccess
|
||||
from onyx.access.models import ExternalAccess
|
||||
from onyx.connectors.box.connector import BoxConnector
|
||||
from onyx.connectors.box.models import BoxFileType
|
||||
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
|
||||
from onyx.db.models import ConnectorCredentialPair
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def get_external_access_for_raw_box_file(
    file: BoxFileType,
    company_domain: str | None,
    retriever_box_client: BoxClient | None,
    admin_box_client: BoxClient,
) -> ExternalAccess:
    """Derive an ExternalAccess record from a Box file's collaborations.

    Box grants access through collaborations, which come in three flavors:
      - user collaborations: direct access for specific users (by email/login)
      - group collaborations: access for groups (by group ID)
      - public links: shared links that may be publicly accessible
    """
    file_id = file.get("id")
    if not file_id:
        raise ValueError("No file_id found in file")

    emails: set[str] = set()
    group_ids: set[str] = set()
    is_public = False

    # Prefer the admin client since it has the broadest visibility.
    client = admin_box_client or retriever_box_client
    if not client:
        logger.warning(f"No Box client available for file {file_id}")
        return ExternalAccess(
            external_user_emails=set(),
            external_user_group_ids=set(),
            is_public=False,
        )

    try:
        collaborations = client.collaborations.get_file_collaborations(
            file_id=file_id
        )

        for collab in collaborations.entries:
            accessor = collab.accessible_by

            if accessor:
                # User collaborations carry a login (preferred) or email.
                if hasattr(accessor, "login") and accessor.login:
                    emails.add(accessor.login)
                elif hasattr(accessor, "email") and accessor.email:
                    emails.add(accessor.email)

                # Group collaborations expose a name but no login attribute.
                if hasattr(accessor, "name") and not hasattr(accessor, "login"):
                    if hasattr(accessor, "id") and accessor.id:
                        group_ids.add(str(accessor.id))

            if accessor is None:
                # Public-link collaborations have no accessible_by entity.
                if (
                    hasattr(collab, "status")
                    and collab.status == "accepted"
                    and file.get("shared_link")
                ):
                    is_public = True

    except Exception as e:
        logger.warning(
            f"Failed to get collaborations for Box file {file_id}: {e}. "
            "Returning minimal access (file owner retains access via retriever user)."
        )

    # A shared link only makes the file public when its access level is
    # "open" and it is not password-protected.
    shared_link = file.get("shared_link")
    if isinstance(shared_link, dict):
        if shared_link.get("access") == "open" and not shared_link.get("password"):
            is_public = True
    # Legacy string shared links carry no access information, so they are
    # ignored; publicness then comes only from the collaboration scan above.

    return ExternalAccess(
        external_user_emails=emails,
        external_user_group_ids=group_ids,
        is_public=is_public,
    )
def _get_slim_doc_generator(
    cc_pair: ConnectorCredentialPair,
    box_connector: BoxConnector,
    callback: IndexingHeartbeatInterface | None = None,
) -> GenerateSlimDocumentOutput:
    """Build a slim-document generator covering the window since the last perm sync.

    The window starts at the cc_pair's last permission sync time (epoch 0.0 if
    it has never synced) and ends at the current UTC time.
    """
    now = datetime.now(timezone.utc)
    last_sync = cc_pair.last_time_perm_sync
    # last_time_perm_sync is stored naive; treat it as UTC.
    window_start = (
        last_sync.replace(tzinfo=timezone.utc).timestamp() if last_sync else 0.0
    )

    return box_connector.retrieve_all_slim_docs_perm_sync(
        start=window_start,
        end=now.timestamp(),
        callback=callback,
    )
def box_doc_sync(
    cc_pair: ConnectorCredentialPair,
    fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
    fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
    callback: IndexingHeartbeatInterface | None,
) -> Generator[DocExternalAccess, None, None]:
    """Sync Box file permissions to documents in the database.

    Retrieves slim documents from Box and extracts their permissions,
    yielding a DocExternalAccess per document that has permissions attached.
    If a document doesn't exist yet, permissions are pre-populated so they
    are available when the document is created.
    """
    connector = BoxConnector(**cc_pair.connector.connector_specific_config)
    connector.load_credentials(cc_pair.credential.credential_json)

    for batch in _get_slim_doc_generator(cc_pair, connector, callback=callback):
        for slim_doc in batch:
            if callback:
                # Honor the external stop signal and report progress.
                if callback.should_stop():
                    raise RuntimeError("box_doc_sync: Stop signal detected")
                callback.progress("box_doc_sync", 1)

            if slim_doc.external_access is None:
                logger.warning(f"No permissions found for document {slim_doc.id}")
                continue

            yield DocExternalAccess(
                doc_id=slim_doc.id,
                external_access=slim_doc.external_access,
            )
84
backend/ee/onyx/external_permissions/box/group_sync.py
Normal file
84
backend/ee/onyx/external_permissions/box/group_sync.py
Normal file
@@ -0,0 +1,84 @@
|
||||
from collections.abc import Generator
|
||||
|
||||
from ee.onyx.db.external_perm import ExternalUserGroup
|
||||
from onyx.connectors.box.connector import BoxConnector
|
||||
from onyx.db.models import ConnectorCredentialPair
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def box_group_sync(
    tenant_id: str,
    cc_pair: ConnectorCredentialPair,
) -> Generator[ExternalUserGroup, None, None]:
    """Sync Box groups and their members.

    Fetches all groups visible to the authenticated Box enterprise and yields
    an ExternalUserGroup (group ID plus member emails) for each group that has
    at least one member with an email/login.
    """
    # Build an authenticated connector for this cc_pair.
    connector = BoxConnector(**cc_pair.connector.connector_specific_config)
    connector.load_credentials(cc_pair.credential.credential_json)
    client = connector.box_client

    logger.info("Starting Box group sync...")

    try:
        # Enumerate all groups in the enterprise (Box API: GET /groups).
        for group in client.groups.get_groups().entries:
            group_id = str(group.id)
            group_name = getattr(group, "name", None) or f"Group_{group_id}"

            logger.debug(f"Processing Box group: {group_name} (ID: {group_id})")

            # Collect member emails (Box API: GET /groups/{group_id}/memberships).
            try:
                memberships = client.groups.get_group_memberships(group_id=group_id)

                member_emails: set[str] = set()
                for membership in memberships.entries:
                    member = getattr(membership, "user", None)
                    if not member:
                        continue
                    email = getattr(member, "login", None) or getattr(
                        member, "email", None
                    )
                    if email:
                        member_emails.add(email)
                    else:
                        logger.warning(
                            f"Group member {getattr(member, 'id', 'unknown')} "
                            f"has no email/login in group {group_name}"
                        )

                if member_emails:
                    logger.info(
                        f"Found {len(member_emails)} members in Box group {group_name}"
                    )
                    yield ExternalUserGroup(
                        id=group_id,
                        user_emails=list(member_emails),
                    )
                else:
                    logger.warning(
                        f"Box group {group_name} (ID: {group_id}) has no members with emails"
                    )

            except Exception as e:
                # One broken group must not abort the whole sync.
                logger.error(
                    f"Error fetching members for Box group {group_name} (ID: {group_id}): {e}"
                )

    except Exception as e:
        logger.error(f"Error during Box group sync: {e}")
        raise
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from ee.onyx.configs.app_configs import BOX_PERMISSION_GROUP_SYNC_FREQUENCY
|
||||
from ee.onyx.configs.app_configs import CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY
|
||||
from ee.onyx.configs.app_configs import CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY
|
||||
from ee.onyx.configs.app_configs import DEFAULT_PERMISSION_DOC_SYNC_FREQUENCY
|
||||
@@ -16,6 +17,8 @@ from ee.onyx.configs.app_configs import SHAREPOINT_PERMISSION_DOC_SYNC_FREQUENCY
|
||||
from ee.onyx.configs.app_configs import SHAREPOINT_PERMISSION_GROUP_SYNC_FREQUENCY
|
||||
from ee.onyx.configs.app_configs import SLACK_PERMISSION_DOC_SYNC_FREQUENCY
|
||||
from ee.onyx.configs.app_configs import TEAMS_PERMISSION_DOC_SYNC_FREQUENCY
|
||||
from ee.onyx.external_permissions.box.doc_sync import box_doc_sync
|
||||
from ee.onyx.external_permissions.box.group_sync import box_group_sync
|
||||
from ee.onyx.external_permissions.confluence.doc_sync import confluence_doc_sync
|
||||
from ee.onyx.external_permissions.confluence.group_sync import confluence_group_sync
|
||||
from ee.onyx.external_permissions.github.doc_sync import github_doc_sync
|
||||
@@ -134,6 +137,18 @@ _SOURCE_TO_SYNC_CONFIG: dict[DocumentSource, SyncConfig] = {
|
||||
initial_index_should_sync=False,
|
||||
),
|
||||
),
|
||||
DocumentSource.BOX: SyncConfig(
|
||||
doc_sync_config=DocSyncConfig(
|
||||
doc_sync_frequency=DEFAULT_PERMISSION_DOC_SYNC_FREQUENCY,
|
||||
doc_sync_func=box_doc_sync,
|
||||
initial_index_should_sync=True,
|
||||
),
|
||||
group_sync_config=GroupSyncConfig(
|
||||
group_sync_frequency=BOX_PERMISSION_GROUP_SYNC_FREQUENCY,
|
||||
group_sync_func=box_group_sync,
|
||||
group_sync_is_cc_pair_agnostic=False,
|
||||
),
|
||||
),
|
||||
DocumentSource.GITHUB: SyncConfig(
|
||||
doc_sync_config=DocSyncConfig(
|
||||
doc_sync_frequency=GITHUB_PERMISSION_DOC_SYNC_FREQUENCY,
|
||||
|
||||
@@ -570,6 +570,9 @@ EGNYTE_CLIENT_SECRET = os.getenv("EGNYTE_CLIENT_SECRET")
|
||||
LINEAR_CLIENT_ID = os.getenv("LINEAR_CLIENT_ID")
|
||||
LINEAR_CLIENT_SECRET = os.getenv("LINEAR_CLIENT_SECRET")
|
||||
|
||||
# Box specific configs
|
||||
BOX_DEVELOPER_TOKEN = os.getenv("BOX_DEVELOPER_TOKEN")
|
||||
|
||||
# Slack specific configs
|
||||
SLACK_NUM_THREADS = int(os.getenv("SLACK_NUM_THREADS") or 8)
|
||||
MAX_SLACK_QUERY_EXPANSIONS = int(os.environ.get("MAX_SLACK_QUERY_EXPANSIONS", "5"))
|
||||
|
||||
@@ -103,6 +103,7 @@ KV_GMAIL_CRED_KEY = "gmail_app_credential"
|
||||
KV_GMAIL_SERVICE_ACCOUNT_KEY = "gmail_service_account_key"
|
||||
KV_GOOGLE_DRIVE_CRED_KEY = "google_drive_app_credential"
|
||||
KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY = "google_drive_service_account_key"
|
||||
KV_BOX_JWT_CONFIG = "box_jwt_config"
|
||||
KV_GEN_AI_KEY_CHECK_TIME = "genai_api_key_last_check_time"
|
||||
KV_SETTINGS_KEY = "onyx_settings"
|
||||
KV_CUSTOMER_UUID_KEY = "customer_uuid"
|
||||
@@ -210,6 +211,7 @@ class DocumentSource(str, Enum):
|
||||
AIRTABLE = "airtable"
|
||||
HIGHSPOT = "highspot"
|
||||
DRUPAL_WIKI = "drupal_wiki"
|
||||
BOX = "box"
|
||||
|
||||
IMAP = "imap"
|
||||
BITBUCKET = "bitbucket"
|
||||
@@ -631,6 +633,7 @@ project management, and collaboration tools into a single, customizable platform
|
||||
DocumentSource.AIRTABLE: "airtable - database",
|
||||
DocumentSource.HIGHSPOT: "highspot - CRM data",
|
||||
DocumentSource.DRUPAL_WIKI: "drupal wiki - knowledge base content (pages, spaces, attachments)",
|
||||
DocumentSource.BOX: "box - files and folders",
|
||||
DocumentSource.IMAP: "imap - email data",
|
||||
DocumentSource.TESTRAIL: "testrail - test case management tool for QA processes",
|
||||
}
|
||||
|
||||
0
backend/onyx/connectors/box/__init__.py
Normal file
0
backend/onyx/connectors/box/__init__.py
Normal file
98
backend/onyx/connectors/box/box_kv.py
Normal file
98
backend/onyx/connectors/box/box_kv.py
Normal file
@@ -0,0 +1,98 @@
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.configs.constants import KV_BOX_JWT_CONFIG
|
||||
from onyx.key_value_store.factory import get_kv_store
|
||||
from onyx.key_value_store.interface import KvKeyNotFoundError
|
||||
from onyx.server.documents.models import CredentialBase
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
# Key for Box JWT config in credentials dict
|
||||
DB_CREDENTIALS_DICT_BOX_JWT_CONFIG = "box_jwt_config"
|
||||
# Key for primary admin user ID in credentials dict
|
||||
DB_CREDENTIALS_PRIMARY_ADMIN_USER_ID = "box_primary_admin_user_id"
|
||||
# Authentication method indicator
|
||||
DB_CREDENTIALS_AUTHENTICATION_METHOD = "authentication_method"
|
||||
BOX_AUTHENTICATION_METHOD_UPLOADED = "uploaded"
|
||||
|
||||
|
||||
class BoxJWTConfig(BaseModel):
    """Box JWT configuration, as parsed from the Box-provided JSON config file."""

    # Raw "boxAppSettings" section of the Box JWT config JSON.
    boxAppSettings: dict[str, Any]
    enterpriseID: str | None = None

    @property
    def _app_auth(self) -> dict[str, Any]:
        # Nested "appAuth" section holding the key material.
        return self.boxAppSettings["appAuth"]

    @property
    def client_id(self) -> str:
        return self.boxAppSettings["clientID"]

    @property
    def client_secret(self) -> str:
        return self.boxAppSettings["clientSecret"]

    @property
    def private_key(self) -> str:
        return self._app_auth["privateKey"]

    @property
    def passphrase(self) -> str | None:
        # Passphrase is optional in Box-generated configs.
        return self._app_auth.get("passphrase")

    @property
    def public_key_id(self) -> str:
        return self._app_auth["publicKeyID"]
def get_box_jwt_config() -> BoxJWTConfig:
    """Load the Box JWT config from the KV store.

    Returns:
        The parsed BoxJWTConfig.

    Raises:
        KvKeyNotFoundError: if no Box JWT config has been uploaded yet.
    """
    # Keep the try body to just the line that can raise KvKeyNotFoundError;
    # chain the re-raise so the original error context is preserved.
    try:
        creds_str = str(get_kv_store().load(KV_BOX_JWT_CONFIG))
    except KvKeyNotFoundError as e:
        raise KvKeyNotFoundError("Box JWT config not found in KV store") from e
    return BoxJWTConfig(**json.loads(creds_str))
def upsert_box_jwt_config(jwt_config: BoxJWTConfig) -> None:
    """Persist the Box JWT config to the KV store, encrypted at rest."""
    kv_store = get_kv_store()
    # The config contains the private key, so it must be stored encrypted.
    kv_store.store(KV_BOX_JWT_CONFIG, jwt_config.model_dump_json(), encrypt=True)
def delete_box_jwt_config() -> None:
    """Remove the stored Box JWT config from the KV store."""
    kv_store = get_kv_store()
    kv_store.delete(KV_BOX_JWT_CONFIG)
def build_box_jwt_creds(
    primary_admin_user_id: str | None = None,
    name: str | None = None,
) -> CredentialBase:
    """Build a CredentialBase for Box JWT auth.

    The JWT config (including the private key) lives encrypted in the KV
    store, not in credential_json, to avoid duplicating sensitive data in
    admin_public credentials. The connector loads it from the KV store when
    needed, so credential_json only records the optional admin user and the
    authentication method.
    """
    credential_json: dict[str, Any] = {}
    if primary_admin_user_id:
        credential_json[DB_CREDENTIALS_PRIMARY_ADMIN_USER_ID] = primary_admin_user_id
    credential_json[DB_CREDENTIALS_AUTHENTICATION_METHOD] = (
        BOX_AUTHENTICATION_METHOD_UPLOADED
    )

    return CredentialBase(
        credential_json=credential_json,
        admin_public=True,
        source=DocumentSource.BOX,
        name=name or "Box JWT (uploaded)",
    )
706
backend/onyx/connectors/box/connector.py
Normal file
706
backend/onyx/connectors/box/connector.py
Normal file
@@ -0,0 +1,706 @@
|
||||
import copy
|
||||
from collections.abc import Iterator
|
||||
from datetime import datetime
|
||||
from functools import partial
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from box_sdk_gen import BoxClient
|
||||
from box_sdk_gen import BoxJWTAuth
|
||||
from box_sdk_gen import JWTConfig
|
||||
from box_sdk_gen.box import BoxAPIError
|
||||
from box_sdk_gen.box import BoxDeveloperTokenAuth
|
||||
from typing_extensions import override
|
||||
|
||||
from onyx.configs.app_configs import BOX_DEVELOPER_TOKEN
|
||||
from onyx.configs.app_configs import GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD
|
||||
from onyx.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from onyx.connectors.box.box_kv import DB_CREDENTIALS_DICT_BOX_JWT_CONFIG
|
||||
from onyx.connectors.box.box_kv import DB_CREDENTIALS_PRIMARY_ADMIN_USER_ID
|
||||
from onyx.connectors.box.doc_conversion import build_slim_document
|
||||
from onyx.connectors.box.doc_conversion import convert_box_item_to_document
|
||||
from onyx.connectors.box.doc_conversion import onyx_document_id_from_box_file
|
||||
from onyx.connectors.box.doc_conversion import PermissionSyncContext
|
||||
from onyx.connectors.box.file_retrieval import crawl_folders_for_files
|
||||
from onyx.connectors.box.file_retrieval import get_all_files_in_folder
|
||||
from onyx.connectors.box.models import BoxCheckpoint
|
||||
from onyx.connectors.box.models import BoxRetrievalStage
|
||||
from onyx.connectors.box.models import RetrievedBoxFile
|
||||
from onyx.connectors.box.models import StageCompletion
|
||||
from onyx.connectors.exceptions import ConnectorValidationError
|
||||
from onyx.connectors.exceptions import CredentialExpiredError
|
||||
from onyx.connectors.exceptions import InsufficientPermissionsError
|
||||
from onyx.connectors.interfaces import CheckpointedConnectorWithPermSync
|
||||
from onyx.connectors.interfaces import CheckpointOutput
|
||||
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
|
||||
from onyx.connectors.interfaces import NormalizationResult
|
||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.interfaces import SlimConnectorWithPermSync
|
||||
from onyx.connectors.models import ConnectorFailure
|
||||
from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import EntityFailure
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.threadpool_concurrency import ThreadSafeDict
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def _sanitize_error_message(error: Exception) -> str:
|
||||
"""Sanitize error message to avoid leaking sensitive data (URLs, tokens, etc.)."""
|
||||
import re
|
||||
|
||||
error_str = str(error)
|
||||
# Remove URLs
|
||||
error_str = re.sub(r"https?://[^\s]+", "[URL_REDACTED]", error_str)
|
||||
# Remove potential tokens (long alphanumeric strings)
|
||||
error_str = re.sub(r"\b[a-zA-Z0-9]{32,}\b", "[TOKEN_REDACTED]", error_str)
|
||||
return error_str
|
||||
|
||||
|
||||
def _parse_box_datetime_to_timestamp(modified_time_str: str | None) -> float | None:
|
||||
"""Parse Box datetime string to Unix timestamp."""
|
||||
if not modified_time_str:
|
||||
return None
|
||||
try:
|
||||
mod_dt = datetime.fromisoformat(modified_time_str.replace("Z", "+00:00"))
|
||||
return mod_dt.timestamp()
|
||||
except (ValueError, AttributeError):
|
||||
return None
|
||||
|
||||
|
||||
def _extract_str_list_from_comma_str(string: str | None) -> list[str]:
|
||||
"""Extract list of strings from comma-separated string."""
|
||||
if not string:
|
||||
return []
|
||||
return [s.strip() for s in string.split(",") if s.strip()]
|
||||
|
||||
|
||||
def _extract_ids_from_urls(urls: list[str]) -> list[str]:
|
||||
"""Extract Box folder/file IDs from URLs."""
|
||||
ids = []
|
||||
for url in urls:
|
||||
parsed = urlparse(url)
|
||||
# Box URLs can be: https://app.box.com/folder/123456789
|
||||
# or https://app.box.com/file/123456789
|
||||
path_parts = parsed.path.strip("/").split("/")
|
||||
if len(path_parts) >= 2:
|
||||
ids.append(path_parts[-1])
|
||||
return ids
|
||||
|
||||
|
||||
class BoxConnector(
|
||||
SlimConnectorWithPermSync,
|
||||
CheckpointedConnectorWithPermSync[BoxCheckpoint],
|
||||
):
|
||||
def __init__(
|
||||
self,
|
||||
include_all_files: bool = False,
|
||||
folder_ids: str | list[str] | None = None,
|
||||
batch_size: int = INDEX_BATCH_SIZE,
|
||||
) -> None:
|
||||
if not include_all_files and not folder_ids:
|
||||
raise ConnectorValidationError(
|
||||
"Nothing to index. Please specify either 'include_all_files=True' "
|
||||
"or provide 'folder_ids' (comma-separated list of folder IDs or URLs)."
|
||||
)
|
||||
|
||||
self.include_all_files = include_all_files
|
||||
# Handle both string and list inputs (frontend may send list)
|
||||
if isinstance(folder_ids, list):
|
||||
# Convert list to comma-separated string
|
||||
folder_ids_str = ",".join(str(fid).strip() for fid in folder_ids if fid)
|
||||
else:
|
||||
folder_ids_str = folder_ids or ""
|
||||
folder_id_list = _extract_str_list_from_comma_str(folder_ids_str)
|
||||
# Extract folder IDs from URLs if provided, otherwise use items as-is
|
||||
extracted_ids = []
|
||||
for item in folder_id_list:
|
||||
if item.startswith("http://") or item.startswith("https://"):
|
||||
url_ids = _extract_ids_from_urls([item])
|
||||
extracted_ids.extend(url_ids)
|
||||
else:
|
||||
extracted_ids.append(item)
|
||||
self._requested_folder_ids = set(extracted_ids)
|
||||
|
||||
self._box_client: BoxClient | None = None
|
||||
self._user_id: str | None = None
|
||||
self._creds_dict: dict[str, Any] | None = None
|
||||
|
||||
# IDs of folders that have been traversed
|
||||
self._retrieved_folder_ids: set[str] = set()
|
||||
|
||||
self.allow_images = False
|
||||
self.size_threshold = GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD
|
||||
|
||||
def set_allow_images(self, value: bool) -> None:
|
||||
self.allow_images = value
|
||||
|
||||
@property
|
||||
def box_client(self) -> BoxClient:
|
||||
if self._box_client is None:
|
||||
raise RuntimeError(
|
||||
"Box client missing, "
|
||||
"should not call this property "
|
||||
"before calling load_credentials"
|
||||
)
|
||||
return self._box_client
|
||||
|
||||
@property
|
||||
def user_id(self) -> str:
|
||||
if self._user_id is None:
|
||||
raise RuntimeError(
|
||||
"User ID missing, "
|
||||
"should not call this property "
|
||||
"before calling load_credentials"
|
||||
)
|
||||
return self._user_id
|
||||
|
||||
@classmethod
|
||||
@override
|
||||
def normalize_url(cls, url: str) -> NormalizationResult:
|
||||
"""Normalize a Box URL to match the canonical Document.id format."""
|
||||
parsed = urlparse(url)
|
||||
netloc = parsed.netloc.lower()
|
||||
|
||||
if not (netloc.startswith("app.box.com") or netloc.startswith("box.com")):
|
||||
return NormalizationResult(normalized_url=None, use_default=False)
|
||||
|
||||
# Extract file/folder ID from path
|
||||
path_parts = parsed.path.strip("/").split("/")
|
||||
if len(path_parts) >= 2:
|
||||
item_id = path_parts[-1]
|
||||
# Construct normalized URL
|
||||
normalized = f"https://app.box.com/file/{item_id}"
|
||||
return NormalizationResult(normalized_url=normalized, use_default=False)
|
||||
|
||||
return NormalizationResult(normalized_url=None, use_default=False)
|
||||
|
||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, str] | None:
|
||||
"""Load Box credentials and initialize client."""
|
||||
# Check if BOX_DEVELOPER_TOKEN is set (for TESTING only)
|
||||
if BOX_DEVELOPER_TOKEN:
|
||||
logger.info("Using BOX_DEVELOPER_TOKEN for authentication (TESTING ONLY)")
|
||||
auth = BoxDeveloperTokenAuth(token=BOX_DEVELOPER_TOKEN)
|
||||
self._box_client = BoxClient(auth=auth)
|
||||
try:
|
||||
current_user = self._box_client.users.get_user_me()
|
||||
self._user_id = current_user.id
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not get current user info: {e}")
|
||||
self._user_id = credentials.get("box_user_id", "me")
|
||||
self._creds_dict = credentials
|
||||
return None
|
||||
|
||||
# Support JWT authentication from uploaded config
|
||||
# JWT config may be in credentials (legacy) or loaded from KV store (preferred)
|
||||
jwt_config_json_str = credentials.get(DB_CREDENTIALS_DICT_BOX_JWT_CONFIG)
|
||||
if not jwt_config_json_str:
|
||||
# Try loading from KV store (preferred method - avoids duplicating sensitive data)
|
||||
try:
|
||||
from onyx.connectors.box.box_kv import get_box_jwt_config
|
||||
|
||||
jwt_config_obj = get_box_jwt_config()
|
||||
jwt_config_json_str = jwt_config_obj.model_dump_json()
|
||||
logger.info("Loaded Box JWT config from KV store")
|
||||
except Exception:
|
||||
# If not in KV store either, continue to error below
|
||||
pass
|
||||
|
||||
if jwt_config_json_str:
|
||||
logger.info("Using JWT authentication")
|
||||
# Get primary admin user ID for impersonation
|
||||
primary_admin_user_id = credentials.get(
|
||||
DB_CREDENTIALS_PRIMARY_ADMIN_USER_ID
|
||||
)
|
||||
|
||||
# Create BoxJWTAuth from config json string
|
||||
try:
|
||||
jwt_config = JWTConfig.from_config_json_string(jwt_config_json_str)
|
||||
auth = BoxJWTAuth(config=jwt_config)
|
||||
logger.info("Box JWT config loaded successfully")
|
||||
except Exception as e:
|
||||
# Sanitize error message to avoid leaking sensitive data
|
||||
sanitized_error = _sanitize_error_message(e)
|
||||
logger.error(f"Failed to initialize Box BoxJWTAuth: {sanitized_error}")
|
||||
raise ConnectorValidationError(
|
||||
f"Failed to initialize Box JWT authentication: {sanitized_error}"
|
||||
)
|
||||
|
||||
# If primary admin user ID is provided, use it for impersonation
|
||||
if primary_admin_user_id:
|
||||
logger.info(
|
||||
f"Using user impersonation with primary_admin_user_id: {primary_admin_user_id}"
|
||||
)
|
||||
user_auth = auth.with_user_subject(primary_admin_user_id)
|
||||
self._box_client = BoxClient(auth=user_auth)
|
||||
self._user_id = primary_admin_user_id
|
||||
else:
|
||||
# Use service account as user
|
||||
logger.info("Using Box service account (no user impersonation)")
|
||||
self._user_id = "me"
|
||||
self._box_client = BoxClient(auth=auth)
|
||||
|
||||
# Verify authentication by getting user info
|
||||
try:
|
||||
current_user = self._box_client.users.get_user_me()
|
||||
logger.info(
|
||||
f"Box JWT authentication successful. Authenticated as user: {current_user.id} "
|
||||
f"(name: {current_user.name}, login: {getattr(current_user, 'login', 'N/A')})"
|
||||
)
|
||||
self._user_id = current_user.id
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Could not get current user info: {e}. "
|
||||
f"Using user_id: {self._user_id}"
|
||||
)
|
||||
# Keep the user_id we set above
|
||||
|
||||
elif "box_developer_token" in credentials:
|
||||
# Developer token authentication (for testing/backward compatibility)
|
||||
logger.info("Using developer token from credentials (TESTING ONLY)")
|
||||
auth = BoxDeveloperTokenAuth(token=credentials["box_developer_token"])
|
||||
self._box_client = BoxClient(auth=auth)
|
||||
self._user_id = credentials.get("box_user_id", "me")
|
||||
else:
|
||||
raise ConnectorValidationError(
|
||||
"Box credentials missing. Need either JWT config (box_jwt_config) "
|
||||
"or box_developer_token in credentials. "
|
||||
"Please upload JWT config JSON file via the UI."
|
||||
)
|
||||
|
||||
self._creds_dict = credentials
|
||||
return None
|
||||
|
||||
def _update_traversed_folder_ids(self, folder_id: str) -> None:
|
||||
"""Mark a folder as traversed."""
|
||||
self._retrieved_folder_ids.add(folder_id)
|
||||
|
||||
def _fetch_box_items(
    self,
    checkpoint: BoxCheckpoint,
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
) -> Iterator[RetrievedBoxFile]:
    """Fetch Box files based on checkpoint state.

    Drives the START -> FOLDER_FILES -> DONE state machine stored on the
    checkpoint. Yields RetrievedBoxFile items for each folder in
    ``checkpoint.folder_ids_to_retrieve``; whenever a pagination marker string
    is received from `get_all_files_in_folder`, the marker is stashed on the
    per-user StageCompletion record and the generator returns early so the
    caller can persist the checkpoint and resume later from that marker.

    Args:
        checkpoint: Mutable crawl state; mutated in place as folders complete.
        start: Optional lower bound (epoch seconds) on file modification time.
        end: Optional upper bound (epoch seconds) on file modification time.
    """
    # First run: advance out of START and seed this user's completion record.
    if checkpoint.completion_stage == BoxRetrievalStage.START:
        checkpoint.completion_stage = BoxRetrievalStage.FOLDER_FILES
        checkpoint.completion_map[self.user_id] = StageCompletion(
            stage=BoxRetrievalStage.START,
            completed_until=0,
            current_folder_id=None,
        )

    # Defensive: older/partial checkpoints may lack this user's entry.
    completion = checkpoint.completion_map.get(self.user_id)
    if not completion:
        completion = StageCompletion(
            stage=BoxRetrievalStage.START,
            completed_until=0,
            current_folder_id=None,
        )
        checkpoint.completion_map[self.user_id] = completion

    # Determine which folders to process
    if checkpoint.completion_stage == BoxRetrievalStage.FOLDER_FILES:
        if checkpoint.folder_ids_to_retrieve is None:
            if self.include_all_files:
                # Start from root folder (ID "0")
                checkpoint.folder_ids_to_retrieve = ["0"]
                logger.info("include_all_files=True, starting from root folder '0'")
            else:
                # sorted() keeps the order deterministic across resumes.
                checkpoint.folder_ids_to_retrieve = sorted(
                    self._requested_folder_ids
                )
                logger.info(
                    f"Processing specific folders: {checkpoint.folder_ids_to_retrieve}"
                )
        folder_ids = checkpoint.folder_ids_to_retrieve
    else:
        folder_ids = checkpoint.folder_ids_to_retrieve or []

    logger.info(f"Processing {len(folder_ids)} folder(s): {folder_ids}")

    # Process folders
    for folder_id in folder_ids:
        # Already fully traversed on a previous run/resume.
        if folder_id in self._retrieved_folder_ids:
            continue

        # Resume from checkpoint if needed
        if completion.current_folder_id == folder_id and completion.next_marker:
            # Resume from marker - continue processing direct files in folder
            for file_or_marker in get_all_files_in_folder(
                client=self.box_client,
                folder_id=folder_id,
                user_id=self.user_id,
                # Prefer the last completed timestamp over the caller's
                # `start` so already-seen files are not re-fetched.
                start=(
                    completion.completed_until
                    if completion.completed_until > 0
                    else start
                ),
                end=end,
                marker=completion.next_marker,
            ):
                if isinstance(file_or_marker, str):
                    # This is a marker for next page
                    completion.next_marker = file_or_marker
                    return  # Checkpoint and resume later
                yield file_or_marker
                # Update completion timestamp
                modified_time = file_or_marker.box_file.get("modified_at")
                timestamp = _parse_box_datetime_to_timestamp(modified_time)
                if timestamp is not None:
                    completion.completed_until = timestamp

            # After resuming direct files, also recurse into subfolders
            # (This ensures we don't skip nested content after pagination resume)
            logger.info(
                f"Resuming recursive crawl of subfolders in folder {folder_id}"
            )
            subfolder_files = 0
            for retrieved_file in crawl_folders_for_files(
                client=self.box_client,
                parent_id=folder_id,
                user_id=self.user_id,
                traversed_parent_ids=self._retrieved_folder_ids,
                update_traversed_ids_func=self._update_traversed_folder_ids,
                start=start,
                end=end,
            ):
                subfolder_files += 1
                yield retrieved_file
            logger.info(
                f"Found {subfolder_files} files in subfolders of folder {folder_id} (resumed)"
            )
        else:
            # Start fresh folder crawl
            logger.info(f"Starting fresh crawl of folder {folder_id}")
            completion.current_folder_id = folder_id
            completion.completed_until = 0
            completion.next_marker = None

            files_in_folder = 0
            for file_or_marker in get_all_files_in_folder(
                client=self.box_client,
                folder_id=folder_id,
                user_id=self.user_id,
                start=start,
                end=end,
            ):
                if isinstance(file_or_marker, str):
                    # This is a marker for next page
                    logger.debug(
                        f"Received pagination marker for folder {folder_id}: {file_or_marker}"
                    )
                    completion.next_marker = file_or_marker
                    return  # Checkpoint and resume later
                files_in_folder += 1
                yield file_or_marker
                # Update completion timestamp
                modified_time = file_or_marker.box_file.get("modified_at")
                timestamp = _parse_box_datetime_to_timestamp(modified_time)
                if timestamp is not None:
                    completion.completed_until = timestamp

            logger.info(
                f"Found {files_in_folder} files directly in folder {folder_id}"
            )

            # Also crawl subfolders recursively
            logger.info(
                f"Starting recursive crawl of subfolders in folder {folder_id}"
            )
            subfolder_files = 0
            for retrieved_file in crawl_folders_for_files(
                client=self.box_client,
                parent_id=folder_id,
                user_id=self.user_id,
                traversed_parent_ids=self._retrieved_folder_ids,
                update_traversed_ids_func=self._update_traversed_folder_ids,
                start=start,
                end=end,
            ):
                subfolder_files += 1
                yield retrieved_file
            logger.info(
                f"Found {subfolder_files} files in subfolders of folder {folder_id}"
            )

        # Mark folder as processed (reached only when no marker forced an
        # early return above).
        self._retrieved_folder_ids.add(folder_id)
        completion.current_folder_id = None
        completion.next_marker = None

    # All requested folders traversed: terminal state.
    checkpoint.completion_stage = BoxRetrievalStage.DONE
def _extract_docs_from_box(
    self,
    checkpoint: BoxCheckpoint,
    start: SecondsSinceUnixEpoch | None,
    end: SecondsSinceUnixEpoch | None,
    include_permissions: bool,
) -> Iterator[Document | ConnectorFailure]:
    """Retrieve Box files via `_fetch_box_items` and convert them to Documents.

    Yields a Document per successfully converted file, or a ConnectorFailure
    for retrieval/conversion errors. Files already present in
    ``checkpoint.all_retrieved_file_ids`` are skipped (dedup across resumes).

    Args:
        checkpoint: Mutable crawl state, shared with `_fetch_box_items`.
        start/end: Optional modification-time bounds (epoch seconds).
        include_permissions: When True, a PermissionSyncContext is attached so
            external access is resolved during conversion.
    """
    try:
        # Prepare conversion function
        permission_sync_context = (
            PermissionSyncContext(
                primary_user_id=self.user_id,
                box_domain=None,  # Box uses user emails directly, not domain-based access
            )
            if include_permissions
            else None
        )

        # Bind everything but the file itself; the loop below supplies the file.
        convert_func = partial(
            convert_box_item_to_document,
            self.box_client,
            self.allow_images,
            self.size_threshold,
            permission_sync_context,
            self.user_id,
        )

        # Fetch files
        logger.info(
            f"Starting to fetch Box items for user_id: {self.user_id} "
            f"(include_permissions: {include_permissions})"
        )
        # Counters used only for the summary log at the end.
        files_fetched = 0
        files_converted = 0
        files_skipped = 0
        files_failed = 0
        for retrieved_file in self._fetch_box_items(
            checkpoint=checkpoint,
            start=start,
            end=end,
        ):
            files_fetched += 1
            # Retrieval-level error: surface as a ConnectorFailure and move on.
            if retrieved_file.error is not None:
                failure_stage = retrieved_file.completion_stage.value
                sanitized_error = _sanitize_error_message(retrieved_file.error)
                failure_message = (
                    f"retrieval failure during stage: {failure_stage}, "
                    f"user: {retrieved_file.user_id}, "
                    f"parent folder: {retrieved_file.parent_id}, "
                    f"error: {sanitized_error}"
                )
                logger.error(failure_message)
                yield ConnectorFailure(
                    failed_entity=EntityFailure(entity_id=failure_stage),
                    failure_message=failure_message,
                    exception=retrieved_file.error,
                )
                continue

            box_file = retrieved_file.box_file
            if not box_file:
                continue

            try:
                document_id = onyx_document_id_from_box_file(box_file)
            except KeyError:
                # Payload without an "id" cannot be identified; skip it.
                logger.warning(
                    f"Box file missing id (stage={retrieved_file.completion_stage} "
                    f"user={retrieved_file.user_id}). Skipping."
                )
                continue

            # Check for duplicates
            if document_id in checkpoint.all_retrieved_file_ids:
                continue

            checkpoint.all_retrieved_file_ids.add(document_id)

            # Convert to document
            file_name = box_file.get("name", "unknown")
            logger.debug(f"Converting Box file to document: {file_name}")
            doc_or_failure = convert_func(box_file)
            if doc_or_failure:
                # Both Documents and ConnectorFailures are yielded; only the
                # counters differ.
                if isinstance(doc_or_failure, ConnectorFailure):
                    files_failed += 1
                    logger.warning(
                        f"Failed to convert file {file_name}: {doc_or_failure.failure_message}"
                    )
                else:
                    files_converted += 1
                    logger.debug(
                        f"Successfully converted file {file_name} to document"
                    )
                yield doc_or_failure
            else:
                # None means the converter deliberately skipped the file.
                files_skipped += 1
                logger.debug(
                    f"convert_func returned None for file {file_name} (likely skipped due to "
                    f"permissions, size, or content extraction failure)"
                )

        # Persist traversal progress back onto the checkpoint.
        checkpoint.retrieved_folder_ids = self._retrieved_folder_ids

        logger.info(
            f"Finished fetching Box items for user_id: {self.user_id}. "
            f"Summary: fetched={files_fetched}, converted={files_converted}, "
            f"skipped={files_skipped}, failed={files_failed}, "
            f"unique_file_ids={len(checkpoint.all_retrieved_file_ids)}"
        )

    except Exception as e:
        # Unexpected errors are logged with traceback and re-raised to the caller.
        logger.exception(f"Error extracting documents from Box: {e}")
        raise
def _load_from_checkpoint(
    self,
    start: SecondsSinceUnixEpoch,
    end: SecondsSinceUnixEpoch,
    checkpoint: BoxCheckpoint,
    include_permissions: bool,
) -> CheckpointOutput[BoxCheckpoint]:
    """Entrypoint for the connector; first run is with an empty checkpoint.

    Generator: yields Documents/ConnectorFailures and returns the (updated)
    checkpoint as the generator's return value. Raises RuntimeError if
    load_credentials has not been called first.
    """
    if self._box_client is None or self._user_id is None:
        raise RuntimeError(
            "Credentials missing, should not call this method before calling load_credentials"
        )

    logger.info(
        f"Loading from checkpoint with completion stage: {checkpoint.completion_stage}, "
        f"num retrieved ids: {len(checkpoint.all_retrieved_file_ids)}"
    )
    # Work on a copy so the caller's checkpoint object is never mutated.
    checkpoint = copy.deepcopy(checkpoint)
    self._retrieved_folder_ids = checkpoint.retrieved_folder_ids

    yield from self._extract_docs_from_box(
        checkpoint, start, end, include_permissions
    )

    # Carry traversal progress back onto the checkpoint copy being returned.
    checkpoint.retrieved_folder_ids = self._retrieved_folder_ids

    logger.info(
        f"num box files retrieved: {len(checkpoint.all_retrieved_file_ids)}"
    )
    # has_more stays True until the state machine reaches DONE, so the
    # scheduler keeps calling back in with the latest checkpoint.
    if checkpoint.completion_stage == BoxRetrievalStage.DONE:
        checkpoint.has_more = False
    return checkpoint
@override
def load_from_checkpoint(
    self,
    start: SecondsSinceUnixEpoch,
    end: SecondsSinceUnixEpoch,
    checkpoint: BoxCheckpoint,
) -> CheckpointOutput[BoxCheckpoint]:
    """Index documents for the given window without syncing permissions."""
    return self._load_from_checkpoint(
        start=start,
        end=end,
        checkpoint=checkpoint,
        include_permissions=False,
    )
@override
def load_from_checkpoint_with_perm_sync(
    self,
    start: SecondsSinceUnixEpoch,
    end: SecondsSinceUnixEpoch,
    checkpoint: BoxCheckpoint,
) -> CheckpointOutput[BoxCheckpoint]:
    """Index documents for the given window, attaching external permissions."""
    return self._load_from_checkpoint(
        start=start,
        end=end,
        checkpoint=checkpoint,
        include_permissions=True,
    )
def _extract_slim_docs_from_box(
    self,
    checkpoint: BoxCheckpoint,
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
    callback: IndexingHeartbeatInterface | None = None,
) -> GenerateSlimDocumentOutput:
    """Yield batches of SlimDocuments (id + external access) for perm syncing.

    Batches are capped at SLIM_BATCH_SIZE; a final (possibly empty) batch is
    always yielded. Raises the retrieval error directly if any file failed,
    and RuntimeError if *callback* signals a stop.
    """
    # Function-scope import: borrows the batch size shared with the Google
    # connectors rather than redefining it here.
    from onyx.connectors.google_utils.shared_constants import SLIM_BATCH_SIZE

    slim_batch = []
    for file in self._fetch_box_items(
        checkpoint=checkpoint,
        start=start,
        end=end,
    ):
        # Perm sync is all-or-nothing: a retrieval error aborts the run.
        if file.error is not None:
            raise file.error

        if doc := build_slim_document(
            self.box_client,
            file.box_file,
            PermissionSyncContext(
                primary_user_id=self.user_id,
                box_domain=None,
            ),
        ):
            slim_batch.append(doc)
        if len(slim_batch) >= SLIM_BATCH_SIZE:
            yield slim_batch
            slim_batch = []
        # Heartbeat: honor stop requests and report progress per file.
        if callback:
            if callback.should_stop():
                raise RuntimeError(
                    "_extract_slim_docs_from_box: Stop signal detected"
                )
            callback.progress("_extract_slim_docs_from_box", 1)
    # Flush the trailing partial batch (may be empty).
    yield slim_batch
def retrieve_all_slim_docs_perm_sync(
    self,
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
    callback: IndexingHeartbeatInterface | None = None,
) -> GenerateSlimDocumentOutput:
    """Yield slim-document batches until the crawl state machine reaches DONE."""
    state = self.build_dummy_checkpoint()
    # The inner generator may return early on pagination markers; keep
    # draining it until the checkpoint signals completion.
    while state.completion_stage != BoxRetrievalStage.DONE:
        yield from self._extract_slim_docs_from_box(
            checkpoint=state,
            start=start,
            end=end,
            callback=callback,
        )
    logger.info("Box perm sync: Slim doc retrieval complete")
def validate_connector_settings(self) -> None:
    """Validate Box connector settings and credentials.

    Performs the cheapest authenticated Box call (fetch the current user)
    and maps failures onto the connector exception hierarchy.

    Raises:
        ConnectorMissingCredentialError: if load_credentials was never called.
        CredentialExpiredError: on HTTP 401 from Box.
        InsufficientPermissionsError: on HTTP 403 from Box.
        ConnectorValidationError: on any other Box or unexpected error.
    """
    if self._box_client is None:
        raise ConnectorMissingCredentialError("Box credentials not loaded.")

    try:
        # Test API access by getting current user
        current_user = self._box_client.users.get_user_me()
        logger.info(f"Box connector validated for user: {current_user.name}")

    except BoxAPIError as e:
        # getattr with a default is the idiomatic form of the previous
        # hasattr/ternary; chain the SDK error so tracebacks are preserved.
        status_code = getattr(e, "status_code", None)
        if status_code == 401:
            raise CredentialExpiredError(
                "Invalid or expired Box credentials (401)."
            ) from e
        elif status_code == 403:
            raise InsufficientPermissionsError(
                "Box app lacks required permissions (403). "
                "Please ensure the necessary scopes are granted."
            ) from e
        else:
            raise ConnectorValidationError(
                f"Unexpected Box error (status={status_code}): {e}"
            ) from e
    except Exception as e:
        raise ConnectorValidationError(
            f"Unexpected error during Box validation: {e}"
        ) from e
@override
def build_dummy_checkpoint(self) -> BoxCheckpoint:
    """Create the empty checkpoint used on the very first indexing run."""
    initial_checkpoint = BoxCheckpoint(
        retrieved_folder_ids=set(),
        completion_stage=BoxRetrievalStage.START,
        completion_map=ThreadSafeDict(),
        all_retrieved_file_ids=set(),
        has_more=True,
    )
    return initial_checkpoint
@override
def validate_checkpoint_json(self, checkpoint_json: str) -> BoxCheckpoint:
    """Deserialize and validate a checkpoint from its JSON representation."""
    parsed_checkpoint = BoxCheckpoint.model_validate_json(checkpoint_json)
    return parsed_checkpoint
9
backend/onyx/connectors/box/constants.py
Normal file
9
backend/onyx/connectors/box/constants.py
Normal file
@@ -0,0 +1,9 @@
|
||||
# Item "type" discriminator values appearing in Box API item payloads.
BOX_FOLDER_TYPE = "folder"
BOX_FILE_TYPE = "file"
# Base URL used to construct a file's web link when no shared link is available.
BOX_WEBLINK_BASE = "https://app.box.com/file/"

# Box API constants
BOX_API_MAX_ITEMS_PER_PAGE = 1000  # Maximum items per page in Box API pagination

# Box download constants
BOX_DOWNLOAD_CHUNK_SIZE = 8192  # Read files in 8KB chunks
497
backend/onyx/connectors/box/doc_conversion.py
Normal file
497
backend/onyx/connectors/box/doc_conversion.py
Normal file
@@ -0,0 +1,497 @@
|
||||
import io
|
||||
from collections.abc import Callable
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from typing import cast
|
||||
from urllib.parse import urlparse
|
||||
from urllib.parse import urlunparse
|
||||
|
||||
from box_sdk_gen.client import BoxClient
|
||||
from pydantic import BaseModel
|
||||
|
||||
from onyx.access.models import ExternalAccess
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.configs.constants import FileOrigin
|
||||
from onyx.connectors.box.constants import BOX_DOWNLOAD_CHUNK_SIZE
|
||||
from onyx.connectors.box.constants import BOX_FOLDER_TYPE
|
||||
from onyx.connectors.box.constants import BOX_WEBLINK_BASE
|
||||
from onyx.connectors.box.models import BoxFileType
|
||||
from onyx.connectors.models import ConnectorFailure
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import DocumentFailure
|
||||
from onyx.connectors.models import ImageSection
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.file_processing.extract_file_text import extract_file_text
|
||||
from onyx.file_processing.extract_file_text import get_file_ext
|
||||
from onyx.file_processing.extract_file_text import pptx_to_text
|
||||
from onyx.file_processing.extract_file_text import read_docx_file
|
||||
from onyx.file_processing.extract_file_text import read_pdf_file
|
||||
from onyx.file_processing.extract_file_text import xlsx_to_text
|
||||
from onyx.file_processing.file_types import OnyxFileExtensions
|
||||
from onyx.file_processing.image_utils import store_image_and_create_section
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.variable_functionality import (
|
||||
fetch_versioned_implementation_with_fallback,
|
||||
)
|
||||
from onyx.utils.variable_functionality import noop_fallback
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
CHUNK_SIZE_BUFFER = 64 # extra bytes past the limit to read
|
||||
|
||||
|
||||
def _handle_box_download_error(file_id: str, error: Exception) -> bytes:
    """Log a Box download failure (403s at warning level, everything else as an
    error) with URLs/tokens scrubbed from the message, and return empty bytes."""
    from box_sdk_gen.box import BoxAPIError

    import re

    status_code = None
    if isinstance(error, BoxAPIError):
        # Structured SDK error: trust the HTTP status code.
        status_code = getattr(error, "status_code", None)
        is_403 = status_code == 403
    else:
        # Unstructured error: sniff the message for permission-denied markers.
        lowered = str(error).lower()
        is_403 = (
            "403" in str(error)
            or "access_denied" in lowered
            or "insufficient permission" in lowered
        )

    # Scrub URLs and long token-like strings so nothing sensitive hits the logs.
    error_str = re.sub(r"https?://[^\s]+", "[URL_REDACTED]", str(error))
    error_str = re.sub(r"\b[a-zA-Z0-9]{32,}\b", "[TOKEN_REDACTED]", error_str)

    if is_403:
        logger.warning(
            f"Permission denied (403) downloading Box file {file_id}. "
            f"This may be due to file-level permissions or Box app scope limitations. "
            f"Error: {error_str}"
        )
    else:
        logger.error(
            f"Failed to download Box file {file_id}"
            + (f" (status={status_code})" if status_code else "")
            + f": {error_str}"
        )

    return bytes()
class PermissionSyncContext(BaseModel):
    """
    This is the information that is needed to sync permissions for a document.
    """

    # ID of the user whose client performs permission lookups.
    primary_user_id: str
    # Company domain for domain-wide access grants. Callers in this connector
    # always pass None (Box grants are resolved via user emails) — confirm
    # before relying on a non-None value here.
    box_domain: str | None = None
def onyx_document_id_from_box_file(file: BoxFileType) -> str:
    """Derive the stable Onyx document ID (a normalized URL) for a Box file.

    Prefers the file's shared link; otherwise builds the canonical
    app.box.com file URL. The query string and any trailing slash are
    stripped so the same file always maps to the same ID.

    Raises:
        KeyError: if the payload carries no "id".
    """
    file_id = file.get("id")
    if not file_id:
        raise KeyError("Box file missing 'id' field.")

    # shared_link may arrive as a plain URL string, a dict, or an object
    # exposing a `url` attribute — normalize all three to a string (or None).
    shared_link = file.get("shared_link")
    link = None
    if shared_link:
        if isinstance(shared_link, str):
            link = shared_link
        elif isinstance(shared_link, dict):
            link = shared_link.get("url")
        elif hasattr(shared_link, "url"):
            link = shared_link.url
        else:
            link = str(shared_link)

    # No usable shared link: fall back to the canonical Box file URL.
    if not link:
        link = f"{BOX_WEBLINK_BASE}{file_id}"

    # Normalize: drop the query string, then any trailing slashes.
    parsed = urlparse(link)._replace(query="")
    parsed = parsed._replace(path=parsed.path.rstrip("/"))
    return urlunparse(parsed)
def download_box_file(client: BoxClient, file_id: str, size_threshold: int) -> bytes:
    """Download a Box file's full content.

    Streams the download in BOX_DOWNLOAD_CHUNK_SIZE chunks. The size check
    runs inside the read loop, so an oversized file is abandoned as soon as
    the running total exceeds *size_threshold* instead of being fully
    buffered in memory first.

    Args:
        client: Authenticated Box client.
        file_id: Box file ID to download.
        size_threshold: Maximum number of bytes to accept.

    Returns:
        The file content, or b"" if the file is oversized or the download
        failed (failures are logged, never raised).
    """
    download_stream = None
    try:
        # Box SDK v10 downloads files using download_file method
        # This returns a stream that we need to read
        download_stream = client.downloads.download_file(file_id=file_id)
        # Use list to collect chunks for O(n) performance instead of O(n²) with +=
        chunks: list[bytes] = []
        total_size = 0
        chunk_size = BOX_DOWNLOAD_CHUNK_SIZE

        # Read the stream in chunks
        while True:
            chunk = download_stream.read(chunk_size)
            if not chunk:
                break
            if not isinstance(chunk, bytes):
                # Handle string chunks (shouldn't happen but be safe);
                # count the *encoded* length so byte accounting stays exact.
                chunk = chunk.encode("utf-8")
            chunks.append(chunk)
            total_size += len(chunk)

            # Abort mid-stream as soon as the limit is exceeded.
            if total_size > size_threshold:
                logger.warning(
                    f"File {file_id} exceeds size threshold of {size_threshold}. Skipping."
                )
                return bytes()

        # Join all chunks at once for O(n) performance
        return b"".join(chunks)
    except Exception as e:
        return _handle_box_download_error(file_id, e)
    finally:
        # Ensure stream is closed on all paths (success, exception, early return)
        if download_stream is not None:
            try:
                download_stream.close()
            except Exception as close_error:
                logger.warning(
                    f"Error closing download stream for file {file_id}: {close_error}"
                )
def _download_and_extract_sections(
    file: BoxFileType,
    client: BoxClient,
    allow_images: bool,
    size_threshold: int,
) -> list[TextSection | ImageSection]:
    """Download a Box file and extract its content as Text/Image sections.

    Dispatches on the file extension (image formats, PDF, Office formats,
    plain text, then a generic extractor fallback). Returns an empty list
    for folders, oversized files, failed downloads, disallowed extensions,
    or any extraction error — errors are logged, never raised.
    """
    file_id = file.get("id", "")
    file_name = file.get("name", "")
    file_type = file.get("type", "")
    # Handle shared_link as string or object
    shared_link = file.get("shared_link")
    if shared_link:
        if isinstance(shared_link, str):
            link = shared_link
        elif isinstance(shared_link, dict):
            link = shared_link.get("url")
        elif hasattr(shared_link, "url"):
            link = shared_link.url
        else:
            link = str(shared_link) if shared_link else None
    else:
        link = None
    # Fall back to the canonical app.box.com URL when no shared link exists.
    if not link:
        link = f"{BOX_WEBLINK_BASE}{file_id}"

    # Skip folders
    if file_type == BOX_FOLDER_TYPE:
        logger.info("Skipping folder.")
        return []

    # Lazy evaluation to only download the file if necessary
    def response_call() -> bytes:
        return download_box_file(client, file_id, size_threshold)

    # Check file size (cheap metadata check before any download happens)
    file_size = file.get("size", 0)
    if file_size and file_size > size_threshold:
        logger.warning(
            f"{file_name} exceeds size threshold of {size_threshold}. Skipping."
        )
        return []

    # Get file extension for mime type detection
    file_ext = get_file_ext(file_name)

    # Handle images
    if file_ext in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"]:
        if not allow_images:
            return []

        sections: list[TextSection | ImageSection] = []
        try:
            section, embedded_id = store_image_and_create_section(
                image_data=response_call(),
                file_id=file_id,
                display_name=file_name,
                # e.g. ".png" -> "image/png"
                media_type=f"image/{file_ext[1:]}",
                file_origin=FileOrigin.CONNECTOR,
                link=link,
            )
            sections.append(section)
        except Exception as e:
            logger.error(f"Failed to process image {file_name}: {e}")
        return sections

    # Process based on file extension
    try:
        file_bytes = response_call()
        # download_box_file returns b"" on failure/oversize.
        if not file_bytes:
            logger.warning(f"Failed to download {file_name}")
            return []

        file_io = io.BytesIO(file_bytes)

        if file_ext == ".pdf":
            text, _pdf_meta, images = read_pdf_file(file_io)
            pdf_sections: list[TextSection | ImageSection] = [
                TextSection(link=link, text=text)
            ]

            # Process embedded images in the PDF only if images are allowed
            if allow_images:
                try:
                    for idx, (img_data, img_name) in enumerate(images):
                        section, embedded_id = store_image_and_create_section(
                            image_data=img_data,
                            file_id=f"{file_id}_img_{idx}",
                            display_name=img_name or f"{file_name} - image {idx}",
                            file_origin=FileOrigin.CONNECTOR,
                        )
                        pdf_sections.append(section)
                except Exception as e:
                    logger.error(f"Failed to process PDF images in {file_name}: {e}")
            return pdf_sections

        elif file_ext in [".docx", ".doc"]:
            text, _ = read_docx_file(file_io)
            return [TextSection(link=link, text=text)]

        elif file_ext == ".xlsx":
            text = xlsx_to_text(file_io, file_name=file_name)
            return [TextSection(link=link, text=text)] if text else []

        elif file_ext == ".xls":
            # Legacy Excel format - use generic extractor which can handle via unstructured API
            text = extract_file_text(file_io, file_name)
            return [TextSection(link=link, text=text)] if text else []

        elif file_ext == ".pptx":
            text = pptx_to_text(file_io, file_name=file_name)
            return [TextSection(link=link, text=text)] if text else []

        elif file_ext == ".ppt":
            # Legacy PowerPoint format - use generic extractor which can handle via unstructured API
            text = extract_file_text(file_io, file_name)
            return [TextSection(link=link, text=text)] if text else []

        elif file_ext == ".txt":
            text = file_bytes.decode("utf-8", errors="ignore")
            return [TextSection(link=link, text=text)]

        # Final attempt at extracting text using generic extractor
        if file_ext not in OnyxFileExtensions.ALL_ALLOWED_EXTENSIONS:
            logger.warning(f"Skipping file {file_name} due to extension.")
            return []

        try:
            text = extract_file_text(file_io, file_name)
            return [TextSection(link=link, text=text)]
        except Exception as e:
            logger.warning(f"Failed to extract text from {file_name}: {e}")
            return []

    except Exception as e:
        logger.error(f"Error processing file {file_name}: {e}")
        return []
def _get_external_access_for_raw_box_file(
    file: BoxFileType,
    company_domain: str | None,
    retriever_box_client: BoxClient | None,
    admin_box_client: BoxClient,
) -> ExternalAccess:
    """Resolve a Box file's ExternalAccess via the EE permission-sync module.

    Loads `get_external_access_for_raw_box_file` from the enterprise build
    when available; otherwise falls back to a no-op implementation.
    """
    impl = fetch_versioned_implementation_with_fallback(
        "onyx.external_permissions.box.doc_sync",
        "get_external_access_for_raw_box_file",
        fallback=noop_fallback,
    )
    external_access_fn = cast(
        Callable[
            [BoxFileType, str | None, BoxClient | None, BoxClient],
            ExternalAccess,
        ],
        impl,
    )
    return external_access_fn(
        file,
        company_domain,
        retriever_box_client,
        admin_box_client,
    )
def convert_box_item_to_document(
    client: BoxClient,
    allow_images: bool,
    size_threshold: int,
    permission_sync_context: PermissionSyncContext | None,
    retriever_user_id: str,
    file: BoxFileType,
) -> Document | ConnectorFailure | None:
    """
    Convert a Box file to an Onyx Document.

    Returns None when the file is deliberately skipped (folder, oversized,
    or no extractable content), a ConnectorFailure when conversion raised,
    and a Document otherwise. External access is resolved only when
    *permission_sync_context* is provided.
    """
    sections: list[TextSection | ImageSection] = []
    doc_id = "unknown"

    try:
        # Skip folders
        if file.get("type") == BOX_FOLDER_TYPE:
            logger.info("Skipping folder.")
            return None

        # Check file size (metadata may report size as a string)
        size_str = file.get("size")
        if size_str:
            try:
                size_int = int(size_str)
            except ValueError:
                # Unparseable size: proceed and let the download-time check apply.
                logger.warning(f"Parsing string to int failed: size_str={size_str}")
            else:
                if size_int > size_threshold:
                    logger.warning(
                        f"{file.get('name')} exceeds size threshold of {size_threshold}. Skipping."
                    )
                    return None

        # Extract sections
        file_name = file.get("name", "unknown")
        file_id = file.get("id", "unknown")
        logger.debug(
            f"Attempting to extract content from file: {file_name} (id: {file_id})"
        )
        sections = _download_and_extract_sections(
            file, client, allow_images, size_threshold
        )

        # If we still don't have any sections, skip this file
        if not sections:
            logger.warning(
                f"No content extracted from {file_name} (id: {file_id}). "
                f"This may be due to download permission issues, unsupported file type, "
                f"or empty file content."
            )
            return None

        doc_id = onyx_document_id_from_box_file(file)
        external_access = (
            _get_external_access_for_raw_box_file(
                file=file,
                company_domain=permission_sync_context.box_domain,
                retriever_box_client=client,
                admin_box_client=client,
            )
            if permission_sync_context
            else None
        )

        # Parse modified time to UTC datetime
        # Note: Must use exact timezone.utc object (not FixedOffset) for identity checks
        modified_time_str = file.get("modified_at")
        doc_updated_at = None
        if modified_time_str:
            try:
                # Box uses a trailing "Z"; fromisoformat needs an explicit offset.
                parsed_dt = datetime.fromisoformat(
                    modified_time_str.replace("Z", "+00:00")
                )
                if parsed_dt.tzinfo is None:
                    doc_updated_at = parsed_dt.replace(tzinfo=timezone.utc)
                else:
                    # Convert to UTC and recreate with exact timezone.utc object
                    # (astimezone may return FixedOffset, which fails identity checks)
                    utc_timestamp = parsed_dt.astimezone(timezone.utc).timestamp()
                    doc_updated_at = datetime.fromtimestamp(
                        utc_timestamp, tz=timezone.utc
                    )
            except (ValueError, AttributeError) as e:
                logger.warning(
                    f"Failed to parse modified_at timestamp '{modified_time_str}': {e}"
                )

        # Create the document
        return Document(
            id=doc_id,
            sections=sections,
            source=DocumentSource.BOX,
            semantic_identifier=file.get("name", ""),
            metadata={},
            doc_updated_at=doc_updated_at,
            external_access=external_access,
        )
    except Exception as e:
        # Try to get doc_id for error reporting, but don't fail if it's unavailable
        try:
            doc_id = onyx_document_id_from_box_file(file)
        except Exception:
            doc_id = "unknown"

        file_name = file.get("name", "unknown")
        error_str = f"Error converting file '{file_name}' to Document as {retriever_user_id}: {e}"
        logger.warning(error_str)

        return ConnectorFailure(
            failed_document=DocumentFailure(
                document_id=doc_id,
                document_link=(sections[0].link if sections else None),
            ),
            failed_entity=None,
            failure_message=error_str,
            exception=e,
        )
def build_slim_document(
    client: BoxClient,
    file: BoxFileType,
    permission_sync_context: PermissionSyncContext | None,
) -> SlimDocument | None:
    """Build a SlimDocument (ID + external access only) for pruning.

    Returns None for folders, since only files become documents.
    """
    if file.get("type") == BOX_FOLDER_TYPE:
        return None

    # Permissions are only resolved when a permission-sync context is present.
    external_access = None
    if permission_sync_context:
        external_access = _get_external_access_for_raw_box_file(
            file=file,
            company_domain=permission_sync_context.box_domain,
            retriever_box_client=client,
            admin_box_client=client,
        )

    return SlimDocument(
        id=onyx_document_id_from_box_file(file),
        external_access=external_access,
    )
|
||||
455
backend/onyx/connectors/box/file_retrieval.py
Normal file
455
backend/onyx/connectors/box/file_retrieval.py
Normal file
@@ -0,0 +1,455 @@
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Iterator
|
||||
from datetime import datetime
|
||||
|
||||
from box_sdk_gen.client import BoxClient
|
||||
from box_sdk_gen.schemas import File as BoxFile
|
||||
from box_sdk_gen.schemas import Folder as BoxFolder
|
||||
|
||||
from onyx.connectors.box.constants import BOX_API_MAX_ITEMS_PER_PAGE
|
||||
from onyx.connectors.box.models import BoxFileType
|
||||
from onyx.connectors.box.models import BoxRetrievalStage
|
||||
from onyx.connectors.box.models import RetrievedBoxFile
|
||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def _should_include_file_by_time(
|
||||
file_dict: BoxFileType,
|
||||
start: SecondsSinceUnixEpoch | None = None,
|
||||
end: SecondsSinceUnixEpoch | None = None,
|
||||
) -> bool:
|
||||
"""Check if a file should be included based on its modified time."""
|
||||
if start is None and end is None:
|
||||
return True
|
||||
|
||||
modified_time = file_dict.get("modified_at")
|
||||
if not modified_time:
|
||||
return True # Include files without timestamps
|
||||
|
||||
try:
|
||||
mod_dt = datetime.fromisoformat(modified_time.replace("Z", "+00:00"))
|
||||
mod_ts = mod_dt.timestamp()
|
||||
if start is not None and mod_ts < start:
|
||||
logger.debug(
|
||||
f"Skipping file {file_dict.get('name')} - "
|
||||
f"modified {mod_ts} < start {start}"
|
||||
)
|
||||
return False
|
||||
if end is not None and mod_ts > end:
|
||||
logger.debug(
|
||||
f"Skipping file {file_dict.get('name')} - "
|
||||
f"modified {mod_ts} > end {end}"
|
||||
)
|
||||
return False
|
||||
return True
|
||||
except (ValueError, AttributeError):
|
||||
return True # Include files with invalid timestamps
|
||||
|
||||
|
||||
def _box_file_to_dict(file: BoxFile | BoxFolder) -> BoxFileType:
|
||||
"""Convert Box SDK file/folder object to dictionary."""
|
||||
|
||||
# Helper to safely convert datetime or string to ISO format
|
||||
def to_iso_string(dt_or_str):
|
||||
if dt_or_str is None:
|
||||
return None
|
||||
if isinstance(dt_or_str, str):
|
||||
return dt_or_str
|
||||
if hasattr(dt_or_str, "isoformat"):
|
||||
return dt_or_str.isoformat()
|
||||
return str(dt_or_str)
|
||||
|
||||
# Helper to safely get parent ID
|
||||
def get_parent_id(parent):
|
||||
if parent is None:
|
||||
return None
|
||||
if isinstance(parent, dict):
|
||||
return {"id": parent.get("id")} if parent.get("id") else None
|
||||
if hasattr(parent, "id"):
|
||||
return {"id": parent.id}
|
||||
return None
|
||||
|
||||
return {
|
||||
"id": file.id,
|
||||
"name": file.name,
|
||||
"type": file.type.value if hasattr(file.type, "value") else str(file.type),
|
||||
"modified_at": (
|
||||
to_iso_string(file.modified_at)
|
||||
if hasattr(file, "modified_at") and file.modified_at
|
||||
else None
|
||||
),
|
||||
"created_at": (
|
||||
to_iso_string(file.created_at)
|
||||
if hasattr(file, "created_at") and file.created_at
|
||||
else None
|
||||
),
|
||||
"size": file.size if hasattr(file, "size") and file.size is not None else 0,
|
||||
"parent": (
|
||||
get_parent_id(file.parent)
|
||||
if hasattr(file, "parent") and file.parent
|
||||
else None
|
||||
),
|
||||
"shared_link": (
|
||||
{
|
||||
"url": file.shared_link.url,
|
||||
"access": (
|
||||
file.shared_link.access.value
|
||||
if hasattr(file.shared_link, "access")
|
||||
and hasattr(file.shared_link.access, "value")
|
||||
else None
|
||||
),
|
||||
"password": (
|
||||
file.shared_link.password
|
||||
if hasattr(file.shared_link, "password")
|
||||
else None
|
||||
),
|
||||
}
|
||||
if hasattr(file, "shared_link") and file.shared_link
|
||||
else None
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def _get_folders_in_parent(
    client: BoxClient,
    parent_id: str = "0",  # "0" is root folder in Box
) -> Iterator[BoxFileType]:
    """Yield every folder directly inside the given parent folder.

    Errors are logged (sanitized) and swallowed so a single failing folder
    does not abort the crawl, mirroring the Google Drive connector.
    """
    logger.info(f"Getting folders in parent {parent_id}")
    try:
        page_size = BOX_API_MAX_ITEMS_PER_PAGE
        page_marker: str | None = None
        folder_count = 0
        page_index = 0

        while True:
            page_index += 1
            page = client.folders.get_folder_items(
                folder_id=parent_id,
                fields=["id", "name", "type", "modified_at", "created_at", "parent"],
                limit=page_size,
                marker=page_marker,
            )
            logger.debug(
                f"Box API page {page_index} for parent {parent_id}: {len(page.entries)} items"
            )

            for entry in page.entries:
                if entry.type.value != "folder":
                    continue
                folder_count += 1
                logger.debug(
                    f"Found folder in parent {parent_id}: {entry.name} (id: {entry.id})"
                )
                yield _box_file_to_dict(entry)

            # Box markers are opaque tokens and must come from next_marker;
            # synthesizing one risks duplicates, gaps, or infinite loops.
            page_marker = getattr(page, "next_marker", None)
            if page_marker:
                continue
            if page.entries and len(page.entries) == page_size:
                # A full page without next_marker means pagination cannot
                # continue safely - stop rather than guess.
                logger.error(
                    f"Box API did not return next_marker for parent {parent_id} despite full page. "
                    f"Stopping pagination to avoid duplicates or infinite loops. "
                    f"This may indicate a Box API issue or incomplete data retrieval."
                )
            break

        logger.info(f"Found {folder_count} folders in parent {parent_id}")
    except Exception as e:
        # Strip URLs and token-like strings so logs never leak sensitive data.
        import re

        sanitized = str(e)
        sanitized = re.sub(r"https?://[^\s]+", "[URL_REDACTED]", sanitized)
        sanitized = re.sub(r"\b[a-zA-Z0-9]{32,}\b", "[TOKEN_REDACTED]", sanitized)
        logger.warning(f"Error getting folders in parent {parent_id}: {sanitized}")
        # Best-effort: swallow and stop, similar to Google Drive behavior.
|
||||
|
||||
|
||||
def _get_files_in_parent(
    client: BoxClient,
    parent_id: str = "0",
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
) -> Iterator[BoxFileType]:
    """Yield every file directly inside the given parent, filtered by modified time.

    Unlike the folder listing, errors here are RE-RAISED so that the caller
    never marks a partially-retrieved folder as fully traversed.
    """
    logger.info(f"Getting files in parent {parent_id} (start={start}, end={end})")
    try:
        page_size = BOX_API_MAX_ITEMS_PER_PAGE
        page_marker: str | None = None
        file_count = 0
        page_index = 0

        while True:
            page_index += 1
            page = client.folders.get_folder_items(
                folder_id=parent_id,
                fields=[
                    "id",
                    "name",
                    "type",
                    "modified_at",
                    "created_at",
                    "size",
                    "parent",
                    "shared_link",
                ],
                limit=page_size,
                marker=page_marker,
            )

            logger.debug(
                f"Box API page {page_index} for parent {parent_id}: {len(page.entries)} items"
            )

            for entry in page.entries:
                if entry.type.value != "file":
                    continue
                file_dict = _box_file_to_dict(entry)
                if not _should_include_file_by_time(file_dict, start, end):
                    continue
                file_count += 1
                yield file_dict

            # Box markers are opaque tokens and must come from next_marker;
            # synthesizing one risks duplicates, gaps, or infinite loops.
            page_marker = getattr(page, "next_marker", None)
            if page_marker:
                continue
            if page.entries and len(page.entries) == page_size:
                # A full page without next_marker means pagination cannot
                # continue safely - stop rather than guess.
                logger.error(
                    f"Box API did not return next_marker for parent {parent_id} despite full page. "
                    f"Stopping pagination to avoid duplicates or infinite loops. "
                    f"This may indicate a Box API issue or incomplete data retrieval."
                )
            break

        logger.info(f"Found {file_count} files in parent {parent_id}")

    except Exception as e:
        # Strip URLs and token-like strings so logs never leak sensitive data.
        import re

        sanitized = str(e)
        sanitized = re.sub(r"https?://[^\s]+", "[URL_REDACTED]", sanitized)
        sanitized = re.sub(r"\b[a-zA-Z0-9]{32,}\b", "[TOKEN_REDACTED]", sanitized)
        logger.error(
            f"Error getting files in parent {parent_id}: {sanitized}. "
            f"Re-raising to prevent folder from being marked as traversed."
        )
        # Propagate so the caller does not mark this folder as traversed.
        raise
|
||||
|
||||
|
||||
def crawl_folders_for_files(
    client: BoxClient,
    parent_id: str,
    user_id: str,
    traversed_parent_ids: set[str],
    update_traversed_ids_func: Callable[[str], None],
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
) -> Iterator[RetrievedBoxFile]:
    """Recursively crawl from any folder, yielding every file found.

    A folder is reported to update_traversed_ids_func only after ALL of its
    files were yielded without error; on failure an error sentinel is yielded
    instead and the folder stays unmarked so future crawls retry it.
    Subfolders are always recursed into, even after a failure.
    """
    logger.debug(f"Crawling folder {parent_id}")
    if parent_id in traversed_parent_ids:
        logger.debug(f"Skipping folder {parent_id} (already traversed)")
    else:
        try:
            file_count = 0
            for file_dict in _get_files_in_parent(
                client=client,
                parent_id=parent_id,
                start=start,
                end=end,
            ):
                logger.debug(f"Found file: {file_dict.get('name')}")
                yield RetrievedBoxFile(
                    box_file=file_dict,
                    user_id=user_id,
                    parent_id=parent_id,
                    completion_stage=BoxRetrievalStage.FOLDER_FILES,
                )
                file_count += 1
            # Mark traversed only after every file was processed cleanly
            # (empty folders are marked too, so they are not re-visited).
            update_traversed_ids_func(parent_id)
            logger.debug(
                f"Successfully traversed folder {parent_id}, found {file_count} files"
            )
        except Exception as e:
            # Strip URLs and token-like strings so logs never leak sensitive data.
            import re

            sanitized = str(e)
            sanitized = re.sub(r"https?://[^\s]+", "[URL_REDACTED]", sanitized)
            sanitized = re.sub(r"\b[a-zA-Z0-9]{32,}\b", "[TOKEN_REDACTED]", sanitized)
            logger.error(
                f"Error getting files in parent {parent_id}: {sanitized}. "
                f"Folder will not be marked as traversed and may be retried in future crawls."
            )
            # Deliberately NOT marking the folder as traversed: the failure
            # sentinel below lets the caller record a ConnectorFailure.
            yield RetrievedBoxFile(
                box_file={},
                user_id=user_id,
                parent_id=parent_id,
                completion_stage=BoxRetrievalStage.FOLDER_FILES,
                error=e,
            )

    # Recurse into subfolders regardless of this folder's file outcome.
    for subfolder in _get_folders_in_parent(client=client, parent_id=parent_id):
        subfolder_id = subfolder.get("id")
        if subfolder_id:
            logger.debug(f"Recursively crawling subfolder: {subfolder.get('name')}")
            yield from crawl_folders_for_files(
                client=client,
                parent_id=subfolder_id,
                user_id=user_id,
                traversed_parent_ids=traversed_parent_ids,
                update_traversed_ids_func=update_traversed_ids_func,
                start=start,
                end=end,
            )
|
||||
|
||||
|
||||
def get_all_files_in_folder(
    client: BoxClient,
    folder_id: str = "0",
    user_id: str = "me",
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
    marker: str | None = None,
) -> Iterator[RetrievedBoxFile | str]:
    """Get all files in a folder (non-recursive).

    Yields RetrievedBoxFile objects for one page; if another page remains,
    yields the pagination marker (a str) once and returns so the caller can
    checkpoint and resume. On failure yields an error sentinel.
    """
    logger.info(
        f"Getting files in folder {folder_id} (user: {user_id}, "
        f"start={start}, end={end}, marker={marker})"
    )
    try:
        page_size = BOX_API_MAX_ITEMS_PER_PAGE
        current_marker = marker
        file_count = 0
        page_index = 0

        while True:
            page_index += 1
            page = client.folders.get_folder_items(
                folder_id=folder_id,
                fields=[
                    "id",
                    "name",
                    "type",
                    "modified_at",
                    "created_at",
                    "size",
                    "parent",
                    "shared_link",
                ],
                limit=page_size,
                marker=current_marker,
            )

            logger.info(
                f"Box API returned {len(page.entries)} items for folder {folder_id} "
                f"(page {page_index}, marker={current_marker})"
            )

            for entry in page.entries:
                logger.debug(
                    f"Found item in folder {folder_id}: type={entry.type.value}, "
                    f"name={entry.name if hasattr(entry, 'name') else 'N/A'}"
                )
                if entry.type.value != "file":
                    continue
                file_dict = _box_file_to_dict(entry)
                # Apply the modified-time window filter.
                if not _should_include_file_by_time(file_dict, start, end):
                    continue

                file_count += 1
                logger.debug(f"Yielding file: {file_dict.get('name')}")
                yield RetrievedBoxFile(
                    box_file=file_dict,
                    user_id=user_id,
                    parent_id=folder_id,
                    completion_stage=BoxRetrievalStage.FOLDER_FILES,
                )

            # Box markers are opaque tokens and must come from next_marker;
            # synthesizing one risks duplicates, gaps, or infinite loops.
            next_marker = getattr(page, "next_marker", None)
            if next_marker:
                current_marker = next_marker
                logger.debug(
                    f"More pages available for folder {folder_id}, next_marker: {current_marker}"
                )
                # Hand the marker to the caller for checkpoint resumption.
                yield current_marker
                break
            if page.entries and len(page.entries) == page_size:
                # Full page but no marker: cannot paginate safely. No marker
                # is yielded because resumption would be unreliable.
                logger.error(
                    f"Box API did not return next_marker for folder {folder_id} despite full page. "
                    f"Stopping pagination to avoid duplicates or infinite loops. "
                    f"This may indicate a Box API issue or incomplete data retrieval."
                )
            break

        logger.info(f"Found {file_count} files in folder {folder_id}")

    except Exception as e:
        # Strip URLs and token-like strings so logs never leak sensitive data.
        import re

        sanitized = str(e)
        sanitized = re.sub(r"https?://[^\s]+", "[URL_REDACTED]", sanitized)
        sanitized = re.sub(r"\b[a-zA-Z0-9]{32,}\b", "[TOKEN_REDACTED]", sanitized)
        logger.error(f"Error getting all files in folder {folder_id}: {sanitized}")
        yield RetrievedBoxFile(
            box_file={},
            user_id=user_id,
            parent_id=folder_id,
            completion_stage=BoxRetrievalStage.FOLDER_FILES,
            error=e,
        )
|
||||
88
backend/onyx/connectors/box/models.py
Normal file
88
backend/onyx/connectors/box/models.py
Normal file
@@ -0,0 +1,88 @@
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import ConfigDict
|
||||
from pydantic import field_serializer
|
||||
from pydantic import field_validator
|
||||
|
||||
from onyx.connectors.interfaces import ConnectorCheckpoint
|
||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.utils.threadpool_concurrency import ThreadSafeDict
|
||||
|
||||
|
||||
BoxFileType = dict[str, Any]
|
||||
|
||||
|
||||
class BoxRetrievalStage(str, Enum):
    """Stages of retrieval for Box connector."""

    # Retrieval has not begun yet.
    START = "start"
    # Currently listing/retrieving files within folders.
    FOLDER_FILES = "folder_files"
    # All retrieval work has completed.
    DONE = "done"
|
||||
|
||||
|
||||
class StageCompletion(BaseModel):
    """
    Tracks progress through the retrieval process for a user.

    completed_until: Timestamp of the latest file retrieved or error yielded.
    current_folder_id: Folder currently being processed (for resumption).
    next_marker: Pagination marker for resuming from a specific page.
    """

    # Which retrieval stage this user is currently in
    stage: BoxRetrievalStage
    # Seconds since epoch of the latest file retrieved or error yielded
    completed_until: SecondsSinceUnixEpoch
    # Folder currently being processed; used to resume an interrupted crawl
    current_folder_id: str | None = None
    # Opaque Box pagination marker for resuming mid-folder
    next_marker: str | None = None
|
||||
|
||||
|
||||
class RetrievedBoxFile(BaseModel):
    """
    Represents a file retrieved from Box.

    If an error occurs during retrieval, the error field is set
    and will be propagated as a ConnectorFailure.
    """

    # The stage at which this file was retrieved
    completion_stage: BoxRetrievalStage

    # The file that was retrieved (raw Box data as a dict); empty ({}) when
    # `error` is set
    box_file: BoxFileType

    # The ID of the user that the file was retrieved by
    user_id: str

    # The id of the parent folder of the file
    parent_id: str | None = None

    # Any unexpected error that occurred while retrieving the file.
    error: Exception | None = None

    # Exception is not a pydantic-native type, so it must be allowed explicitly
    model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
|
||||
class BoxCheckpoint(ConnectorCheckpoint):
    """Checkpoint for Box connector retrieval state.

    Serialized/deserialized across indexing runs, so the thread-safe
    completion map needs explicit (de)serialization hooks below.
    """

    # Folders whose contents have been fully listed already.
    retrieved_folder_ids: set[str]
    # Overall stage of the retrieval process.
    completion_stage: BoxRetrievalStage
    # Per-user retrieval progress, keyed by user id.
    completion_map: ThreadSafeDict[str, StageCompletion]
    # IDs of every file yielded so far (pydantic copies this default per instance).
    all_retrieved_file_ids: set[str] = set()
    # Optional explicit list of folder ids to retrieve; None means crawl all.
    folder_ids_to_retrieve: list[str] | None = None

    @field_serializer("completion_map")
    def serialize_completion_map(
        self, completion_map: ThreadSafeDict[str, StageCompletion], _info: Any
    ) -> dict[str, StageCompletion]:
        # Use copy() to get a thread-safe snapshot instead of accessing _dict
        # directly; this avoids exposing mutable internal state.
        return completion_map.copy()

    @field_validator("completion_map", mode="before")
    @classmethod
    def validate_completion_map(cls, v: Any) -> ThreadSafeDict[str, StageCompletion]:
        # Explicit check instead of `assert`: asserts are stripped under
        # `python -O`, and pydantic turns ValueError into a ValidationError.
        if not isinstance(v, (dict, ThreadSafeDict)):
            raise ValueError(
                f"completion_map must be a dict or ThreadSafeDict, "
                f"got {type(v).__name__}"
            )
        return ThreadSafeDict(
            {k: StageCompletion.model_validate(val) for k, val in v.items()}
        )
|
||||
73
backend/onyx/connectors/box/utils.py
Normal file
73
backend/onyx/connectors/box/utils.py
Normal file
@@ -0,0 +1,73 @@
|
||||
"""Utility functions for Box connector."""
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
|
||||
def parse_box_jwt_config(env_str: str) -> dict[str, Any]:
    """
    Parse a Box JWT configuration JSON string from environment variables into
    a Python dictionary.

    Tolerates double-escaped and double-encoded JSON (common when the config
    is passed through environment variables) and converts literal "\\n"
    sequences inside the private key into real newlines.

    Args:
        env_str: The JSON string from environment variables (may be double-escaped)

    Returns:
        Parsed JWT config dictionary

    Raises:
        json.JSONDecodeError: If the string cannot be parsed as JSON
        TypeError: If the parsed config or its nested sections are not dicts
        ValueError: If required fields are missing
    """
    try:
        config = json.loads(env_str)
    except json.JSONDecodeError:
        # Undo the extra escaping that env-var plumbing sometimes adds,
        # drop any surrounding quotes, and retry.
        cleaned = env_str.replace('\\"', '"').strip('"')
        config = json.loads(cleaned)

    if isinstance(config, str):
        # The payload was double-encoded (a JSON string containing JSON);
        # parse the inner document.
        try:
            config = json.loads(config)
        except json.JSONDecodeError:
            raise json.JSONDecodeError(
                "Double-parsed JSON returned a string that is not valid JSON",
                config,
                0,
            )

    if not isinstance(config, dict):
        raise TypeError(
            f"Expected Box JWT config to be a dict, got {type(config).__name__}"
        )
    if "boxAppSettings" not in config:
        raise ValueError("Box JWT config missing required 'boxAppSettings' field")
    app_settings = config["boxAppSettings"]
    if not isinstance(app_settings, dict):
        raise TypeError(
            f"Expected boxAppSettings to be a dict, got {type(app_settings).__name__}"
        )

    if "appAuth" in app_settings:
        app_auth = app_settings["appAuth"]
        if not isinstance(app_auth, dict):
            raise TypeError(
                f"Expected appAuth to be a dict, got {type(app_auth).__name__}"
            )
        private_key = app_auth.get("privateKey", "")
        if private_key and "\\n" in private_key:
            # The PEM key needs real newlines, not literal "\n" sequences.
            app_auth["privateKey"] = private_key.replace("\\n", "\n")

    return config
|
||||
@@ -208,6 +208,10 @@ CONNECTOR_CLASS_MAP = {
|
||||
module_path="onyx.connectors.bitbucket.connector",
|
||||
class_name="BitbucketConnector",
|
||||
),
|
||||
DocumentSource.BOX: ConnectorMapping(
|
||||
module_path="onyx.connectors.box.connector",
|
||||
class_name="BoxConnector",
|
||||
),
|
||||
DocumentSource.TESTRAIL: ConnectorMapping(
|
||||
module_path="onyx.connectors.testrail.connector",
|
||||
class_name="TestRailConnector",
|
||||
|
||||
@@ -482,6 +482,37 @@ def cleanup_google_drive_credentials(db_session: Session) -> None:
|
||||
db_session.commit()
|
||||
|
||||
|
||||
def cleanup_box_jwt_credentials(db_session: Session) -> None:
    """Clean up Box JWT credentials that reference the deleted JWT config.

    Only credentials created from an uploaded JWT config are removed; the
    forced internal delete also removes dependent connector/document pairs,
    so no foreign key constraints are violated.
    """
    from onyx.connectors.box.box_kv import (
        BOX_AUTHENTICATION_METHOD_UPLOADED,
        DB_CREDENTIALS_AUTHENTICATION_METHOD,
    )

    for credential in fetch_credentials_by_source(
        db_session=db_session, document_source=DocumentSource.BOX
    ):
        auth_method = (credential.credential_json or {}).get(
            DB_CREDENTIALS_AUTHENTICATION_METHOD
        )
        # Skip credentials that did not come from an uploaded JWT config.
        if auth_method != BOX_AUTHENTICATION_METHOD_UPLOADED:
            continue
        # force=True cleans up related connector/document pairs first,
        # avoiding FK constraint violations.
        _delete_credential_internal(
            credential=credential,
            credential_id=credential.id,
            db_session=db_session,
            force=True,
        )
|
||||
|
||||
|
||||
def delete_service_account_credentials(
|
||||
user: User | None, db_session: Session, source: DocumentSource
|
||||
) -> None:
|
||||
|
||||
@@ -37,6 +37,10 @@ from onyx.configs.constants import MilestoneRecordType
|
||||
from onyx.configs.constants import ONYX_METADATA_FILENAME
|
||||
from onyx.configs.constants import OnyxCeleryPriority
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.connectors.box.box_kv import build_box_jwt_creds
|
||||
from onyx.connectors.box.box_kv import delete_box_jwt_config
|
||||
from onyx.connectors.box.box_kv import get_box_jwt_config
|
||||
from onyx.connectors.box.box_kv import upsert_box_jwt_config
|
||||
from onyx.connectors.exceptions import ConnectorValidationError
|
||||
from onyx.connectors.factory import validate_ccpair_for_user
|
||||
from onyx.connectors.google_utils.google_auth import (
|
||||
@@ -90,6 +94,7 @@ from onyx.db.connector_credential_pair import get_connector_credential_pairs_for
|
||||
from onyx.db.connector_credential_pair import (
|
||||
get_connector_credential_pairs_for_user_parallel,
|
||||
)
|
||||
from onyx.db.credentials import cleanup_box_jwt_credentials
|
||||
from onyx.db.credentials import cleanup_gmail_credentials
|
||||
from onyx.db.credentials import cleanup_google_drive_credentials
|
||||
from onyx.db.credentials import create_credential
|
||||
@@ -118,6 +123,8 @@ from onyx.key_value_store.interface import KvKeyNotFoundError
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.server.documents.models import AuthStatus
|
||||
from onyx.server.documents.models import AuthUrl
|
||||
from onyx.server.documents.models import BoxJWTConfig
|
||||
from onyx.server.documents.models import BoxJWTCredentialRequest
|
||||
from onyx.server.documents.models import ConnectorBase
|
||||
from onyx.server.documents.models import ConnectorCredentialPairIdentifier
|
||||
from onyx.server.documents.models import ConnectorFileInfo
|
||||
@@ -393,6 +400,78 @@ def upsert_gmail_service_account_credential(
|
||||
return ObjectCreationIdResponse(id=credential.id)
|
||||
|
||||
|
||||
@router.get("/admin/connector/box/jwt-config")
def check_box_jwt_config_exist(
    _: User = Depends(current_admin_user),
) -> dict[str, str]:
    """Return identifiers of the stored Box JWT config, or 404 if none exists."""
    try:
        jwt_config = get_box_jwt_config()
    except KvKeyNotFoundError:
        raise HTTPException(status_code=404, detail="Box JWT config not found")
    # Only non-secret identifiers are returned to the admin UI.
    return {
        "client_id": jwt_config.client_id,
        "enterprise_id": jwt_config.enterpriseID or "Not set",
    }
|
||||
|
||||
|
||||
@router.put("/admin/connector/box/jwt-config")
def upsert_box_jwt_config_endpoint(
    jwt_config: BoxJWTConfig, _: User = Depends(current_admin_user)
) -> StatusResponse:
    """Store (create or replace) the Box JWT config JSON."""
    try:
        upsert_box_jwt_config(jwt_config)
    except ValueError as e:
        # Invalid config payloads surface as a 400 rather than a 500.
        raise HTTPException(status_code=400, detail=str(e))
    else:
        return StatusResponse(success=True, message="Successfully saved Box JWT config")
|
||||
|
||||
|
||||
@router.delete("/admin/connector/box/jwt-config")
def delete_box_jwt_config_endpoint(
    _: User = Depends(current_admin_user),
    db_session: Session = Depends(get_session),
) -> StatusResponse:
    """Delete the stored Box JWT config and any credentials built from it."""
    try:
        delete_box_jwt_config()
        # Credentials created from the now-deleted config can no longer
        # authenticate, so remove them as well.
        cleanup_box_jwt_credentials(db_session=db_session)
    except KvKeyNotFoundError as e:
        raise HTTPException(status_code=400, detail=str(e))
    return StatusResponse(success=True, message="Successfully deleted Box JWT config")
|
||||
|
||||
|
||||
@router.put("/admin/connector/box/jwt-credential")
def upsert_box_jwt_credential(
    jwt_credential_request: BoxJWTCredentialRequest,
    user: User | None = Depends(current_admin_user),
    db_session: Session = Depends(get_session),
) -> ObjectCreationIdResponse:
    """Create a Box JWT credential from the saved JWT config.

    Combines the request's impersonation user id with the stored JWT config
    and writes the result to the `Credential` table, replacing any existing
    uploaded-JWT credentials.
    """
    try:
        credential_base = build_box_jwt_creds(
            primary_admin_user_id=jwt_credential_request.box_primary_admin_user_id,
            name="Box JWT (uploaded)",
        )
    except KvKeyNotFoundError as e:
        # No JWT config has been uploaded yet.
        raise HTTPException(status_code=400, detail=str(e))

    # Remove stale/duplicate Box JWT credentials (and their dependent
    # connector/document pairs) before creating the new one.
    cleanup_box_jwt_credentials(db_session=db_session)

    # `user=None` since this credential is not a personal credential
    credential = create_credential(
        credential_data=credential_base, user=user, db_session=db_session
    )
    return ObjectCreationIdResponse(id=credential.id)
|
||||
|
||||
|
||||
@router.get("/admin/connector/google-drive/check-auth/{credential_id}")
|
||||
def check_drive_tokens(
|
||||
credential_id: int,
|
||||
|
||||
@@ -565,6 +565,52 @@ class GoogleServiceAccountCredentialRequest(BaseModel):
|
||||
google_primary_admin: str | None = None # email of user to impersonate
|
||||
|
||||
|
||||
class BoxJWTConfig(BaseModel):
    """Box JWT configuration from JSON file."""

    # Top-level settings object from the Box developer console JSON export.
    boxAppSettings: dict[str, Any]
    # Box enterprise id; optional in the exported config.
    enterpriseID: str | None = None

    # Box may add fields to the config file; accept and preserve them.
    model_config = {"extra": "allow"}

    def model_post_init(self, __context: Any) -> None:
        """Validate required nested keys after model initialization."""
        settings = self.boxAppSettings
        if not isinstance(settings, dict):
            raise ValueError(
                f"boxAppSettings must be a dict, got {type(settings).__name__}"
            )

        # Required top-level fields in boxAppSettings, checked in order.
        for required in ("clientID", "clientSecret", "appAuth"):
            if required not in settings:
                raise ValueError(
                    f"boxAppSettings missing required '{required}' field"
                )

        app_auth = settings["appAuth"]
        if not isinstance(app_auth, dict):
            raise ValueError(
                f"boxAppSettings.appAuth must be a dict, got {type(app_auth).__name__}"
            )

        # Required fields for JWT signing, checked in order.
        for required in ("privateKey", "publicKeyID"):
            if required not in app_auth:
                raise ValueError(
                    f"boxAppSettings.appAuth missing required '{required}' field"
                )
|
||||
|
||||
|
||||
class BoxJWTCredentialRequest(BaseModel):
    """Request payload for registering a Box JWT credential.

    Carries only the optional impersonation target; the JWT config itself
    is supplied separately.
    """

    box_primary_admin_user_id: str | None = None  # user ID to impersonate
|
||||
|
||||
|
||||
class FileUploadResponse(BaseModel):
    """Response for a file upload request.

    `file_paths` and `file_names` are parallel lists describing the
    stored location and original name of each uploaded file.
    """

    file_paths: list[str]
    file_names: list[str]
|
||||
|
||||
@@ -91,6 +91,8 @@ botocore==1.39.11
|
||||
# s3transfer
|
||||
botocore-stubs==1.40.74
|
||||
# via boto3-stubs
|
||||
boxsdk==10.3.0
|
||||
# via onyx
|
||||
braintrust==0.3.9
|
||||
# via onyx
|
||||
brotli==1.2.0
|
||||
@@ -896,6 +898,7 @@ regex==2025.11.3
|
||||
requests==2.32.5
|
||||
# via
|
||||
# atlassian-python-api
|
||||
# boxsdk
|
||||
# braintrust
|
||||
# cohere
|
||||
# dropbox
|
||||
@@ -945,6 +948,7 @@ requests-oauthlib==1.3.1
|
||||
# onyx
|
||||
requests-toolbelt==1.0.0
|
||||
# via
|
||||
# boxsdk
|
||||
# jira
|
||||
# langsmith
|
||||
# python-gitlab
|
||||
|
||||
718
backend/scripts/setup_box_test_env.py
Normal file
718
backend/scripts/setup_box_test_env.py
Normal file
@@ -0,0 +1,718 @@
|
||||
"""
|
||||
Script to set up Box test environment for connector tests.
|
||||
|
||||
This script:
|
||||
1. Reads Box credentials and user IDs from .test.env
|
||||
2. Creates the required folder structure
|
||||
3. Creates test files with proper naming and content
|
||||
4. Sets up sharing/permissions between users
|
||||
5. Updates consts_and_utils.py with actual folder and user IDs
|
||||
|
||||
Usage:
|
||||
cd backend
|
||||
python scripts/setup_box_test_env.py
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Add backend to path before importing onyx modules
|
||||
backend_path = Path(__file__).parent.parent
|
||||
sys.path.insert(0, str(backend_path))
|
||||
|
||||
from box_sdk_gen import BoxClient # noqa: E402
|
||||
from box_sdk_gen import BoxJWTAuth # noqa: E402
|
||||
from box_sdk_gen import JWTConfig # noqa: E402
|
||||
from box_sdk_gen.managers.folders import CreateFolderParent # noqa: E402
|
||||
from box_sdk_gen.schemas import File # noqa: E402
|
||||
from box_sdk_gen.schemas import Folder # noqa: E402
|
||||
|
||||
from onyx.connectors.box.utils import parse_box_jwt_config # noqa: E402
|
||||
|
||||
|
||||
def load_env_vars() -> None:
    """Load environment variables from .test.env.

    Parses simple KEY=VALUE lines, ignoring blank lines and '#' comments.
    Whitespace around the key and value is stripped, then surrounding
    double quotes are removed from the value.

    Raises:
        FileNotFoundError: if backend/.test.env does not exist.
    """
    env_file = backend_path / ".test.env"
    if not env_file.exists():
        raise FileNotFoundError(f".test.env file not found at {env_file}")

    with open(env_file, "r") as f:
        for line in f:
            line = line.strip()
            # Skip blanks, comments, and lines that are not KEY=VALUE
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, value = line.split("=", 1)
            # BUGFIX: strip whitespace around key/value so `KEY = "x"` parses
            # correctly; previously the quote-strip failed when the value had
            # leading whitespace and the key kept its trailing spaces.
            os.environ[key.strip()] = value.strip().strip('"')
|
||||
|
||||
|
||||
def get_box_client(user_key: str = "admin") -> tuple[BoxClient, str]:
    """Get Box client for a specific user.

    Uses the same JWT config for all users, impersonating via user ID.
    """
    # All logical users share one JWT application config
    raw_config = os.environ.get("BOX_JWT_CONFIG_JSON_STR")
    if not raw_config:
        raise ValueError("BOX_JWT_CONFIG_JSON_STR not found in .test.env")

    # Env var that holds the impersonation target for each logical user key
    env_var_by_user = {
        "admin": "BOX_PRIMARY_ADMIN_USER_ID",
        "test_user_1": "BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_1",
        "test_user_2": "BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_2",
        "test_user_3": "BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_3",
    }
    impersonated_id = os.environ.get(
        env_var_by_user.get(user_key, "BOX_PRIMARY_ADMIN_USER_ID")
    )

    # Parse/normalize the config, then re-serialize so the SDK always
    # receives well-formed JSON regardless of how the env var was quoted.
    normalized_config = json.dumps(parse_box_jwt_config(raw_config))

    # Use from_config_json_string (as used in connector)
    try:
        jwt_config = JWTConfig.from_config_json_string(normalized_config)
    except Exception as e:
        raise ValueError(
            f"Failed to parse JWT config: {e}. Please check your BOX_JWT_CONFIG_JSON_STR format."
        )

    auth = BoxJWTAuth(config=jwt_config)

    if impersonated_id:
        # Impersonate the requested user
        client = BoxClient(auth=auth.with_user_subject(impersonated_id))
        resolved_user_id = impersonated_id
    else:
        # Fall back to the service account and look up its own user ID
        client = BoxClient(auth=auth)
        resolved_user_id = client.users.get_user_me().id

    return client, resolved_user_id
|
||||
|
||||
|
||||
def _folder_id_from_conflict(error: Exception) -> str | None:
    """Extract the conflicting folder ID from a 409 error response body, if present."""
    try:
        if hasattr(error, "response") and hasattr(error.response, "body"):
            body = error.response.body
            if isinstance(body, dict):
                # Conflicts can be a list or dict depending on the endpoint
                conflicts = body.get("context_info", {}).get("conflicts", [])
                if isinstance(conflicts, list) and conflicts:
                    return conflicts[0].get("id")
                if isinstance(conflicts, dict):
                    return conflicts.get("id")
    except Exception:
        pass  # Caller falls back to listing/search
    return None


def _find_folder_id_by_name(
    client: BoxClient, name: str, parent_id: str
) -> str | None:
    """Find an existing subfolder by name: list the parent first, then search."""
    # Try listing the parent folder's items
    try:
        items_response = client.folders.get_folder_items(parent_id)
        for item in getattr(items_response, "entries", None) or []:
            if getattr(item, "name", None) != name:
                continue
            item_type = getattr(item, "type", None)
            # Box SDK Gen uses type.value or a type enum
            type_str = (
                item_type.value if hasattr(item_type, "value") else str(item_type)
            )
            if type_str == "folder" or "folder" in type_str.lower():
                return item.id
    except Exception:
        pass  # Fall through to search

    # Last resort: search for the folder by name under the parent
    try:
        search_results = client.search.search(
            query=name,
            type="folder",
            ancestor_folders=[parent_id],
        )
        for item in getattr(search_results, "entries", None) or []:
            if getattr(item, "name", None) == name:
                return item.id
    except Exception:
        pass
    return None


def create_folder(client: BoxClient, name: str, parent_id: str = "0") -> Folder:
    """Create a folder in Box.

    If the folder already exists (409 name conflict), the existing folder is
    located — via the error body, then a parent listing, then search — and
    returned instead.

    Args:
        client: Authenticated Box client.
        name: Folder name to create.
        parent_id: Parent folder ID ("0" is the Box root).

    Raises:
        ValueError: if the folder already exists but cannot be retrieved.
    """
    print(f"Creating folder '{name}' in parent {parent_id}...")
    try:
        from box_sdk_gen.box.errors import BoxAPIError

        folder = client.folders.create_folder(
            name=name,
            parent=CreateFolderParent(id=parent_id),
        )
        print(f"  ✓ Created folder '{name}' with ID: {folder.id}")
        return folder
    except BoxAPIError as e:
        # Handle folder already exists (409)
        error_msg = str(e)
        error_code = getattr(e, "code", None)
        if "409" in error_msg or error_code == "item_name_in_use":
            folder_id = _folder_id_from_conflict(e) or _find_folder_id_by_name(
                client, name, parent_id
            )
            if folder_id:
                folder = client.folders.get_folder_by_id(folder_id)
                print(f"  ℹ Folder '{name}' already exists (ID: {folder_id})")
                return folder
            # Could not recover the existing folder automatically
            print(
                f"  ⚠️  Folder '{name}' already exists but could not retrieve it automatically"
            )
            print("     You may need to delete it manually or use a different name")
            raise ValueError(
                f"Folder '{name}' already exists. Please delete it manually or use a different name."
            )
        raise
    except Exception as e:
        print(f"  ✗ Error creating folder '{name}': {e}")
        raise
|
||||
|
||||
|
||||
def upload_file(client: BoxClient, name: str, content: str, parent_id: str) -> File:
    """Upload a text file to Box.

    Files under 20MB use the simple upload endpoint; larger files use the
    chunked upload API. On a 409 name conflict the existing file is returned
    (or a placeholder File when the existing ID cannot be recovered).
    """
    print(f"  Uploading file '{name}'...")
    try:
        import io
        from box_sdk_gen.box.errors import BoxAPIError
        from box_sdk_gen.managers.uploads import UploadFileAttributes
        from box_sdk_gen.managers.uploads import UploadFileAttributesParentField

        file_content = content.encode("utf-8")
        file_size = len(file_content)
        file_io = io.BytesIO(file_content)

        if file_size < 20 * 1024 * 1024:  # 20MB threshold
            # Small file - use regular upload
            try:
                file_result = client.uploads.upload_file(
                    attributes=UploadFileAttributes(
                        name=name,
                        parent=UploadFileAttributesParentField(id=parent_id),
                    ),
                    file=file_io,
                )
                # upload_file returns a Files object containing an entries list
                if hasattr(file_result, "entries") and file_result.entries:
                    uploaded_file = file_result.entries[0]
                else:
                    uploaded_file = file_result
            except BoxAPIError as e:
                # Handle file already exists (409) - check error message/code
                error_msg = str(e)
                error_code = getattr(e, "code", None)
                if "409" in error_msg or error_code == "item_name_in_use":
                    # Try to extract the existing file's ID from the conflicts
                    # in the error response body.
                    # BUGFIX: only dereference e.response.body inside the
                    # hasattr guard; the original guard body was `pass` and
                    # the access happened unconditionally, raising
                    # AttributeError for errors without a response.
                    try:
                        if hasattr(e, "response") and hasattr(e.response, "body"):
                            body = e.response.body
                            if isinstance(body, dict):
                                conflicts = body.get("context_info", {}).get(
                                    "conflicts", {}
                                )
                                if conflicts and "id" in conflicts:
                                    file_id = conflicts["id"]
                                    uploaded_file = client.files.get_file_by_id(
                                        file_id
                                    )
                                    print(
                                        f"  ℹ File '{name}' already exists (ID: {file_id})"
                                    )
                                    return uploaded_file
                    except Exception:
                        pass
                    # If we can't get the file ID, just skip with a message
                    print(f"  ℹ File '{name}' already exists, skipping upload")
                    # Return a dummy file object - the script will continue
                    return File(id="existing", name=name, type="file")
                raise
        else:
            # Large file - use chunked upload
            uploaded_file = client.chunked_uploads.upload_big_file(
                file=file_io,
                file_name=name,
                file_size=file_size,
                parent_folder_id=parent_id,
            )

        file_id = uploaded_file.id if hasattr(uploaded_file, "id") else "unknown"
        print(f"  ✓ Uploaded '{name}' with ID: {file_id}")
        return uploaded_file
    except Exception as e:
        print(f"  ✗ Error uploading '{name}': {e}")
        raise
|
||||
|
||||
|
||||
def share_folder(
    client: BoxClient, folder_id: str, user_id: str, role: str = "viewer"
) -> None:
    """Share a folder with a user by creating a collaboration."""
    print(f"  Sharing folder {folder_id} with user {user_id} as {role}...")
    try:
        from box_sdk_gen import (
            CreateCollaborationAccessibleBy,
            CreateCollaborationAccessibleByTypeField,
            CreateCollaborationItem,
            CreateCollaborationItemTypeField,
            CreateCollaborationRole,
        )

        # Translate the human-readable role into the SDK enum; unknown roles
        # default to viewer
        role_lookup = {
            "viewer": CreateCollaborationRole.VIEWER,
            "editor": CreateCollaborationRole.EDITOR,
            "co-owner": CreateCollaborationRole.CO_OWNER,
            "previewer": CreateCollaborationRole.PREVIEWER,
            "uploader": CreateCollaborationRole.UPLOADER,
            "previewer-uploader": CreateCollaborationRole.PREVIEWER_UPLOADER,
            "viewer-uploader": CreateCollaborationRole.VIEWER_UPLOADER,
        }
        sdk_role = role_lookup.get(role.lower(), CreateCollaborationRole.VIEWER)

        # Create the collaboration
        collaboration = client.user_collaborations.create_collaboration(
            item=CreateCollaborationItem(
                type=CreateCollaborationItemTypeField.FOLDER,
                id=folder_id,
            ),
            accessible_by=CreateCollaborationAccessibleBy(
                type=CreateCollaborationAccessibleByTypeField.USER,
                id=user_id,
            ),
            role=sdk_role,
        )

        print(
            f"  ✓ Successfully shared folder {folder_id} with user {user_id} as {role}"
        )
        if hasattr(collaboration, "id"):
            print(f"    Collaboration ID: {collaboration.id}")

    except Exception as e:
        error_msg = str(e)
        if hasattr(e, "code"):
            error_code = e.code
        elif hasattr(e, "response") and hasattr(e.response, "status_code"):
            error_code = str(e.response.status_code)
        else:
            error_code = None

        # A 409 / user_already_collaborator just means the share is in place
        already_shared = (
            error_code == "409"
            or "409" in error_msg
            or "already exists" in error_msg.lower()
            or "user_already_collaborator" in error_msg.lower()
            or getattr(e, "code", None) == "user_already_collaborator"
        )
        if already_shared:
            print(
                f"  ℹ Collaboration already exists for folder {folder_id} and user {user_id}"
            )
        else:
            print(f"  ✗ Warning: Could not share folder: {e}")
            print(f"     Error code: {error_code}")
            print("     You may need to share this folder manually via Box UI:")
            print(f"     - Folder ID: {folder_id}")
            print(f"     - User ID: {user_id}")
            print(f"     - Role: {role}")
|
||||
|
||||
|
||||
def remove_user_access(client: BoxClient, folder_id: str, user_id: str) -> None:
    """Remove a user's access to a folder by deleting their collaboration.

    Best-effort: failures are logged rather than raised, since access can
    also be removed manually via the Box UI.
    """
    print(f"  Removing access for user {user_id} from folder {folder_id}...")
    try:
        # First, get all collaborations for the folder
        collaborations_response = client.list_collaborations.get_folder_collaborations(
            folder_id
        )

        # Find the collaboration whose accessible_by matches this user
        collaboration_to_delete = None
        for collab in getattr(collaborations_response, "entries", None) or []:
            accessible_by = getattr(collab, "accessible_by", None)
            if accessible_by and getattr(accessible_by, "id", None) == user_id:
                collaboration_to_delete = collab
                break

        if collaboration_to_delete:
            collaboration_id = getattr(collaboration_to_delete, "id", None)
            if collaboration_id:
                client.user_collaborations.delete_collaboration_by_id(collaboration_id)
                print(
                    f"  ✓ Removed access for user {user_id} from folder {folder_id}"
                )
            else:
                print("  ⚠️  Found collaboration but no ID available")
        else:
            print(
                f"  ℹ User {user_id} does not have explicit access to folder {folder_id}"
            )

    except Exception as e:
        # CLEANUP: removed a dead `str(e)` statement whose result was discarded
        error_code = None
        if hasattr(e, "code"):
            error_code = e.code
        elif hasattr(e, "response") and hasattr(e.response, "status_code"):
            error_code = str(e.response.status_code)

        print(f"  ✗ Warning: Could not remove user access: {e}")
        print(f"     Error code: {error_code}")
        print("     You may need to remove access manually via Box UI")
|
||||
|
||||
|
||||
def create_file_structure(
    client: BoxClient, parent_id: str, file_ids: list[int]
) -> None:
    """Create files in a folder."""
    # Import here to avoid module-level import after non-import statements (E402)
    try:
        from tests.daily.connectors.box.consts_and_utils import (
            SPECIAL_FILE_ID_TO_CONTENT_MAP as _SPECIAL_MAP,
        )
    except Exception as e:
        raise ImportError(
            f"Failed to import SPECIAL_FILE_ID_TO_CONTENT_MAP from consts_and_utils: {e}. "
            "This is required for special test file content. Please fix the import error."
        ) from e

    for file_id in file_ids:
        # Special IDs carry predefined content; the rest get a default stub
        body = _SPECIAL_MAP.get(file_id, f"This is file {file_id}")
        upload_file(client, f"file_{file_id}.txt", body, parent_id)
|
||||
|
||||
|
||||
def setup_box_test_environment() -> dict[str, Any]:
    """Set up the complete Box test environment.

    Creates the folder/file structure used by the Box connector tests,
    configures sharing between the admin and test users, and returns the
    resulting IDs.

    Returns:
        Dict with keys "admin_user_id", "test_user_ids", and "folder_ids".
    """
    # Import test constants here to avoid E402 and ensure sys.path has been adjusted
    from tests.daily.connectors.box.consts_and_utils import (
        ADMIN_FILE_IDS,
        ADMIN_FOLDER_3_FILE_IDS,
        FOLDER_1_1_FILE_IDS,
        FOLDER_1_2_FILE_IDS,
        FOLDER_1_FILE_IDS,
        FOLDER_2_1_FILE_IDS,
        FOLDER_2_2_FILE_IDS,
        FOLDER_2_FILE_IDS,
        FOLDER_3_FILE_IDS,
        SECTIONS_FILE_IDS,
        TEST_USER_1_FILE_IDS,
        TEST_USER_2_FILE_IDS,
        TEST_USER_3_FILE_IDS,
    )

    print("=" * 80)
    print("Setting up Box test environment...")
    print("=" * 80)

    # Load environment variables
    load_env_vars()

    # Get parent folder ID from env, default to root ("0")
    parent_folder_id = os.environ.get("BOX_TEST_PARENT_FOLDER_ID", "0")
    if parent_folder_id == "0":
        print("\n⚠️  Creating test structure in ROOT folder (ID: 0)")
        print(
            "   To use a different folder, set BOX_TEST_PARENT_FOLDER_ID in .test.env"
        )
    else:
        print(f"\nCreating test structure in folder ID: {parent_folder_id}")

    # Get admin client
    admin_client, admin_user_id = get_box_client("admin")
    print(f"\nAdmin user ID: {admin_user_id}")

    # Get test user IDs (if configured)
    test_user_ids = {}
    for user_key in ["test_user_1", "test_user_2", "test_user_3"]:
        try:
            _, user_id = get_box_client(user_key)
            test_user_ids[user_key] = user_id
            print(f"{user_key} ID: {user_id}")
        except Exception as e:
            print(f"{user_key} not configured: {e}")

    # Store created folder IDs
    folder_ids = {}

    print("\n" + "=" * 80)
    print("Creating folder structure...")
    print("=" * 80)

    # Create root-level files
    print("\nCreating root-level files...")
    create_file_structure(admin_client, parent_folder_id, ADMIN_FILE_IDS)
    create_file_structure(admin_client, parent_folder_id, TEST_USER_1_FILE_IDS)
    if test_user_ids.get("test_user_2"):
        create_file_structure(admin_client, parent_folder_id, TEST_USER_2_FILE_IDS)
    if test_user_ids.get("test_user_3"):
        create_file_structure(admin_client, parent_folder_id, TEST_USER_3_FILE_IDS)

    # Create Folder 1 and subfolders
    print("\nCreating Folder 1 structure...")
    folder_1 = create_folder(admin_client, "Folder 1", parent_folder_id)
    folder_ids["FOLDER_1_ID"] = folder_1.id
    create_file_structure(admin_client, folder_1.id, FOLDER_1_FILE_IDS)

    folder_1_1 = create_folder(admin_client, "Folder 1-1", folder_1.id)
    folder_ids["FOLDER_1_1_ID"] = folder_1_1.id
    create_file_structure(admin_client, folder_1_1.id, FOLDER_1_1_FILE_IDS)

    folder_1_2 = create_folder(admin_client, "Folder 1-2", folder_1.id)
    folder_ids["FOLDER_1_2_ID"] = folder_1_2.id
    create_file_structure(admin_client, folder_1_2.id, FOLDER_1_2_FILE_IDS)

    # Create Folder 2 and subfolders
    print("\nCreating Folder 2 structure...")
    folder_2 = create_folder(admin_client, "Folder 2", parent_folder_id)
    folder_ids["FOLDER_2_ID"] = folder_2.id
    create_file_structure(admin_client, folder_2.id, FOLDER_2_FILE_IDS)

    folder_2_1 = create_folder(admin_client, "Folder 2-1", folder_2.id)
    folder_ids["FOLDER_2_1_ID"] = folder_2_1.id
    create_file_structure(admin_client, folder_2_1.id, FOLDER_2_1_FILE_IDS)

    folder_2_2 = create_folder(admin_client, "Folder 2-2", folder_2.id)
    folder_ids["FOLDER_2_2_ID"] = folder_2_2.id
    create_file_structure(admin_client, folder_2_2.id, FOLDER_2_2_FILE_IDS)

    # Create Folder 3
    print("\nCreating Folder 3...")
    folder_3 = create_folder(admin_client, "Folder 3", parent_folder_id)
    folder_ids["FOLDER_3_ID"] = folder_3.id
    create_file_structure(admin_client, folder_3.id, FOLDER_3_FILE_IDS)

    # Create Admin's Folder 3 (separate folder for sharing test)
    print("\nCreating Admin's Folder 3...")
    admin_folder_3 = create_folder(admin_client, "Admin Folder 3", parent_folder_id)
    folder_ids["ADMIN_FOLDER_3_ID"] = admin_folder_3.id
    create_file_structure(admin_client, admin_folder_3.id, ADMIN_FOLDER_3_FILE_IDS)

    # Create Sections folder
    print("\nCreating Sections folder...")
    sections_folder = create_folder(admin_client, "Sections Folder", parent_folder_id)
    folder_ids["SECTIONS_FOLDER_ID"] = sections_folder.id
    create_file_structure(admin_client, sections_folder.id, SECTIONS_FILE_IDS)

    # Set up sharing/permissions
    print("\n" + "=" * 80)
    print("Setting up sharing and permissions...")
    print("=" * 80)

    if test_user_ids.get("test_user_1"):
        user_1_id = test_user_ids["test_user_1"]
        print(f"\nSetting up permissions for test_user_1 ({user_1_id})...")
        # Share Folder 1 with test_user_1
        share_folder(admin_client, folder_1.id, user_1_id, "viewer")
        # Share Admin's Folder 3 with test_user_1
        share_folder(admin_client, admin_folder_3.id, user_1_id, "viewer")
        # Note: Individual file sharing would need to be done separately if needed

    # CLEANUP: the original had two consecutive duplicate
    # `if test_user_ids.get("test_user_3")` blocks; they are merged here.
    if test_user_ids.get("test_user_3"):
        user_3_id = test_user_ids["test_user_3"]
        print(f"\nSetting up permissions for test_user_3 ({user_3_id})...")
        # Share Folder 1-2 (public folder) with test_user_3 so they can access public files
        share_folder(admin_client, folder_1_2.id, user_3_id, "viewer")
        # Explicitly restrict test_user_3 from ADMIN_FOLDER_3 so the
        # test_restricted_access test is useful
        print(f"\nRestricting test_user_3 ({user_3_id}) from ADMIN_FOLDER_3...")
        remove_user_access(admin_client, admin_folder_3.id, user_3_id)

    if test_user_ids.get("test_user_2"):
        user_2_id = test_user_ids["test_user_2"]
        print(f"\nSetting up permissions for test_user_2 ({user_2_id})...")
        # Share Folder 1 with test_user_2
        share_folder(admin_client, folder_1.id, user_2_id, "viewer")
        # Share Folder 2-1 with test_user_2
        share_folder(admin_client, folder_2_1.id, user_2_id, "viewer")

    # Make Folder 1-2 public (if needed)
    print("\nMaking Folder 1-2 public...")
    try:
        # Try to update folder shared link settings
        from box_sdk_gen.managers.folders import (
            UpdateFolderByIdSharedLink,
            UpdateFolderByIdSharedLinkAccessField,
        )

        admin_client.folders.update_folder_by_id(
            folder_id=folder_1_2.id,
            shared_link=UpdateFolderByIdSharedLink(
                access=UpdateFolderByIdSharedLinkAccessField.OPEN
            ),
        )
        print("  ✓ Folder 1-2 is now public")
    except Exception as e:
        print(f"  ✗ Warning: Could not make folder public: {e}")
        print("  (This is optional - folder can be shared manually via Box UI)")
        print(f"  To make it public manually: Folder ID {folder_1_2.id}")

    # Compile results
    results = {
        "admin_user_id": admin_user_id,
        "test_user_ids": test_user_ids,
        "folder_ids": folder_ids,
    }

    print("\n" + "=" * 80)
    print("Setup complete!")
    print("=" * 80)
    print("\nCreated folder IDs:")
    for key, value in folder_ids.items():
        print(f"  {key}: {value}")
    print(f"\nAdmin User ID: {admin_user_id}")
    if test_user_ids:
        print("\nTest User IDs:")
        for key, value in test_user_ids.items():
            print(f"  {key}: {value}")

    return results
|
||||
|
||||
|
||||
def update_consts_file(results: dict[str, Any]) -> None:
    """Update consts_and_utils.py with actual IDs.

    Rewrites each `CONST = "..."` assignment in the test constants module
    with the real folder and user IDs collected during setup.
    """
    consts_file = (
        backend_path / "tests" / "daily" / "connectors" / "box" / "consts_and_utils.py"
    )

    print("\n" + "=" * 80)
    print("Updating consts_and_utils.py...")
    print("=" * 80)

    with open(consts_file, "r") as f:
        content = f.read()

    def _set_constant(text: str, const_name: str, new_value: str) -> str:
        """Replace `CONST = "..."` with the new value, logging when it changes."""
        # CLEANUP: the original enumerated nine hard-coded placeholder strings
        # before falling back to this regex; the regex alone subsumes them all.
        updated = re.sub(
            f'{re.escape(const_name)} = "[^"]*"',
            f'{const_name} = "{new_value}"',
            text,
        )
        if updated != text:
            print(f"  Updated {const_name} = {new_value}")
        return updated

    # Update folder IDs
    for key, value in results["folder_ids"].items():
        content = _set_constant(content, key, value)

    # Update user IDs
    content = _set_constant(content, "ADMIN_USER_ID", results["admin_user_id"])

    test_user_ids = results["test_user_ids"]
    user_id_map = {
        "test_user_1": "TEST_USER_1_ID",
        "test_user_2": "TEST_USER_2_ID",
        "test_user_3": "TEST_USER_3_ID",
    }
    for user_key, const_name in user_id_map.items():
        if user_key in test_user_ids:
            content = _set_constant(content, const_name, test_user_ids[user_key])

    with open(consts_file, "w") as f:
        f.write(content)

    print("\n✓ Updated consts_and_utils.py with actual IDs")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    try:
        # Build the Box folder/file structure and collect the resulting IDs
        results = setup_box_test_environment()
        # Write those IDs back into the test constants module
        update_consts_file(results)
        print("\n" + "=" * 80)
        print("✅ Box test environment setup complete!")
        print("=" * 80)
        print("\nYou can now run the tests with:")
        print("  pytest -v -s backend/tests/daily/connectors/box/")
    except Exception as e:
        # Surface the full traceback for debugging, then exit non-zero so
        # CI/callers can detect the failure
        print(f"\n❌ Error setting up test environment: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)
|
||||
287
backend/tests/daily/connectors/box/README.md
Normal file
287
backend/tests/daily/connectors/box/README.md
Normal file
@@ -0,0 +1,287 @@
|
||||
# Box Connector Test Suite
|
||||
|
||||
## Overview
|
||||
|
||||
The Box connector test suite contains comprehensive integration tests for the Box connector. These tests validate that the connector properly:
|
||||
- Authenticates with the Box API using JWT authentication
|
||||
- Retrieves files and folders from Box
|
||||
- Handles folder scoping and filtering
|
||||
- Generates properly structured Onyx `Document` objects
|
||||
- Handles batch processing and checkpointing
|
||||
- Manages permissions and access control
|
||||
- Supports nested folder traversal
|
||||
- Handles error cases gracefully
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. **Box Enterprise Account**: You need a Box enterprise account with admin access
|
||||
2. **Box JWT Application**: A Box application configured with JWT authentication
|
||||
3. **Test Users**: At least one test user (test_user_1) is required for permission tests
|
||||
4. **Python Environment**: Backend dependencies installed (see `backend/requirements`)
|
||||
5. **Read Permissions**: The Box application must have read permissions to download files
|
||||
|
||||
## Setting Up Box JWT Application
|
||||
|
||||
### 1. Create a Box Application
|
||||
|
||||
1. Go to the [Box Developer Console](https://developer.box.com/)
|
||||
2. Navigate to **My Apps** → **Create New App**
|
||||
3. Select **Custom App** → **Server Authentication (with JWT)**
|
||||
4. Give your app a name (e.g., "Onyx Box Connector Tests")
|
||||
|
||||
### 2. Configure Application Settings
|
||||
|
||||
1. In your app settings, go to the **Configuration** tab
|
||||
2. **Important**: Enable **Read** permissions in the application scopes
|
||||
- This is required for the connector to download files
|
||||
- **Note**: For running the test setup script (`setup_box_test_env.py`), you will also need **Write** permissions to create folders and files. However, for normal connector operation (indexing files), only **Read** permissions are required.
|
||||
3. Note your **Client ID** and **Client Secret** (you'll need these later)
|
||||
|
||||
### 3. Generate and Download JWT Configuration
|
||||
|
||||
1. In the **Configuration** tab, scroll to **Add and Manage Public Keys**
|
||||
2. Click **Generate a Public/Private Keypair**
|
||||
3. Download the **JSON configuration file** - this is your `config.json`
|
||||
- This file contains all the necessary authentication information
|
||||
- **Keep this file secure** - it contains sensitive credentials
|
||||
|
||||
### 4. Set Up User Access
|
||||
|
||||
1. In your Box enterprise admin console, go to **Users and Groups**
|
||||
2. Create test users (at least `test_user_1`, optionally `test_user_2` and `test_user_3`)
|
||||
3. Note the **User IDs** for each test user (you'll need these for impersonation)
|
||||
|
||||
### 5. Authorize the Application
|
||||
|
||||
1. In the Box Developer Console, go to your app's **Authorization** tab
|
||||
2. Click **Review and Submit** to submit your app for authorization
|
||||
3. Once authorized, you can use the JWT authentication
|
||||
|
||||
## Environment Variables
|
||||
|
||||
The test suite requires the following environment variables in `backend/.test.env`:
|
||||
|
||||
### Required (Admin User)
|
||||
|
||||
- **`BOX_JWT_CONFIG_JSON_STR`**: The JWT configuration JSON string
|
||||
- This is the content of the `config.json` file you downloaded
|
||||
- It should be a JSON string (may need to be escaped for the .env file)
|
||||
- Example format: `{"boxAppSettings": {...}, "enterpriseID": "..."}`
|
||||
|
||||
- **`BOX_PRIMARY_ADMIN_USER_ID`**: The Box user ID of the admin user
|
||||
- This is used for user impersonation
|
||||
- Find this in the Box admin console or via the Box API
|
||||
|
||||
### Optional (Test Users)
|
||||
|
||||
For full test coverage, you can also configure test user IDs for impersonation:
|
||||
|
||||
- **`BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_1`**: User ID for test_user_1 (required for permission tests)
|
||||
- **`BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_2`**: User ID for test_user_2 (optional)
|
||||
- **`BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_3`**: User ID for test_user_3 (optional)
|
||||
|
||||
**Note**: The same JWT config (`BOX_JWT_CONFIG_JSON_STR`) is used for all users. Box JWT authentication supports user impersonation, so you only need to provide different user IDs. Each user ID is used to impersonate that user when making API calls.
|
||||
|
||||
### Example `.test.env` File
|
||||
|
||||
```bash
|
||||
# Box JWT Configuration (same config used for all users via impersonation)
|
||||
BOX_JWT_CONFIG_JSON_STR="{\"boxAppSettings\":{...},\"enterpriseID\":\"...\"}"
|
||||
|
||||
# User IDs for impersonation
|
||||
BOX_PRIMARY_ADMIN_USER_ID="12345678"
|
||||
|
||||
# Test User 1 (required for permission tests)
|
||||
BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_1="12345679"
|
||||
|
||||
# Test User 2 (optional)
|
||||
BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_2=""
|
||||
|
||||
# Test User 3 (optional)
|
||||
BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_3=""
|
||||
```
|
||||
|
||||
## Setting Up the Test Environment
|
||||
|
||||
### Automated Setup (Recommended)
|
||||
|
||||
We provide a script that automatically creates the required folder structure, test files, and permissions:
|
||||
|
||||
```bash
|
||||
cd backend
|
||||
python scripts/setup_box_test_env.py
|
||||
```
|
||||
|
||||
This script will:
|
||||
1. Read credentials from `.test.env`
|
||||
2. Create the required folder structure (Folder 1, Folder 2, Folder 3, etc.)
|
||||
3. Create test files with proper naming (`file_0.txt`, `file_1.txt`, etc.)
|
||||
4. Set up sharing and permissions between users
|
||||
5. Update `consts_and_utils.py` with actual folder and user IDs
|
||||
|
||||
**Note**: The script requires write permissions in your Box account. Make sure your JWT application has write access enabled.
|
||||
|
||||
### Manual Setup
|
||||
|
||||
If you prefer to set up manually, you'll need to:
|
||||
|
||||
1. Create the following folder structure in your Box account:
|
||||
```
|
||||
Root/
|
||||
├── file_0.txt through file_4.txt (admin files)
|
||||
├── file_5.txt through file_9.txt (test_user_1 files)
|
||||
├── Folder 1/
|
||||
│ ├── file_25.txt through file_29.txt
|
||||
│ ├── Folder 1-1/
|
||||
│ │ └── file_30.txt through file_34.txt
|
||||
│ └── Folder 1-2/ (public folder)
|
||||
│ └── file_35.txt through file_39.txt
|
||||
├── Folder 2/
|
||||
│ ├── file_45.txt through file_49.txt
|
||||
│ ├── Folder 2-1/
|
||||
│ │ └── file_50.txt through file_54.txt
|
||||
│ └── Folder 2-2/
|
||||
│ └── file_55.txt through file_59.txt
|
||||
├── Folder 3/
|
||||
│ └── file_62.txt through file_64.txt
|
||||
└── Sections Folder/
|
||||
└── file_61.txt (special content)
|
||||
```
|
||||
|
||||
2. Create files with naming pattern: `file_{id}.txt` with content: `This is file {id}`
|
||||
3. Set up sharing permissions as defined in `consts_and_utils.py` (see `ACCESS_MAPPING`)
|
||||
4. Update `consts_and_utils.py` with actual folder IDs and user IDs
|
||||
|
||||
## Running the Tests
|
||||
|
||||
### Prerequisites
|
||||
|
||||
Before running tests, ensure:
|
||||
1. Your `.test.env` file is configured with valid credentials
|
||||
2. The test environment has been set up (either via script or manually)
|
||||
3. You're in the `backend/` directory
|
||||
|
||||
### Run All Box Connector Tests
|
||||
|
||||
```bash
|
||||
cd backend
|
||||
pytest -v -s tests/daily/connectors/box/
|
||||
```
|
||||
|
||||
### Run Specific Test Files
|
||||
|
||||
```bash
|
||||
# Run basic connector tests
|
||||
pytest -v -s tests/daily/connectors/box/test_basic.py
|
||||
|
||||
# Run permission tests
|
||||
pytest -v -s tests/daily/connectors/box/test_permissions.py
|
||||
|
||||
# Run permission sync tests
|
||||
pytest -v -s tests/daily/connectors/box/test_perm_sync.py
|
||||
```
|
||||
|
||||
### Run Specific Test Functions
|
||||
|
||||
```bash
|
||||
# Run a specific test
|
||||
pytest -v -s tests/daily/connectors/box/test_basic.py::test_include_all_files
|
||||
|
||||
# Run tests matching a pattern
|
||||
pytest -v -s tests/daily/connectors/box/ -k "permission"
|
||||
```
|
||||
|
||||
### Run Tests Without Skipped Tests
|
||||
|
||||
Some tests are marked with `@pytest.mark.skip` if they require additional setup:
|
||||
|
||||
```bash
|
||||
# Run all tests, excluding skipped ones (default behavior)
|
||||
pytest -v -s tests/daily/connectors/box/
|
||||
|
||||
# To run skipped tests, you need to remove the @pytest.mark.skip decorator from the test functions
|
||||
# or use pytest's marker filtering (skipped tests are not included by default)
|
||||
```
|
||||
|
||||
## Test Structure
|
||||
|
||||
### Test Files
|
||||
|
||||
- **`test_basic.py`**: Basic connector functionality tests
|
||||
- Folder traversal
|
||||
- File retrieval
|
||||
- Folder scoping
|
||||
- Checkpointing
|
||||
- Size thresholds
|
||||
|
||||
- **`test_permissions.py`**: Permission and access control tests
|
||||
- User access mapping
|
||||
- Public file access
|
||||
- Restricted access
|
||||
- Collaboration permissions
|
||||
- Shared folders
|
||||
|
||||
- **`test_perm_sync.py`**: Permission synchronization tests
|
||||
- Permission extraction
|
||||
- Access control validation
|
||||
|
||||
- **`test_box_basic.py`**: Basic initialization tests (currently skipped)
|
||||
|
||||
### Test Constants
|
||||
|
||||
The `consts_and_utils.py` file contains:
|
||||
- File ID ranges for different test scenarios
|
||||
- Folder IDs (should match actual Box folder IDs)
|
||||
- User IDs (should match actual Box user IDs)
|
||||
- Access mapping (defines which users can access which files)
|
||||
- Helper functions for assertions and document loading
|
||||
|
||||
**Important**: After running the setup script or manual setup, the folder IDs and user IDs in `consts_and_utils.py` should be updated with actual values from your Box account.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Authentication Errors
|
||||
|
||||
- **"Failed to initialize Box JWT authentication"**
|
||||
- Verify your `BOX_JWT_CONFIG_JSON_STR` is correctly formatted
|
||||
- Ensure the JSON string is properly escaped in `.test.env`
|
||||
- Check that the JWT application is authorized
|
||||
|
||||
- **"User ID missing"**
|
||||
- Verify `BOX_PRIMARY_ADMIN_USER_ID` is set correctly
|
||||
- Ensure the user ID exists in your Box enterprise
|
||||
|
||||
### Permission Errors
|
||||
|
||||
- **"Insufficient permissions"**
|
||||
- Ensure your Box JWT application has **Write** permissions enabled
|
||||
- Check that the application is authorized in your Box enterprise
|
||||
- Verify user impersonation is working correctly
|
||||
|
||||
### File Not Found Errors
|
||||
|
||||
- **"File not found" or "Folder not found"**
|
||||
- Run the setup script to create the test environment
|
||||
- Verify folder IDs in `consts_and_utils.py` match actual Box folder IDs
|
||||
- Check that files were created with the correct naming pattern
|
||||
|
||||
### Test Failures
|
||||
|
||||
- **Tests fail with "expected file IDs not found"**
|
||||
- Ensure the test environment was set up correctly
|
||||
- Verify file naming matches the pattern: `file_{id}.txt`
|
||||
- Check that file content matches: `This is file {id}`
|
||||
- Run the setup script again to recreate the environment
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- [Box Developer Documentation](https://developer.box.com/)
|
||||
- [Box Python SDK Documentation](https://github.com/box/box-python-sdk-gen)
|
||||
- [Box JWT Authentication Guide](https://developer.box.com/guides/authentication/jwt/jwt-setup/)
|
||||
|
||||
## Notes
|
||||
|
||||
- The test environment creates a significant number of files and folders. Consider using a dedicated Box enterprise or test account.
|
||||
- Some tests require multiple users for full coverage. At minimum, `test_user_1` is required for permission tests.
|
||||
- The setup script handles existing folders and files gracefully (detects and reuses them when possible), but if you run it multiple times without cleaning up, it may create duplicate files with the same names in the same folders. For a clean test environment, delete the test folders between runs or use a fresh test account.
|
||||
- File IDs in the tests are placeholders. The actual file IDs in Box will be different, but the connector uses file names for matching.
|
||||
1
backend/tests/daily/connectors/box/__init__.py
Normal file
1
backend/tests/daily/connectors/box/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Tests for the Box connector."""
|
||||
139
backend/tests/daily/connectors/box/conftest.py
Normal file
139
backend/tests/daily/connectors/box/conftest.py
Normal file
@@ -0,0 +1,139 @@
|
||||
"""Test fixtures for Box connector tests."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import resource
|
||||
from collections.abc import Callable
|
||||
|
||||
import pytest
|
||||
|
||||
from onyx.connectors.box.box_kv import BOX_AUTHENTICATION_METHOD_UPLOADED
|
||||
from onyx.connectors.box.box_kv import DB_CREDENTIALS_AUTHENTICATION_METHOD
|
||||
from onyx.connectors.box.box_kv import DB_CREDENTIALS_DICT_BOX_JWT_CONFIG
|
||||
from onyx.connectors.box.box_kv import DB_CREDENTIALS_PRIMARY_ADMIN_USER_ID
|
||||
from onyx.connectors.box.connector import BoxConnector
|
||||
from onyx.connectors.box.utils import parse_box_jwt_config
|
||||
from tests.load_env_vars import load_env_vars
|
||||
|
||||
|
||||
# Load environment variables at the module level
|
||||
load_env_vars()
|
||||
|
||||
|
||||
_USER_TO_PRIMARY_ADMIN_USER_ID_MAP = {
|
||||
"admin": "BOX_PRIMARY_ADMIN_USER_ID",
|
||||
"test_user_1": "BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_1",
|
||||
"test_user_2": "BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_2",
|
||||
"test_user_3": "BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_3",
|
||||
}
|
||||
|
||||
|
||||
def get_credentials_from_env(user_key: str) -> dict:
    """Build Box JWT credentials for *user_key* from environment variables.

    A single JWT config (BOX_JWT_CONFIG_JSON_STR) is shared by every user;
    which user is impersonated is controlled only by the per-user admin
    user-id environment variable.

    Args:
        user_key (str): Key to look up user credentials (e.g., "admin", "test_user_1")

    Returns:
        dict: Credentials dictionary with JWT config and primary admin user ID
    """
    # Parse then re-serialize the raw JSON so downstream consumers always
    # receive consistently formatted config.
    raw_config = os.environ["BOX_JWT_CONFIG_JSON_STR"]
    normalized_config = json.dumps(parse_box_jwt_config(raw_config))

    creds: dict = {
        DB_CREDENTIALS_DICT_BOX_JWT_CONFIG: normalized_config,
        DB_CREDENTIALS_AUTHENTICATION_METHOD: BOX_AUTHENTICATION_METHOD_UPLOADED,
    }

    # Attach the impersonation user id when one is configured for this user.
    env_var_name = _USER_TO_PRIMARY_ADMIN_USER_ID_MAP.get(user_key)
    user_id = os.environ.get(env_var_name) if env_var_name else None
    if user_id:
        creds[DB_CREDENTIALS_PRIMARY_ADMIN_USER_ID] = user_id

    return creds
|
||||
|
||||
|
||||
@pytest.fixture
def box_jwt_connector_factory() -> Callable[..., BoxConnector]:
    """Fixture yielding a factory that builds JWT-authenticated Box connectors.

    Mirrors google_drive_service_acct_connector_factory but for Box JWT auth.

    Note: when include_all_files=True and BOX_TEST_PARENT_FOLDER_ID is set,
    the connector is scoped to that folder instead of the Box account root so
    tests do not crawl every file in the account.
    """

    def _connector_factory(
        user_key: str = "admin",
        include_all_files: bool = False,
        folder_ids: str | None = None,
    ) -> BoxConnector:
        print(f"Creating BoxConnector with JWT credentials for user: {user_key}")

        # Scope "everything" requests down to the dedicated test folder when
        # one is configured, to avoid indexing the entire account.
        test_parent_folder_id = os.environ.get("BOX_TEST_PARENT_FOLDER_ID")
        scope_to_test_folder = bool(include_all_files and test_parent_folder_id)
        if scope_to_test_folder:
            print(
                f"Scoping include_all_files to test parent folder: {test_parent_folder_id}"
            )

        connector = BoxConnector(
            include_all_files=False if scope_to_test_folder else include_all_files,
            folder_ids=test_parent_folder_id if scope_to_test_folder else folder_ids,
        )
        connector.load_credentials(get_credentials_from_env(user_key))
        return connector

    return _connector_factory
|
||||
|
||||
|
||||
@pytest.fixture
def box_connector() -> BoxConnector:
    """Return a Box connector configured to index every accessible file."""
    connector = BoxConnector(include_all_files=True, folder_ids=None)
    return connector
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
def set_resource_limits() -> None:
    """Raise the open-file (RLIMIT_NOFILE) soft limit for the test session.

    Like the Google Drive tests, this guards against an SDK that is
    aggressive about file descriptors: the soft limit is lifted to at least
    RLIMIT_MINIMUM, capped by the process hard limit.
    """
    RLIMIT_MINIMUM = 2048
    soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
    target = min(RLIMIT_MINIMUM, hard_limit)

    print(
        f"Open file limit: soft={soft_limit} hard={hard_limit} soft_required={RLIMIT_MINIMUM}"
    )

    if soft_limit >= target:
        return

    print(f"Raising open file limit: {soft_limit} -> {target}")
    resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard_limit))

    soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
    print(f"New open file limit: soft={soft_limit} hard={hard_limit}")
|
||||
300
backend/tests/daily/connectors/box/consts_and_utils.py
Normal file
300
backend/tests/daily/connectors/box/consts_and_utils.py
Normal file
@@ -0,0 +1,300 @@
|
||||
import time
|
||||
from collections.abc import Sequence
|
||||
|
||||
from onyx.connectors.box.connector import BoxConnector
|
||||
from onyx.connectors.models import ConnectorFailure
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import TextSection
|
||||
from tests.daily.connectors.utils import load_all_docs_from_checkpoint_connector
|
||||
from tests.daily.connectors.utils import load_everything_from_checkpoint_connector
|
||||
|
||||
# File ID ranges for different test scenarios.
# These should match actual file IDs in the test Box account; files follow
# the file_name_template / file_text_template naming below.
ALL_FILES = list(range(0, 60))
ROOT_FOLDER_FILES = list(range(0, 10))

ADMIN_FILE_IDS = list(range(0, 5))
ADMIN_FOLDER_3_FILE_IDS = list(range(65, 70))  # This folder is shared with test_user_1
TEST_USER_1_FILE_IDS = list(range(5, 10))
TEST_USER_2_FILE_IDS = list(range(10, 15))
TEST_USER_3_FILE_IDS = list(range(15, 20))
FOLDER_1_FILE_IDS = list(range(25, 30))
FOLDER_1_1_FILE_IDS = list(range(30, 35))
FOLDER_1_2_FILE_IDS = list(range(35, 40))  # This folder is public
FOLDER_2_FILE_IDS = list(range(45, 50))
FOLDER_2_1_FILE_IDS = list(range(50, 55))
FOLDER_2_2_FILE_IDS = list(range(55, 60))
SECTIONS_FILE_IDS = [61]
FOLDER_3_FILE_IDS = list(range(62, 65))

DOWNLOAD_REVOKED_FILE_ID = 21

# IDs that any user should be able to see (public folder + individually
# shared public files).
PUBLIC_FOLDER_RANGE = FOLDER_1_2_FILE_IDS
PUBLIC_FILE_IDS = list(range(55, 57))
PUBLIC_RANGE = PUBLIC_FOLDER_RANGE + PUBLIC_FILE_IDS

# Box folder IDs (these should match actual folder IDs in the test account;
# the setup script rewrites them with real values).
FOLDER_1_ID = "360287594085"  # Replace with actual folder ID
FOLDER_1_1_ID = "360286151062"  # Replace with actual folder ID
FOLDER_1_2_ID = "360285966218"  # Replace with actual folder ID
FOLDER_2_ID = "360288222616"  # Replace with actual folder ID
FOLDER_2_1_ID = "360287577597"  # Replace with actual folder ID
FOLDER_2_2_ID = "360286012378"  # Replace with actual folder ID
FOLDER_3_ID = "360285724765"  # Replace with actual folder ID
ADMIN_FOLDER_3_ID = "360286714903"  # Admin's Folder 3 (shared with test_user_1)
SECTIONS_FOLDER_ID = "360288138769"  # Replace with actual folder ID

# Box folder URLs derived from the folder IDs above.
FOLDER_1_URL = f"https://app.box.com/folder/{FOLDER_1_ID}"
FOLDER_1_1_URL = f"https://app.box.com/folder/{FOLDER_1_1_ID}"
FOLDER_1_2_URL = f"https://app.box.com/folder/{FOLDER_1_2_ID}"
FOLDER_2_URL = f"https://app.box.com/folder/{FOLDER_2_ID}"
FOLDER_2_1_URL = f"https://app.box.com/folder/{FOLDER_2_1_ID}"
FOLDER_2_2_URL = f"https://app.box.com/folder/{FOLDER_2_2_ID}"
FOLDER_3_URL = f"https://app.box.com/folder/{FOLDER_3_ID}"
ADMIN_FOLDER_3_URL = f"https://app.box.com/folder/{ADMIN_FOLDER_3_ID}"
SECTIONS_FOLDER_URL = f"https://app.box.com/folder/{SECTIONS_FOLDER_ID}"

RESTRICTED_ACCESS_FOLDER_ID = "123456797"  # Replace with actual folder ID
RESTRICTED_ACCESS_FOLDER_URL = (
    f"https://app.box.com/folder/{RESTRICTED_ACCESS_FOLDER_ID}"
)

# User IDs (these should match actual Box user IDs).
ADMIN_USER_ID = "13089353657"  # Replace with actual user ID
TEST_USER_1_ID = "48129700105"  # Replace with actual user ID
TEST_USER_2_ID = "48129680809"  # Replace with actual user ID
TEST_USER_3_ID = "48129580359"  # Replace with actual user ID

# Dictionary for access permissions: maps a Box user ID to the file IDs that
# user is expected to retrieve. All users have access to their own files as
# well as public files.
ACCESS_MAPPING: dict[str, list[int]] = {
    # Admin has access to everything in the test parent folder
    ADMIN_USER_ID: (
        ADMIN_FILE_IDS
        + ADMIN_FOLDER_3_FILE_IDS
        + FOLDER_1_FILE_IDS
        + FOLDER_1_1_FILE_IDS
        + FOLDER_1_2_FILE_IDS
        + FOLDER_2_FILE_IDS
        + FOLDER_2_1_FILE_IDS
        + FOLDER_2_2_FILE_IDS
        + FOLDER_3_FILE_IDS
        + SECTIONS_FILE_IDS
        # Admin can also see all test user files in the test parent folder
        + TEST_USER_1_FILE_IDS
        + TEST_USER_2_FILE_IDS
        + TEST_USER_3_FILE_IDS
    ),
    TEST_USER_1_ID: (
        TEST_USER_1_FILE_IDS
        # This user has access to folder 1
        + FOLDER_1_FILE_IDS
        + FOLDER_1_1_FILE_IDS
        + FOLDER_1_2_FILE_IDS
        # This user has been given shared access to Admin's Folder 3
        + ADMIN_FOLDER_3_FILE_IDS
        # This user has been given shared access to files 0 and 1 in Admin's root
        + list(range(0, 2))
        # When scoped to test parent folder, user can see all subfolders
        # So they can also see FOLDER_3 and other folders
        + FOLDER_3_FILE_IDS
        + SECTIONS_FILE_IDS
        # They can also see files 2-4, 10-19 from other users' folders
        # because they have access to the test parent folder
        + list(range(2, 5))
        + list(range(10, 20))
        + FOLDER_2_FILE_IDS
        + FOLDER_2_1_FILE_IDS
        + FOLDER_2_2_FILE_IDS
    ),
    TEST_USER_2_ID: (
        TEST_USER_2_FILE_IDS
        # Group 1 includes this user, giving access to folder 1
        + FOLDER_1_FILE_IDS
        + FOLDER_1_1_FILE_IDS
        # This folder is public
        + FOLDER_1_2_FILE_IDS
        # Folder 2-1 is shared with this user
        + FOLDER_2_1_FILE_IDS
        # This user has been given shared access to files 45 and 46 in folder 2
        + list(range(45, 47))
    ),
    # When include_all_files=True is scoped to test parent folder,
    # all users can see all subfolders (Box behavior when user has access to parent folder)
    TEST_USER_3_ID: (
        TEST_USER_3_FILE_IDS
        + FOLDER_1_FILE_IDS
        + FOLDER_1_1_FILE_IDS
        + FOLDER_1_2_FILE_IDS
        + FOLDER_2_FILE_IDS
        + FOLDER_2_1_FILE_IDS
        + FOLDER_2_2_FILE_IDS
        + FOLDER_3_FILE_IDS
        + SECTIONS_FILE_IDS
        + ADMIN_FILE_IDS
        + TEST_USER_1_FILE_IDS
        + TEST_USER_2_FILE_IDS
        + ADMIN_FOLDER_3_FILE_IDS
    ),
}

# Files whose body text does not follow file_text_template (e.g. the
# multi-section document used by the sections tests).
SPECIAL_FILE_ID_TO_CONTENT_MAP: dict[int, str] = {
    61: (
        "Title\n"
        "This is a Box document with sections - "
        "Section 1\n"
        "Section 1 content - "
        "Sub-Section 1-1\n"
        "Sub-Section 1-1 content - "
        "Sub-Section 1-2\n"
        "Sub-Section 1-2 content - "
        "Section 2\n"
        "Section 2 content"
    ),
}

# Naming/content templates for the generated test files.
file_name_template = "file_{}.txt"
file_text_template = "This is file {}"

# This is done to prevent different tests from interfering with each other
# So each test type should have its own valid prefix
_VALID_PREFIX = "file_"
|
||||
|
||||
|
||||
def filter_invalid_prefixes(names: set[str]) -> set[str]:
    """Keep only the file names carrying the test-owned prefix."""
    valid: set[str] = set()
    for candidate in names:
        if candidate.startswith(_VALID_PREFIX):
            valid.add(candidate)
    return valid
|
||||
|
||||
|
||||
def print_discrepancies(
    expected: set[str],
    retrieved: set[str],
) -> None:
    """Dump both name sets plus their symmetric differences when they differ."""
    if expected == retrieved:
        return
    print(sorted(expected))
    print(sorted(retrieved))
    print("Extra:")
    print(sorted(retrieved - expected))
    print("Missing:")
    print(sorted(expected - retrieved))
|
||||
|
||||
|
||||
def _get_expected_file_content(file_id: int) -> str:
    """Expected body text for the test file with this numeric ID.

    Special files (e.g. the sections document) override the default template.
    """
    special = SPECIAL_FILE_ID_TO_CONTENT_MAP.get(file_id)
    if special is not None:
        return special
    return file_text_template.format(file_id)
|
||||
|
||||
|
||||
def id_to_name(file_id: int) -> str:
    """Map a numeric test file ID to its expected on-Box filename."""
    name = file_name_template.format(file_id)
    return name
|
||||
|
||||
|
||||
def assert_expected_docs_in_retrieved_docs(
    retrieved_docs: list[Document],
    expected_file_ids: Sequence[int],
) -> None:
    """
    Assert that expected file IDs are present in retrieved documents.

    NOTE: This asserts for an exact match after filtering to valid prefixes.
    It filters retrieved docs to those with the valid prefix, then asserts
    that the expected file names and texts exactly match the filtered retrieved docs.

    Raises:
        AssertionError: on duplicate documents, name-set mismatch, text-set
            mismatch, or a name whose content does not match its expected text.
    """
    expected_file_names = {id_to_name(file_id) for file_id in expected_file_ids}
    expected_file_texts = {
        _get_expected_file_content(file_id) for file_id in expected_file_ids
    }

    # Sort in place so the debug print below is deterministic across runs.
    retrieved_docs.sort(key=lambda x: x.semantic_identifier)

    for doc in retrieved_docs:
        print(f"retrieved doc: doc.semantic_identifier={doc.semantic_identifier}")

    # Filter out invalid prefixes to prevent different tests from interfering with each other
    valid_retrieved_docs = [
        doc
        for doc in retrieved_docs
        if doc.semantic_identifier.startswith(_VALID_PREFIX)
    ]

    # Check for duplicate semantic identifiers before building mapping
    # (a dict would silently collapse them and mask connector bugs).
    semantic_identifiers = [doc.semantic_identifier for doc in valid_retrieved_docs]
    seen_identifiers = set()
    duplicates = []
    for identifier in semantic_identifiers:
        if identifier in seen_identifiers:
            duplicates.append(identifier)
        seen_identifiers.add(identifier)
    if duplicates:
        raise AssertionError(
            f"Found duplicate semantic_identifiers in retrieved docs: {duplicates}. "
            f"This indicates a bug in the connector that returns the same document multiple times."
        )

    # Create mapping from file name to file text to detect mismatches
    retrieved_name_to_text: dict[str, str] = {}
    for doc in valid_retrieved_docs:
        # Join all text sections with " - " (same separator used by the
        # SPECIAL_FILE_ID_TO_CONTENT_MAP expected content).
        text = " - ".join(
            [
                section.text
                for section in doc.sections
                if isinstance(section, TextSection) and section.text is not None
            ]
        )
        retrieved_name_to_text[doc.semantic_identifier] = text

    valid_retrieved_file_names = set(retrieved_name_to_text.keys())
    valid_retrieved_texts = set(retrieved_name_to_text.values())

    # Check file names
    print_discrepancies(
        expected=expected_file_names,
        retrieved=valid_retrieved_file_names,
    )
    assert expected_file_names == valid_retrieved_file_names

    # Check file texts
    print_discrepancies(
        expected=expected_file_texts,
        retrieved=valid_retrieved_texts,
    )
    assert expected_file_texts == valid_retrieved_texts

    # Verify that each file name has the correct corresponding text
    # (This prevents swapped or mismatched file content per name from passing)
    for file_id in expected_file_ids:
        expected_name = id_to_name(file_id)
        expected_text = _get_expected_file_content(file_id)
        if expected_name in retrieved_name_to_text:
            retrieved_text = retrieved_name_to_text[expected_name]
            assert retrieved_text == expected_text, (
                f"File {expected_name} has incorrect content. "
                f"Expected: {expected_text}, Got: {retrieved_text}"
            )
|
||||
|
||||
|
||||
def load_all_docs(connector: BoxConnector) -> list[Document]:
    """Drain every document from *connector* over the full time range (epoch to now)."""
    end = time.time()
    return load_all_docs_from_checkpoint_connector(connector, 0, end)
|
||||
|
||||
|
||||
def load_all_docs_with_failures(
    connector: BoxConnector,
) -> list[Document | ConnectorFailure]:
    """Drain documents AND connector failures over the full time range (epoch to now)."""
    end = time.time()
    return load_everything_from_checkpoint_connector(connector, 0, end)
|
||||
374
backend/tests/daily/connectors/box/test_basic.py
Normal file
374
backend/tests/daily/connectors/box/test_basic.py
Normal file
@@ -0,0 +1,374 @@
|
||||
"""Basic tests for Box connector."""
|
||||
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
from unittest.mock import MagicMock
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from onyx.connectors.box.connector import BoxConnector
|
||||
from tests.daily.connectors.box.consts_and_utils import ADMIN_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import ADMIN_FOLDER_3_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import (
|
||||
assert_expected_docs_in_retrieved_docs,
|
||||
)
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_1_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_2_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_ID
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_URL
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_2_1_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_2_2_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_2_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_2_ID
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_2_URL
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_3_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import load_all_docs
|
||||
from tests.daily.connectors.box.consts_and_utils import SECTIONS_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import TEST_USER_1_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import TEST_USER_2_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import TEST_USER_3_FILE_IDS
|
||||
|
||||
|
||||
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_include_all_files(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Test that include_all_files=True indexes everything from root."""
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=True,
        folder_ids=None,
    )
    retrieved_docs = load_all_docs(connector)

    # Should get everything accessible from root (test parent folder)
    expected_file_ids = (
        ADMIN_FILE_IDS
        + ADMIN_FOLDER_3_FILE_IDS
        + TEST_USER_1_FILE_IDS
        + TEST_USER_2_FILE_IDS
        + TEST_USER_3_FILE_IDS
        + FOLDER_1_FILE_IDS
        + FOLDER_1_1_FILE_IDS
        + FOLDER_1_2_FILE_IDS
        + FOLDER_2_FILE_IDS
        + FOLDER_2_1_FILE_IDS
        + FOLDER_2_2_FILE_IDS
        + FOLDER_3_FILE_IDS  # Folder 3 is in the test structure
        + SECTIONS_FILE_IDS
    )
    assert_expected_docs_in_retrieved_docs(
        retrieved_docs=retrieved_docs,
        expected_file_ids=expected_file_ids,
    )
|
||||
|
||||
|
||||
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_specific_folders(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Test that folder_ids with specific folder IDs works."""
    folder_ids = f"{FOLDER_1_ID},{FOLDER_2_ID}"
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=False,
        folder_ids=folder_ids,
    )
    retrieved_docs = load_all_docs(connector)

    # Should get files from folder 1 and folder 2 (including subfolders)
    expected_file_ids = (
        FOLDER_1_FILE_IDS
        + FOLDER_1_1_FILE_IDS
        + FOLDER_1_2_FILE_IDS
        + FOLDER_2_FILE_IDS
        + FOLDER_2_1_FILE_IDS
        + FOLDER_2_2_FILE_IDS
    )
    assert_expected_docs_in_retrieved_docs(
        retrieved_docs=retrieved_docs,
        expected_file_ids=expected_file_ids,
    )
|
||||
|
||||
|
||||
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_folder_urls(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Test that folder_ids with Box URLs extracts IDs correctly."""
    folder_urls = f"{FOLDER_1_URL},{FOLDER_2_URL}"
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=False,
        folder_ids=folder_urls,
    )
    retrieved_docs = load_all_docs(connector)

    # Should get files from folder 1 and folder 2 (including subfolders)
    expected_file_ids = (
        FOLDER_1_FILE_IDS
        + FOLDER_1_1_FILE_IDS
        + FOLDER_1_2_FILE_IDS
        + FOLDER_2_FILE_IDS
        + FOLDER_2_1_FILE_IDS
        + FOLDER_2_2_FILE_IDS
    )
    assert_expected_docs_in_retrieved_docs(
        retrieved_docs=retrieved_docs,
        expected_file_ids=expected_file_ids,
    )
|
||||
|
||||
|
||||
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_mixed_folder_ids_and_urls(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Test combination of folder IDs and URLs."""
    mixed_ids = f"{FOLDER_1_ID},{FOLDER_2_URL}"
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=False,
        folder_ids=mixed_ids,
    )
    retrieved_docs = load_all_docs(connector)

    # Should get files from both folders
    expected_file_ids = (
        FOLDER_1_FILE_IDS
        + FOLDER_1_1_FILE_IDS
        + FOLDER_1_2_FILE_IDS
        + FOLDER_2_FILE_IDS
        + FOLDER_2_1_FILE_IDS
        + FOLDER_2_2_FILE_IDS
    )
    assert_expected_docs_in_retrieved_docs(
        retrieved_docs=retrieved_docs,
        expected_file_ids=expected_file_ids,
    )
|
||||
|
||||
|
||||
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_single_folder(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Test indexing a single folder."""
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=False,
        folder_ids=FOLDER_1_ID,
    )
    retrieved_docs = load_all_docs(connector)

    # Should get files from folder 1 and its subfolders
    expected_file_ids = FOLDER_1_FILE_IDS + FOLDER_1_1_FILE_IDS + FOLDER_1_2_FILE_IDS
    assert_expected_docs_in_retrieved_docs(
        retrieved_docs=retrieved_docs,
        expected_file_ids=expected_file_ids,
    )
|
||||
|
||||
|
||||
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_nested_folders(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Test recursive folder traversal with deeply nested structure."""
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=False,
        folder_ids=FOLDER_2_ID,
    )
    retrieved_docs = load_all_docs(connector)

    # Should get files from folder 2 and all nested subfolders (2-1 and 2-2)
    expected_file_ids = FOLDER_2_FILE_IDS + FOLDER_2_1_FILE_IDS + FOLDER_2_2_FILE_IDS
    assert_expected_docs_in_retrieved_docs(
        retrieved_docs=retrieved_docs,
        expected_file_ids=expected_file_ids,
    )
|
||||
|
||||
|
||||
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_size_threshold(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """
    Test that size_threshold is applied correctly.

    Since all test files are small (< 1KB), this verifies the threshold
    doesn't block all files rather than testing exclusion of large files.
    """
    # NOTE: FOLDER_1_URL is already imported at module level; the previous
    # function-local re-import was redundant and has been removed.

    # Test with a reasonable size threshold (16KB) - test files are small text files
    connector_with_threshold = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=False,
        folder_ids=FOLDER_1_URL,
    )
    connector_with_threshold.size_threshold = 16384  # 16KB

    retrieved_docs = load_all_docs(connector_with_threshold)
    threshold_doc_names = {doc.semantic_identifier for doc in retrieved_docs}

    # With a 16KB threshold, all small test files should still be retrieved
    # (test files are small text files, typically < 1KB each)
    assert (
        len(retrieved_docs) > 0
    ), "Should retrieve at least some files with 16KB threshold"

    # Verify that files were retrieved (threshold didn't block all files)
    # Since test files are small, they should all pass the 16KB threshold
    assert len(threshold_doc_names) > 0, (
        f"With 16KB threshold, should retrieve files from {FOLDER_1_URL}. "
        f"Got {len(retrieved_docs)} documents."
    )
|
||||
|
||||
|
||||
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_checkpoint_resumption(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Test checkpointing and resuming from checkpoint."""
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=True,
        folder_ids=None,
    )

    # Create initial checkpoint
    checkpoint = connector.build_dummy_checkpoint()
    assert checkpoint is not None
    assert checkpoint.has_more is True

    # Load some documents
    from onyx.connectors.connector_runner import CheckpointOutputWrapper

    start_time = 0
    end_time = time.time()

    # Load first batch and get updated checkpoint
    first_checkpoint_file_count = len(checkpoint.all_retrieved_file_ids)
    doc_batch_generator = CheckpointOutputWrapper[BoxConnector]()(
        connector.load_from_checkpoint(start_time, end_time, checkpoint)
    )
    first_batch_docs = []
    for document, failure, next_checkpoint in doc_batch_generator:
        if failure is not None:
            raise RuntimeError(f"Failed to load documents: {failure}")
        if document is not None:
            first_batch_docs.append(document)
        if next_checkpoint is not None:
            checkpoint = next_checkpoint

    # Load a few more batches to verify checkpointing works
    all_docs = first_batch_docs.copy()
    max_iterations = 2  # Test a few batches to verify checkpointing
    iteration_count = 0
    while checkpoint.has_more and iteration_count < max_iterations:
        iteration_count += 1

        doc_batch_generator = CheckpointOutputWrapper[BoxConnector]()(
            connector.load_from_checkpoint(start_time, end_time, checkpoint)
        )
        batch_docs = []
        for document, failure, next_checkpoint in doc_batch_generator:
            if failure is not None:
                raise RuntimeError(f"Failed to load documents: {failure}")
            if document is not None:
                batch_docs.append(document)
            if next_checkpoint is not None:
                checkpoint = next_checkpoint

        all_docs.extend(batch_docs)
        if checkpoint.has_more:
            # Checkpoint should be updated with more file IDs
            assert len(checkpoint.all_retrieved_file_ids) > first_checkpoint_file_count

    # Verify we got documents and checkpointing is working
    assert len(all_docs) > 0, "Should have retrieved at least some documents"
    assert (
        len(checkpoint.all_retrieved_file_ids) > first_checkpoint_file_count
    ), "Checkpoint should be updated with retrieved file IDs"
|
||||
|
||||
|
||||
def test_connector_validation(
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Test validate_connector_settings()."""
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=True,
        folder_ids=None,
    )

    # Should not raise an exception
    connector.validate_connector_settings()
|
||||
|
||||
|
||||
def test_connector_initialization() -> None:
    """Test that Box connector can be initialized."""
    connector = BoxConnector(
        include_all_files=True,
        folder_ids=None,
    )
    assert connector is not None
    assert connector.include_all_files is True
    assert connector._requested_folder_ids == set()
|
||||
|
||||
|
||||
def test_connector_initialization_with_folder_ids() -> None:
    """Test that Box connector can be initialized with folder IDs."""
    folder_ids = "123,456"
    connector = BoxConnector(
        include_all_files=False,
        folder_ids=folder_ids,
    )
    assert connector is not None
    assert connector.include_all_files is False
    assert "123" in connector._requested_folder_ids
    assert "456" in connector._requested_folder_ids
|
||||
|
||||
|
||||
def test_connector_initialization_fails_without_config() -> None:
    """Test that Box connector fails to initialize without include_all_files or folder_ids."""
    from onyx.connectors.exceptions import ConnectorValidationError

    with pytest.raises(ConnectorValidationError):
        BoxConnector(
            include_all_files=False,
            folder_ids=None,
        )
|
||||
87
backend/tests/daily/connectors/box/test_error_handling.py
Normal file
87
backend/tests/daily/connectors/box/test_error_handling.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""Error handling tests for Box connector."""
|
||||
|
||||
from collections.abc import Callable
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from onyx.connectors.box.connector import BoxConnector
|
||||
from onyx.connectors.exceptions import ConnectorValidationError
|
||||
|
||||
|
||||
def test_connector_with_invalid_folder_id(
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Test that connector handles invalid folder IDs gracefully."""
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=False,
        folder_ids="999999999999",  # Invalid folder ID
    )

    # Should not raise during initialization
    assert connector is not None

    # Loading documents should handle the error gracefully
    from tests.daily.connectors.box.consts_and_utils import load_all_docs

    with patch(
        "onyx.file_processing.extract_file_text.get_unstructured_api_key",
        return_value=None,
    ):
        try:
            docs = load_all_docs(connector)
            # If no error, should return empty list for invalid folder ID
            assert isinstance(docs, list)
            # Assert that invalid folder ID returns empty result
            assert (
                len(docs) == 0
            ), f"Expected empty result for invalid folder ID, but got {len(docs)} documents"
        except Exception as e:
            # If error is raised, it should be a specific Box API error
            error_msg = str(e).lower()
            assert (
                "404" in error_msg
                or "not found" in error_msg
                or "not_found" in error_msg
            ), f"Unexpected error type: {error_msg}"
|
||||
|
||||
|
||||
def test_connector_with_malformed_url(
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Test that connector handles malformed URLs gracefully."""
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=False,
        folder_ids="https://invalid-url.com/folder/123",
    )

    # Should extract what it can or handle gracefully
    assert connector is not None
    # Verify that the connector extracted the folder ID from the URL (even if invalid)
    # The connector should extract "123" from the URL path
    assert "123" in connector._requested_folder_ids
|
||||
|
||||
|
||||
def test_connector_with_empty_folder_ids_string() -> None:
    """Test that connector raises validation error for empty folder_ids string."""
    with pytest.raises(ConnectorValidationError):
        BoxConnector(
            include_all_files=False,
            folder_ids="",
        )
|
||||
|
||||
|
||||
def test_connector_with_whitespace_folder_ids() -> None:
    """Test that connector handles whitespace-only folder_ids."""
    # Whitespace-only folder_ids get filtered out, but the connector still initializes
    # The validation happens in __init__ which checks if folder_ids is truthy (not empty string)
    # Since " , , " is truthy, it passes initial validation, but results in empty folder_ids
    # This is acceptable behavior - the connector will just have no folders to process
    connector = BoxConnector(
        include_all_files=False,
        folder_ids=" , , ",
    )
    # Connector initializes successfully
    assert connector is not None
|
||||
98
backend/tests/daily/connectors/box/test_map_test_ids.py
Normal file
98
backend/tests/daily/connectors/box/test_map_test_ids.py
Normal file
@@ -0,0 +1,98 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
"""Utility to generate mapping from test file IDs to actual Box file IDs."""
|
||||
|
||||
import os
|
||||
|
||||
from onyx.connectors.box.connector import BoxConnector
|
||||
from tests.daily.connectors.box.conftest import get_credentials_from_env
|
||||
from tests.daily.connectors.box.consts_and_utils import ADMIN_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import file_name_template
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_1_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_2_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_2_1_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_2_2_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_2_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_3_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import load_all_docs
|
||||
|
||||
|
||||
def generate_test_id_to_box_id_mapping() -> dict[int, str]:
    """
    Generate a mapping from test file IDs to actual Box file IDs.

    This is useful for writing tests that need to verify specific files
    are accessible to specific users.

    Returns:
        dict: Mapping from test file ID (int) to Box file URL (str)
    """
    # Set up the connector with real credentials
    # For tests, scope to test parent folder instead of root
    test_parent_folder_id = os.environ.get("BOX_TEST_PARENT_FOLDER_ID")
    if test_parent_folder_id:
        connector = BoxConnector(
            include_all_files=False,
            folder_ids=test_parent_folder_id,
        )
    else:
        connector = BoxConnector(
            include_all_files=True,
            folder_ids=None,
        )

    # Load credentials
    connector.load_credentials(get_credentials_from_env("admin"))

    # Get all documents from the connector
    docs = load_all_docs(connector)

    # Create a mapping from test file ID to actual Box file URL
    test_id_to_box_id = {}

    # Process all documents retrieved from Box
    for doc in docs:
        # Check if this document's name matches our test file naming pattern (file_X.txt)
        if not doc.semantic_identifier.startswith(
            file_name_template.format("").split("_")[0]
        ):
            continue

        try:
            # Extract the test file ID from the filename (file_X.txt -> X)
            file_id_str = doc.semantic_identifier.split("_")[1].split(".")[0]
            test_file_id = int(file_id_str)

            # Store the mapping from test ID to actual Box file URL
            # Box document IDs are URLs
            test_id_to_box_id[test_file_id] = doc.id
        except (ValueError, IndexError):
            # Skip files that don't follow our naming convention
            continue

    # Print the mapping for all defined test file ID ranges
    all_test_ranges = {
        "ADMIN_FILE_IDS": ADMIN_FILE_IDS,
        "FOLDER_1_FILE_IDS": FOLDER_1_FILE_IDS,
        "FOLDER_1_1_FILE_IDS": FOLDER_1_1_FILE_IDS,
        "FOLDER_1_2_FILE_IDS": FOLDER_1_2_FILE_IDS,
        "FOLDER_2_FILE_IDS": FOLDER_2_FILE_IDS,
        "FOLDER_2_1_FILE_IDS": FOLDER_2_1_FILE_IDS,
        "FOLDER_2_2_FILE_IDS": FOLDER_2_2_FILE_IDS,
        "FOLDER_3_FILE_IDS": FOLDER_3_FILE_IDS,
    }

    # Print the mapping for each test range
    for range_name, file_ids in all_test_ranges.items():
        print(f"\n{range_name}:")
        for test_id in file_ids:
            box_id = test_id_to_box_id.get(test_id, "NOT_FOUND")
            print(f"  {test_id} -> {box_id}")

    return test_id_to_box_id
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this script directly to generate mappings
    generate_test_id_to_box_id_mapping()
|
||||
290
backend/tests/daily/connectors/box/test_perm_sync.py
Normal file
290
backend/tests/daily/connectors/box/test_perm_sync.py
Normal file
@@ -0,0 +1,290 @@
|
||||
"""Permission sync tests for Box connector."""
|
||||
|
||||
import copy
|
||||
import json
|
||||
from collections.abc import Callable
|
||||
from unittest.mock import MagicMock
|
||||
from unittest.mock import patch
|
||||
|
||||
from ee.onyx.external_permissions.box.doc_sync import box_doc_sync
|
||||
from onyx.connectors.box.connector import BoxConnector
|
||||
from onyx.db.models import ConnectorCredentialPair
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from tests.daily.connectors.box.consts_and_utils import ACCESS_MAPPING
|
||||
from tests.daily.connectors.box.consts_and_utils import PUBLIC_RANGE
|
||||
|
||||
|
||||
def _build_connector(
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> BoxConnector:
    """Build a Box connector for permission sync testing."""
    return box_jwt_connector_factory(
        user_key="admin",
        include_all_files=True,
        folder_ids=None,
    )
|
||||
|
||||
|
||||
def test_box_perm_sync_with_real_data(
    box_jwt_connector_factory: Callable[..., BoxConnector],
    set_ee_on: None,
) -> None:
    """
    Test box_doc_sync with real data from the test Box account.

    This test uses the real connector to make actual API calls to Box
    and verifies the permission structure returned.
    """
    # Create a mock cc_pair that will use our real connector
    # For tests, scope to test parent folder instead of root
    import os
    from tests.daily.connectors.box.conftest import get_credentials_from_env

    test_parent_folder_id = os.environ.get("BOX_TEST_PARENT_FOLDER_ID")
    mock_cc_pair = MagicMock(spec=ConnectorCredentialPair)
    mock_cc_pair.connector = MagicMock()
    if test_parent_folder_id:
        mock_cc_pair.connector.connector_specific_config = {
            "include_all_files": False,
            "folder_ids": test_parent_folder_id,
        }
    else:
        mock_cc_pair.connector.connector_specific_config = {
            "include_all_files": True,
            "folder_ids": None,
        }
    mock_cc_pair.credential_id = 1
    # Use real credentials from environment
    mock_cc_pair.credential.credential_json = get_credentials_from_env("admin")
    mock_cc_pair.last_time_perm_sync = None

    # Create a mock heartbeat
    mock_heartbeat = MagicMock(spec=IndexingHeartbeatInterface)
    mock_heartbeat.should_stop.return_value = False

    # Load box_id_mapping.json if it exists
    mapping_file = os.path.join(os.path.dirname(__file__), "box_id_mapping.json")
    url_to_id_mapping: dict[str, int] = {}
    if os.path.exists(mapping_file):
        with open(mapping_file, "r") as f:
            box_id_mapping = json.load(f)
        # Invert the mapping to get URL -> ID
        url_to_id_mapping = {url: int(id) for id, url in box_id_mapping.items()}

    # Use the connector directly without mocking Box API calls
    # Create a connector factory that respects the test scoping
    def connector_factory(**kwargs):
        # Use the connector_specific_config from mock_cc_pair to respect test scoping
        config = mock_cc_pair.connector.connector_specific_config
        return box_jwt_connector_factory(
            user_key="admin",
            include_all_files=config.get("include_all_files", True),
            folder_ids=config.get("folder_ids", None),
        )

    with patch(
        "ee.onyx.external_permissions.box.doc_sync.BoxConnector",
        side_effect=connector_factory,
    ):
        # Call the function under test
        mock_fetch_all_docs_fn = MagicMock(return_value=[])
        mock_fetch_all_docs_ids_fn = MagicMock(return_value=[])

        doc_access_generator = box_doc_sync(
            mock_cc_pair,
            mock_fetch_all_docs_fn,
            mock_fetch_all_docs_ids_fn,
            mock_heartbeat,
        )
        doc_access_list = list(doc_access_generator)

    # Verify we got some results
    assert len(doc_access_list) > 0
    print(f"Found {len(doc_access_list)} documents with permissions")

    # Map documents to their permissions
    doc_to_user_id_mapping: dict[str, set[str]] = {}
    doc_to_raw_result_mapping: dict[str, set[str]] = {}
    public_doc_ids: set[str] = set()

    for doc_access in doc_access_list:
        doc_id = doc_access.doc_id
        # make sure they are new sets to avoid mutating the original
        doc_to_user_id_mapping[doc_id] = copy.deepcopy(
            doc_access.external_access.external_user_emails
        )
        doc_to_raw_result_mapping[doc_id] = copy.deepcopy(
            doc_access.external_access.external_user_emails
        )

        # Box uses user emails directly, not groups like Google Drive
        # But we may have group IDs that need to be resolved
        for group_id in doc_access.external_access.external_user_group_ids:
            # For Box, group IDs might need to be resolved to user emails
            # This would require additional group sync functionality
            doc_to_raw_result_mapping[doc_id].add(group_id)

        if doc_access.external_access.is_public:
            public_doc_ids.add(doc_id)

    # Check permissions based on box_id_mapping.json and ACCESS_MAPPING
    # For each document URL that exists in our mapping
    checked_files = 0
    for doc_id, user_ids_with_access in doc_to_user_id_mapping.items():
        # Skip URLs that aren't in our mapping, we don't want new stuff to interfere
        # with the test.
        if doc_id not in url_to_id_mapping:
            continue

        file_numeric_id = url_to_id_mapping.get(doc_id)
        if file_numeric_id is None:
            raise ValueError(f"File {doc_id} not found in box_id_mapping.json")

        checked_files += 1

        # Check which users should have access to this file according to ACCESS_MAPPING
        # Note: ACCESS_MAPPING uses user IDs (e.g., "13089353657"), but Box permissions
        # return user emails (e.g., "admin@onyx-test.com"). We need to verify access
        # by checking that the expected number of users have access, rather than
        # exact email matching (which would require a user ID to email mapping).
        expected_user_count = 0
        for user_id, file_ids in ACCESS_MAPPING.items():
            if file_numeric_id in file_ids:
                expected_user_count += 1

        # Verify the permissions match
        if file_numeric_id in PUBLIC_RANGE:
            # Public files should be marked as public
            assert (
                doc_id in public_doc_ids
            ), f"File {doc_id} (ID: {file_numeric_id}) should be public but is not in the public_doc_ids set"
            # Public files may have additional user access, so we just verify it's marked public
        else:
            # Non-public files should have at least the expected number of users with access
            # Note: We can't do exact email matching without a user ID to email mapping,
            # but we can verify that files have the expected level of access
            # Check both user emails and group IDs (files may have group-only permissions)
            has_user_access = len(user_ids_with_access) > 0
            has_group_access = len(doc_to_raw_result_mapping[doc_id]) > len(
                user_ids_with_access
            )
            assert has_user_access or has_group_access, (
                f"File {doc_id} (ID: {file_numeric_id}) should have some access "
                f"(user emails or group IDs) but has none. "
                f"User emails: {user_ids_with_access}, "
                f"Raw result (includes groups): {doc_to_raw_result_mapping[doc_id]}"
            )

            # Verify that the number of users with access is at least the expected count
            # (some files may have additional access beyond what's in ACCESS_MAPPING)
            assert len(user_ids_with_access) >= expected_user_count, (
                f"File {doc_id} (ID: {file_numeric_id}) should have access for at least "
                f"{expected_user_count} user(s) according to ACCESS_MAPPING, "
                f"but only {len(user_ids_with_access)} user(s) have access. "
                f"Users with access: {user_ids_with_access}. "
                f"Raw result: {doc_to_raw_result_mapping[doc_id]}"
            )

            # Log the access for debugging (helps identify permission issues)
            if len(user_ids_with_access) != expected_user_count:
                print(
                    f"Note: File {doc_id} (ID: {file_numeric_id}) has {len(user_ids_with_access)} "
                    f"users with access, expected {expected_user_count} from ACCESS_MAPPING. "
                    f"This may be due to additional sharing or group permissions."
                )

    if checked_files > 0:
        print(f"Checked permissions for {checked_files} files from box_id_mapping.json")
    else:
        # Fail the test if no files were checked - this indicates either:
        # 1. box_id_mapping.json is missing, or
        # 2. No doc_ids from the sync matched the mapping (potential sync issue)
        raise AssertionError(
            "No files checked. This test requires box_id_mapping.json to exist and "
            "doc_ids from box_doc_sync to match entries in the mapping. "
            "Run test_map_test_ids.py to generate box_id_mapping.json."
        )
|
||||
|
||||
|
||||
def test_slim_document_generation(
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Test slim document generation for permission sync."""
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=True,
        folder_ids=None,
    )

    # Test that retrieve_all_slim_docs_perm_sync works
    # Add safety limit to prevent infinite loops
    slim_doc_generator = connector.retrieve_all_slim_docs_perm_sync()

    slim_doc_batches = []
    max_iterations = 1000  # Safety limit
    iteration_count = 0

    for batch in slim_doc_generator:
        slim_doc_batches.append(batch)
        iteration_count += 1
        if iteration_count >= max_iterations:
            raise RuntimeError(
                f"Test hit safety limit of {max_iterations} iterations. "
                "This suggests an infinite loop."
            )

    # Should get some slim documents
    assert len(slim_doc_batches) > 0

    # Each batch should contain slim documents
    for batch in slim_doc_batches:
        assert len(batch) > 0
        for slim_doc in batch:
            assert slim_doc.id is not None
            # External access may or may not be present
            # depending on whether permissions were fetched
|
||||
|
||||
def test_permission_sync_checkpointing(
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Test permission sync with checkpointing."""
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=True,
        folder_ids=None,
    )

    # Load slim docs with checkpointing using the proper method
    import time

    start_time = 0
    end_time = time.time()

    # Use retrieve_all_slim_docs_perm_sync which properly handles checkpointing
    slim_doc_generator = connector.retrieve_all_slim_docs_perm_sync(
        start=start_time,
        end=end_time,
        callback=None,
    )

    # Collect batches with a safety limit to prevent infinite loops
    slim_doc_batches = []
    max_iterations = 1000  # Safety limit
    iteration_count = 0

    for batch in slim_doc_generator:
        slim_doc_batches.append(batch)
        iteration_count += 1
        if iteration_count >= max_iterations:
            # If we hit the limit, something is wrong
            raise RuntimeError(
                f"Test hit safety limit of {max_iterations} iterations. "
                "This suggests an infinite loop or checkpoint not updating properly."
            )

    # Should get some documents
    assert len(slim_doc_batches) > 0
    # Verify we got some slim documents
    total_docs = sum(len(batch) for batch in slim_doc_batches)
    assert total_docs > 0, "Should have retrieved at least one slim document"
|
||||
286
backend/tests/daily/connectors/box/test_permissions.py
Normal file
286
backend/tests/daily/connectors/box/test_permissions.py
Normal file
@@ -0,0 +1,286 @@
|
||||
"""Permission and access tests for Box connector."""
|
||||
|
||||
from collections.abc import Callable
|
||||
from unittest.mock import MagicMock
|
||||
from unittest.mock import patch
|
||||
|
||||
from box_sdk_gen.box import BoxAPIError
|
||||
|
||||
from onyx.connectors.box.connector import BoxConnector
|
||||
from tests.daily.connectors.box.consts_and_utils import ACCESS_MAPPING
|
||||
from tests.daily.connectors.box.consts_and_utils import ADMIN_FOLDER_3_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import ADMIN_FOLDER_3_URL
|
||||
from tests.daily.connectors.box.consts_and_utils import ADMIN_USER_ID
|
||||
from tests.daily.connectors.box.consts_and_utils import (
|
||||
assert_expected_docs_in_retrieved_docs,
|
||||
)
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_1_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_2_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_FILE_IDS
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_URL
|
||||
from tests.daily.connectors.box.consts_and_utils import load_all_docs
|
||||
from tests.daily.connectors.box.consts_and_utils import TEST_USER_1_ID
|
||||
|
||||
|
||||
@patch(
|
||||
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
|
||||
return_value=None,
|
||||
)
|
||||
def test_user_access_mapping(
|
||||
mock_get_api_key: MagicMock,
|
||||
box_jwt_connector_factory: Callable[..., BoxConnector],
|
||||
) -> None:
|
||||
"""Test that files are only accessible to users with permissions."""
|
||||
# Test with admin user - should have access to everything
|
||||
admin_connector = box_jwt_connector_factory(
|
||||
user_key="admin",
|
||||
include_all_files=True,
|
||||
folder_ids=None,
|
||||
)
|
||||
admin_docs = load_all_docs(admin_connector)
|
||||
admin_file_ids = list(ACCESS_MAPPING[ADMIN_USER_ID])
|
||||
assert_expected_docs_in_retrieved_docs(
|
||||
retrieved_docs=admin_docs,
|
||||
expected_file_ids=admin_file_ids,
|
||||
)
|
||||
|
||||
# Test with test_user_1 - should have limited access
|
||||
user1_connector = box_jwt_connector_factory(
|
||||
user_key="test_user_1",
|
||||
include_all_files=True,
|
||||
folder_ids=None,
|
||||
)
|
||||
user1_docs = load_all_docs(user1_connector)
|
||||
user1_file_ids = list(ACCESS_MAPPING[TEST_USER_1_ID])
|
||||
assert_expected_docs_in_retrieved_docs(
|
||||
retrieved_docs=user1_docs,
|
||||
expected_file_ids=user1_file_ids,
|
||||
)
|
||||
|
||||
# Verify that user1's expected files are a subset of admin's expected files
|
||||
# (When scoped to test parent folder, all users can see all subfolders)
|
||||
assert set(user1_file_ids).issubset(set(admin_file_ids))
|
||||
|
||||
|
||||
@patch(
|
||||
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
|
||||
return_value=None,
|
||||
)
|
||||
def test_public_files(
|
||||
mock_get_api_key: MagicMock,
|
||||
box_jwt_connector_factory: Callable[..., BoxConnector],
|
||||
) -> None:
|
||||
"""Test that public files are accessible to all users."""
|
||||
from tests.daily.connectors.box.consts_and_utils import PUBLIC_RANGE
|
||||
from tests.daily.connectors.box.consts_and_utils import id_to_name
|
||||
|
||||
# Test with admin
|
||||
admin_connector = box_jwt_connector_factory(
|
||||
user_key="admin",
|
||||
include_all_files=True,
|
||||
folder_ids=None,
|
||||
)
|
||||
admin_docs = load_all_docs(admin_connector)
|
||||
admin_file_names = {doc.semantic_identifier for doc in admin_docs}
|
||||
|
||||
# Test with test_user_3 (most restricted user)
|
||||
# Use FOLDER_1_2 which is public and accessible to all users
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_2_URL
|
||||
|
||||
user3_connector = box_jwt_connector_factory(
|
||||
user_key="test_user_3",
|
||||
include_all_files=False,
|
||||
folder_ids=FOLDER_1_2_URL,
|
||||
)
|
||||
user3_docs = load_all_docs(user3_connector)
|
||||
user3_file_names = {doc.semantic_identifier for doc in user3_docs}
|
||||
|
||||
# Verify that public files are accessible to both users
|
||||
# PUBLIC_RANGE includes FOLDER_1_2_FILE_IDS (public folder) and PUBLIC_FILE_IDS
|
||||
# test_user_3 only has access to FOLDER_1_2, so we verify that subset
|
||||
expected_public_file_names = {id_to_name(file_id) for file_id in PUBLIC_RANGE}
|
||||
|
||||
admin_public_files = admin_file_names & expected_public_file_names
|
||||
user3_public_files = user3_file_names & expected_public_file_names
|
||||
|
||||
# Verify test_user_3 has access to the public folder files
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_2_FILE_IDS
|
||||
|
||||
expected_folder_1_2_names = {id_to_name(file_id) for file_id in FOLDER_1_2_FILE_IDS}
|
||||
|
||||
# test_user_3 should have access to all files in the public folder
|
||||
assert expected_folder_1_2_names.issubset(user3_public_files), (
|
||||
f"test_user_3 should have access to all files in public folder FOLDER_1_2. "
|
||||
f"Expected: {expected_folder_1_2_names}, Got: {user3_public_files}"
|
||||
)
|
||||
|
||||
# Admin should also have access to the public folder files
|
||||
assert expected_folder_1_2_names.issubset(admin_public_files), (
|
||||
f"Admin should have access to all files in public folder FOLDER_1_2. "
|
||||
f"Expected: {expected_folder_1_2_names}, Got: {admin_public_files}"
|
||||
)
|
||||
|
||||
# At least some public files should exist
|
||||
assert len(user3_public_files) > 0, (
|
||||
f"test_user_3 should have access to at least some public files. "
|
||||
f"Got: {user3_public_files}"
|
||||
)
|
||||
|
||||
|
||||
@patch(
|
||||
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
|
||||
return_value=None,
|
||||
)
|
||||
def test_restricted_access(
|
||||
mock_get_api_key: MagicMock,
|
||||
box_jwt_connector_factory: Callable[..., BoxConnector],
|
||||
) -> None:
|
||||
"""Test files with restricted access."""
|
||||
# Test with admin - should have access
|
||||
admin_connector = box_jwt_connector_factory(
|
||||
user_key="admin",
|
||||
include_all_files=False,
|
||||
folder_ids=ADMIN_FOLDER_3_URL,
|
||||
)
|
||||
admin_docs = load_all_docs(admin_connector)
|
||||
assert len(admin_docs) > 0
|
||||
|
||||
# Test with test_user_3 - should not have access to admin's folder 3 (ADMIN_FOLDER_3)
|
||||
# The setup script explicitly removes test_user_3's access to ensure this test is useful
|
||||
user3_connector = box_jwt_connector_factory(
|
||||
user_key="test_user_3",
|
||||
include_all_files=False,
|
||||
folder_ids=ADMIN_FOLDER_3_URL,
|
||||
)
|
||||
# When a user doesn't have access, Box returns a 404 error
|
||||
try:
|
||||
user3_docs = load_all_docs(user3_connector)
|
||||
assert len(user3_docs) == 0, (
|
||||
f"test_user_3 should not have access to ADMIN_FOLDER_3, "
|
||||
f"but retrieved {len(user3_docs)} files. "
|
||||
f"Run setup script to ensure test_user_3's access is removed."
|
||||
)
|
||||
except BoxAPIError as e:
|
||||
# 404 error indicates no access (expected behavior)
|
||||
status_code = getattr(e, "status_code", None)
|
||||
if status_code != 404:
|
||||
raise
|
||||
except Exception as e:
|
||||
# For non-BoxAPIError exceptions, check if it's a wrapped 404
|
||||
# This handles cases where BoxAPIError might be wrapped
|
||||
error_msg = str(e).lower()
|
||||
if (
|
||||
"404" not in error_msg
|
||||
and "not found" not in error_msg
|
||||
and "not_found" not in error_msg
|
||||
):
|
||||
raise
|
||||
|
||||
|
||||
@patch(
|
||||
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
|
||||
return_value=None,
|
||||
)
|
||||
def test_collaboration_permissions(
|
||||
mock_get_api_key: MagicMock,
|
||||
box_jwt_connector_factory: Callable[..., BoxConnector],
|
||||
) -> None:
|
||||
"""Test Box collaboration permissions."""
|
||||
# Test that test_user_1 has access to admin's folder 3 (shared via collaboration)
|
||||
user1_connector = box_jwt_connector_factory(
|
||||
user_key="test_user_1",
|
||||
include_all_files=False,
|
||||
folder_ids=ADMIN_FOLDER_3_URL,
|
||||
)
|
||||
user1_docs = load_all_docs(user1_connector)
|
||||
# Should have access to files in admin's folder 3
|
||||
expected_file_ids = ADMIN_FOLDER_3_FILE_IDS
|
||||
assert_expected_docs_in_retrieved_docs(
|
||||
retrieved_docs=user1_docs,
|
||||
expected_file_ids=expected_file_ids,
|
||||
)
|
||||
|
||||
|
||||
@patch(
|
||||
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
|
||||
return_value=None,
|
||||
)
|
||||
def test_shared_folders(
|
||||
mock_get_api_key: MagicMock,
|
||||
box_jwt_connector_factory: Callable[..., BoxConnector],
|
||||
) -> None:
|
||||
"""Test files in shared folders."""
|
||||
# Test that test_user_2 has access to folder 1 (shared via group)
|
||||
user2_connector = box_jwt_connector_factory(
|
||||
user_key="test_user_2",
|
||||
include_all_files=False,
|
||||
folder_ids=FOLDER_1_URL,
|
||||
)
|
||||
user2_docs = load_all_docs(user2_connector)
|
||||
# Should have access to files in folder 1
|
||||
expected_file_ids = FOLDER_1_FILE_IDS + FOLDER_1_1_FILE_IDS + FOLDER_1_2_FILE_IDS
|
||||
assert_expected_docs_in_retrieved_docs(
|
||||
retrieved_docs=user2_docs,
|
||||
expected_file_ids=expected_file_ids,
|
||||
)
|
||||
|
||||
|
||||
@patch(
|
||||
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
|
||||
return_value=None,
|
||||
)
|
||||
def test_user_specific_access(
|
||||
mock_get_api_key: MagicMock,
|
||||
box_jwt_connector_factory: Callable[..., BoxConnector],
|
||||
) -> None:
|
||||
"""Test that users can only access their own files and shared files."""
|
||||
# Test with test_user_3 (most restricted)
|
||||
# test_user_3 should have access to public folder FOLDER_1_2
|
||||
# but should NOT have access to ADMIN_FOLDER_3 (restricted)
|
||||
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_2_URL
|
||||
|
||||
user3_connector = box_jwt_connector_factory(
|
||||
user_key="test_user_3",
|
||||
include_all_files=False,
|
||||
folder_ids=FOLDER_1_2_URL,
|
||||
)
|
||||
user3_docs = load_all_docs(user3_connector)
|
||||
# test_user_3 should have access to public folder FOLDER_1_2
|
||||
# Verify they can access the public files in that folder
|
||||
expected_file_ids = FOLDER_1_2_FILE_IDS # Public folder files
|
||||
assert_expected_docs_in_retrieved_docs(
|
||||
retrieved_docs=user3_docs,
|
||||
expected_file_ids=expected_file_ids,
|
||||
)
|
||||
|
||||
# Verify test_user_3 does NOT have access to ADMIN_FOLDER_3
|
||||
user3_restricted_connector = box_jwt_connector_factory(
|
||||
user_key="test_user_3",
|
||||
include_all_files=False,
|
||||
folder_ids=ADMIN_FOLDER_3_URL,
|
||||
)
|
||||
try:
|
||||
restricted_docs = load_all_docs(user3_restricted_connector)
|
||||
# If no exception, verify no documents were retrieved
|
||||
assert len(restricted_docs) == 0, (
|
||||
f"test_user_3 should NOT have access to ADMIN_FOLDER_3, "
|
||||
f"but retrieved {len(restricted_docs)} files: {[doc.semantic_identifier for doc in restricted_docs]}"
|
||||
)
|
||||
except BoxAPIError as e:
|
||||
# If a BoxAPIError is raised with 404, that means test_user_3
|
||||
# doesn't have access, which is what we want. The test passes.
|
||||
status_code = getattr(e, "status_code", None)
|
||||
if status_code != 404:
|
||||
# Unexpected status code, re-raise it
|
||||
raise
|
||||
except Exception as e:
|
||||
# For non-BoxAPIError exceptions, check if it's a wrapped 404
|
||||
# This handles cases where BoxAPIError might be wrapped
|
||||
error_msg = str(e).lower()
|
||||
if (
|
||||
"404" not in error_msg
|
||||
and "not found" not in error_msg
|
||||
and "not_found" not in error_msg
|
||||
):
|
||||
# Unexpected error, re-raise it
|
||||
raise
|
||||
@@ -201,6 +201,10 @@ LOG_ONYX_MODEL_INTERACTIONS=False
|
||||
# LINEAR_CLIENT_ID=
|
||||
# LINEAR_CLIENT_SECRET=
|
||||
|
||||
# Box testing
|
||||
# WARNING: If set, BOX_DEVELOPER_TOKEN overrides JWT authentication and uses developer token instead. FOR TESTING ONLY.
|
||||
# BOX_DEVELOPER_TOKEN=
|
||||
|
||||
## Miscellaneous
|
||||
# ONYX_QUERY_HISTORY_TYPE=
|
||||
# CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS=
|
||||
|
||||
@@ -115,6 +115,7 @@ backend = [
|
||||
"zulip==0.8.2",
|
||||
"hubspot-api-client==11.1.0",
|
||||
"asana==5.0.8",
|
||||
"boxsdk==10.3.0",
|
||||
"dropbox==12.0.2",
|
||||
"shapely==2.0.6",
|
||||
"stripe==10.12.0",
|
||||
|
||||
15
uv.lock
generated
15
uv.lock
generated
@@ -528,6 +528,19 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/5e/e0/4bfaa72002bbe7befb96e8af8e56e7398b58ef981941577818b1a671e7f7/botocore_stubs-1.40.74-py3-none-any.whl", hash = "sha256:4c215592a8c26f66e0af773b513f1a34437da2a6d0f53a04928bbba1b131c935", size = 66541, upload-time = "2025-11-14T21:23:24.697Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "boxsdk"
|
||||
version = "10.3.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "requests" },
|
||||
{ name = "requests-toolbelt" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/df/25/d859cc617d832506e80327a277b0e0cc7d1114d66e966fdab8b218ffaf17/boxsdk-10.3.0.tar.gz", hash = "sha256:5b8ec0e2ed70160e16fe2fc1240d3896c88d50bd30796b021e95cfbe977b3444", size = 272690, upload-time = "2025-12-19T11:31:15.369Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/05/af/fec6a530efdfc3d7739d821cdcb63de7c9979954fa21ef6d16d0b678c8ed/boxsdk-10.3.0-py3-none-any.whl", hash = "sha256:3f65792834315177765c096402e35f43400c4c99c9b6e82f9ac40c8de3da4767", size = 574729, upload-time = "2025-12-19T11:31:13.575Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "braintrust"
|
||||
version = "0.3.9"
|
||||
@@ -3465,6 +3478,7 @@ backend = [
|
||||
{ name = "beautifulsoup4" },
|
||||
{ name = "boto3" },
|
||||
{ name = "boto3-stubs", extra = ["s3"] },
|
||||
{ name = "boxsdk" },
|
||||
{ name = "braintrust" },
|
||||
{ name = "celery" },
|
||||
{ name = "chardet" },
|
||||
@@ -3619,6 +3633,7 @@ requires-dist = [
|
||||
{ name = "black", marker = "extra == 'dev'", specifier = "==25.1.0" },
|
||||
{ name = "boto3", marker = "extra == 'backend'", specifier = "==1.39.11" },
|
||||
{ name = "boto3-stubs", extras = ["s3"], marker = "extra == 'backend'", specifier = "==1.39.11" },
|
||||
{ name = "boxsdk", marker = "extra == 'backend'", specifier = "==10.3.0" },
|
||||
{ name = "braintrust", marker = "extra == 'backend'", specifier = "==0.3.9" },
|
||||
{ name = "brotli", specifier = ">=1.2.0" },
|
||||
{ name = "celery", marker = "extra == 'backend'", specifier = "==5.5.1" },
|
||||
|
||||
BIN
web/public/box.png
Normal file
BIN
web/public/box.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 6.5 KiB |
@@ -35,9 +35,11 @@ import {
|
||||
} from "@/lib/connectors/connectors";
|
||||
import Modal from "@/refresh-components/Modal";
|
||||
import { GmailMain } from "@/app/admin/connectors/[connector]/pages/gmail/GmailPage";
|
||||
import { BoxMain } from "@/app/admin/connectors/[connector]/pages/box/BoxPage";
|
||||
import {
|
||||
useGmailCredentials,
|
||||
useGoogleDriveCredentials,
|
||||
useBoxCredentials,
|
||||
} from "@/app/admin/connectors/[connector]/pages/utils/hooks";
|
||||
import { Formik } from "formik";
|
||||
import NavigationRow from "@/app/admin/connectors/[connector]/NavigationRow";
|
||||
@@ -195,11 +197,13 @@ export default function AddConnector({
|
||||
// Hooks for Google Drive and Gmail credentials
|
||||
const { liveGDriveCredential } = useGoogleDriveCredentials(connector);
|
||||
const { liveGmailCredential } = useGmailCredentials(connector);
|
||||
const { liveBoxCredential } = useBoxCredentials(connector);
|
||||
|
||||
// Check if credential is activated
|
||||
const credentialActivated =
|
||||
(connector === "google_drive" && liveGDriveCredential) ||
|
||||
(connector === "gmail" && liveGmailCredential) ||
|
||||
(connector === "box" && liveBoxCredential) ||
|
||||
currentCredential;
|
||||
|
||||
// Check if there are no credentials
|
||||
@@ -434,7 +438,8 @@ export default function AddConnector({
|
||||
const credential =
|
||||
currentCredential ||
|
||||
liveGDriveCredential ||
|
||||
liveGmailCredential;
|
||||
liveGmailCredential ||
|
||||
liveBoxCredential;
|
||||
const linkCredentialResponse = await linkCredential(
|
||||
response.id,
|
||||
credential?.id!,
|
||||
@@ -516,6 +521,8 @@ export default function AddConnector({
|
||||
|
||||
{connector == ValidSources.Gmail ? (
|
||||
<GmailMain />
|
||||
) : connector == ValidSources.Box ? (
|
||||
<BoxMain />
|
||||
) : (
|
||||
<>
|
||||
<ModifyCredential
|
||||
@@ -638,6 +645,7 @@ export default function AddConnector({
|
||||
currentCredential ||
|
||||
liveGDriveCredential ||
|
||||
liveGmailCredential ||
|
||||
liveBoxCredential ||
|
||||
null
|
||||
}
|
||||
/>
|
||||
|
||||
118
web/src/app/admin/connectors/[connector]/pages/box/BoxPage.tsx
Normal file
118
web/src/app/admin/connectors/[connector]/pages/box/BoxPage.tsx
Normal file
@@ -0,0 +1,118 @@
|
||||
"use client";
|
||||
|
||||
import React from "react";
|
||||
import { ErrorCallout } from "@/components/ErrorCallout";
|
||||
import { LoadingAnimation } from "@/components/Loading";
|
||||
import { usePopup } from "@/components/admin/connectors/Popup";
|
||||
import { CCPairBasicInfo, ValidSources } from "@/lib/types";
|
||||
import { Credential, BoxCredentialJson } from "@/lib/connectors/credentials";
|
||||
import { BoxAuthSection, BoxJsonUploadSection } from "./Credential";
|
||||
import { usePublicCredentials, useBasicConnectorStatus } from "@/lib/hooks";
|
||||
import Title from "@/components/ui/title";
|
||||
import { useUser } from "@/components/user/UserProvider";
|
||||
import useSWR from "swr";
|
||||
import { errorHandlingFetcher } from "@/lib/fetcher";
|
||||
import { buildSimilarCredentialInfoURL } from "@/app/admin/connector/[ccPairId]/lib";
|
||||
|
||||
export const BoxMain = () => {
|
||||
const { isAdmin, user } = useUser();
|
||||
const { popup, setPopup } = usePopup();
|
||||
|
||||
const {
|
||||
data: jwtConfigData,
|
||||
isLoading: isJwtConfigLoading,
|
||||
error: isJwtConfigError,
|
||||
} = useSWR<{ client_id: string; enterprise_id: string }>(
|
||||
"/api/manage/admin/connector/box/jwt-config",
|
||||
errorHandlingFetcher
|
||||
);
|
||||
|
||||
const {
|
||||
data: connectorIndexingStatuses,
|
||||
isLoading: isConnectorIndexingStatusesLoading,
|
||||
error: connectorIndexingStatusesError,
|
||||
} = useBasicConnectorStatus();
|
||||
|
||||
const {
|
||||
data: credentialsData,
|
||||
isLoading: isCredentialsLoading,
|
||||
error: credentialsError,
|
||||
refreshCredentials,
|
||||
} = usePublicCredentials();
|
||||
|
||||
const handleRefresh = () => {
|
||||
refreshCredentials();
|
||||
};
|
||||
|
||||
if (
|
||||
(!jwtConfigData && isJwtConfigLoading && !isJwtConfigError) ||
|
||||
(!connectorIndexingStatuses && isConnectorIndexingStatusesLoading) ||
|
||||
(!credentialsData && isCredentialsLoading)
|
||||
) {
|
||||
return (
|
||||
<div className="mx-auto">
|
||||
<LoadingAnimation text="" />
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
if (isJwtConfigError) {
|
||||
return <ErrorCallout errorTitle="Failed to load Box JWT config." />;
|
||||
}
|
||||
|
||||
if (credentialsError || !credentialsData) {
|
||||
return <ErrorCallout errorTitle="Failed to load credentials." />;
|
||||
}
|
||||
|
||||
if (connectorIndexingStatusesError || !connectorIndexingStatuses) {
|
||||
return <ErrorCallout errorTitle="Failed to load connectors." />;
|
||||
}
|
||||
|
||||
const boxJwtCredential: Credential<BoxCredentialJson> | undefined =
|
||||
credentialsData.find(
|
||||
(credential) =>
|
||||
credential.credential_json?.box_jwt_config &&
|
||||
credential.source === "box"
|
||||
);
|
||||
|
||||
const boxConnectorIndexingStatuses: CCPairBasicInfo[] =
|
||||
connectorIndexingStatuses.filter(
|
||||
(connectorIndexingStatus) => connectorIndexingStatus.source === "box"
|
||||
);
|
||||
|
||||
const connectorExists = boxConnectorIndexingStatuses.length > 0;
|
||||
|
||||
const hasUploadedJwtConfig = Boolean(jwtConfigData?.client_id);
|
||||
|
||||
return (
|
||||
<>
|
||||
{popup}
|
||||
<Title className="mb-2 mt-6 ml-auto mr-auto">
|
||||
Step 1: Provide your Box JWT Config
|
||||
</Title>
|
||||
<BoxJsonUploadSection
|
||||
setPopup={setPopup}
|
||||
jwtConfigData={jwtConfigData}
|
||||
isAdmin={isAdmin}
|
||||
onSuccess={handleRefresh}
|
||||
existingAuthCredential={Boolean(boxJwtCredential)}
|
||||
/>
|
||||
|
||||
{isAdmin && hasUploadedJwtConfig && (
|
||||
<>
|
||||
<Title className="mb-2 mt-6 ml-auto mr-auto">
|
||||
Step 2: Create Credential
|
||||
</Title>
|
||||
<BoxAuthSection
|
||||
setPopup={setPopup}
|
||||
refreshCredentials={handleRefresh}
|
||||
boxJwtCredential={boxJwtCredential}
|
||||
jwtConfigData={jwtConfigData}
|
||||
connectorAssociated={connectorExists}
|
||||
user={user}
|
||||
/>
|
||||
</>
|
||||
)}
|
||||
</>
|
||||
);
|
||||
};
|
||||
@@ -0,0 +1,540 @@
|
||||
"use client";
|
||||
|
||||
import { PopupSpec } from "@/components/admin/connectors/Popup";
|
||||
import React, { useState, useEffect } from "react";
|
||||
import { useSWRConfig } from "swr";
|
||||
import * as Yup from "yup";
|
||||
import { TextFormField, SectionHeader } from "@/components/Field";
|
||||
import { Form, Formik } from "formik";
|
||||
import { User, ValidSources } from "@/lib/types";
|
||||
import Button from "@/refresh-components/buttons/Button";
|
||||
import { Credential, BoxCredentialJson } from "@/lib/connectors/credentials";
|
||||
import { buildSimilarCredentialInfoURL } from "@/app/admin/connector/[ccPairId]/lib";
|
||||
import { FiFile, FiCheck, FiLink, FiAlertTriangle } from "react-icons/fi";
|
||||
import { cn, truncateString } from "@/lib/utils";
|
||||
import { adminDeleteCredential } from "@/lib/credential";
|
||||
import { DOCS_ADMINS_PATH } from "@/lib/constants";
|
||||
|
||||
export const BoxJsonUpload = ({
|
||||
setPopup,
|
||||
onSuccess,
|
||||
}: {
|
||||
setPopup: (popupSpec: PopupSpec | null) => void;
|
||||
onSuccess?: () => void;
|
||||
}) => {
|
||||
const { mutate } = useSWRConfig();
|
||||
const [isUploading, setIsUploading] = useState(false);
|
||||
const [fileName, setFileName] = useState<string | undefined>();
|
||||
const [isDragging, setIsDragging] = useState(false);
|
||||
|
||||
const handleFileUpload = async (file: File) => {
|
||||
setIsUploading(true);
|
||||
setFileName(file.name);
|
||||
|
||||
const reader = new FileReader();
|
||||
reader.onload = async (loadEvent) => {
|
||||
if (!loadEvent?.target?.result) {
|
||||
setIsUploading(false);
|
||||
return;
|
||||
}
|
||||
|
||||
const credentialJsonStr = loadEvent.target.result as string;
|
||||
|
||||
// Validate Box JWT config structure
|
||||
try {
|
||||
const jwtConfigJson = JSON.parse(credentialJsonStr);
|
||||
if (!jwtConfigJson.boxAppSettings) {
|
||||
throw new Error(
|
||||
"Invalid Box JWT config: missing 'boxAppSettings' field"
|
||||
);
|
||||
}
|
||||
if (!jwtConfigJson.boxAppSettings.clientID) {
|
||||
throw new Error(
|
||||
"Invalid Box JWT config: missing 'boxAppSettings.clientID'"
|
||||
);
|
||||
}
|
||||
if (!jwtConfigJson.boxAppSettings.appAuth) {
|
||||
throw new Error(
|
||||
"Invalid Box JWT config: missing 'boxAppSettings.appAuth'"
|
||||
);
|
||||
}
|
||||
} catch (e) {
|
||||
setPopup({
|
||||
message: `Invalid Box JWT config file - ${e}`,
|
||||
type: "error",
|
||||
});
|
||||
setIsUploading(false);
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const response = await fetch(
|
||||
"/api/manage/admin/connector/box/jwt-config",
|
||||
{
|
||||
method: "PUT",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: credentialJsonStr,
|
||||
}
|
||||
);
|
||||
if (response.ok) {
|
||||
setPopup({
|
||||
message: "Successfully uploaded Box JWT config",
|
||||
type: "success",
|
||||
});
|
||||
mutate("/api/manage/admin/connector/box/jwt-config");
|
||||
if (onSuccess) {
|
||||
onSuccess();
|
||||
}
|
||||
} else {
|
||||
const errorMsg = await response.text();
|
||||
setPopup({
|
||||
message: `Failed to upload Box JWT config - ${errorMsg}`,
|
||||
type: "error",
|
||||
});
|
||||
}
|
||||
} catch (error) {
|
||||
setPopup({
|
||||
message: `Failed to upload Box JWT config - ${error}`,
|
||||
type: "error",
|
||||
});
|
||||
} finally {
|
||||
setIsUploading(false);
|
||||
}
|
||||
};
|
||||
|
||||
reader.onerror = () => {
|
||||
setPopup({
|
||||
message: "Failed to read file. Please try again.",
|
||||
type: "error",
|
||||
});
|
||||
setIsUploading(false);
|
||||
};
|
||||
|
||||
reader.onabort = () => {
|
||||
setPopup({
|
||||
message: "File read was aborted. Please try again.",
|
||||
type: "error",
|
||||
});
|
||||
setIsUploading(false);
|
||||
};
|
||||
|
||||
reader.readAsText(file);
|
||||
};
|
||||
|
||||
const handleDragEnter = (e: React.DragEvent<HTMLLabelElement>) => {
|
||||
e.preventDefault();
|
||||
e.stopPropagation();
|
||||
if (!isUploading) {
|
||||
setIsDragging(true);
|
||||
}
|
||||
};
|
||||
|
||||
const handleDragLeave = (e: React.DragEvent<HTMLLabelElement>) => {
|
||||
e.preventDefault();
|
||||
e.stopPropagation();
|
||||
setIsDragging(false);
|
||||
};
|
||||
|
||||
const handleDragOver = (e: React.DragEvent<HTMLLabelElement>) => {
|
||||
e.preventDefault();
|
||||
e.stopPropagation();
|
||||
};
|
||||
|
||||
const handleDrop = (e: React.DragEvent<HTMLLabelElement>) => {
|
||||
e.preventDefault();
|
||||
e.stopPropagation();
|
||||
setIsDragging(false);
|
||||
|
||||
if (isUploading) return;
|
||||
|
||||
const files = e.dataTransfer.files;
|
||||
if (files.length > 0) {
|
||||
const file = files[0];
|
||||
if (
|
||||
file !== undefined &&
|
||||
(file.type === "application/json" || file.name.endsWith(".json"))
|
||||
) {
|
||||
handleFileUpload(file);
|
||||
} else {
|
||||
setPopup({
|
||||
message: "Please upload a JSON file",
|
||||
type: "error",
|
||||
});
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="flex flex-col mt-4">
|
||||
<div className="flex items-center">
|
||||
<div className="relative flex flex-1 items-center">
|
||||
<label
|
||||
className={cn(
|
||||
"flex h-10 items-center justify-center w-full px-4 py-2 border border-dashed rounded-md transition-colors",
|
||||
isUploading
|
||||
? "opacity-70 cursor-not-allowed border-background-400 bg-background-50/30"
|
||||
: isDragging
|
||||
? "bg-background-50/50 border-primary dark:border-primary"
|
||||
: "cursor-pointer hover:bg-background-50/30 hover:border-primary dark:hover:border-primary border-background-300 dark:border-background-600"
|
||||
)}
|
||||
onDragEnter={handleDragEnter}
|
||||
onDragLeave={handleDragLeave}
|
||||
onDragOver={handleDragOver}
|
||||
onDrop={handleDrop}
|
||||
>
|
||||
<div className="flex items-center space-x-2">
|
||||
{isUploading ? (
|
||||
<div className="h-4 w-4 border-t-2 border-b-2 border-primary rounded-full animate-spin"></div>
|
||||
) : (
|
||||
<FiFile className="h-4 w-4 text-text-500" />
|
||||
)}
|
||||
<span className="text-sm text-text-500">
|
||||
{isUploading
|
||||
? `Uploading ${truncateString(fileName || "file", 50)}...`
|
||||
: isDragging
|
||||
? "Drop JSON file here"
|
||||
: truncateString(
|
||||
fileName || "Select or drag Box JWT config file...",
|
||||
50
|
||||
)}
|
||||
</span>
|
||||
</div>
|
||||
<input
|
||||
className="sr-only"
|
||||
type="file"
|
||||
accept=".json"
|
||||
disabled={isUploading}
|
||||
onChange={(event) => {
|
||||
if (!event.target.files?.length) {
|
||||
return;
|
||||
}
|
||||
const file = event.target.files[0];
|
||||
if (file === undefined) {
|
||||
return;
|
||||
}
|
||||
handleFileUpload(file);
|
||||
}}
|
||||
/>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
|
||||
interface BoxJsonUploadSectionProps {
|
||||
setPopup: (popupSpec: PopupSpec | null) => void;
|
||||
jwtConfigData?: { client_id: string; enterprise_id: string };
|
||||
isAdmin: boolean;
|
||||
onSuccess?: () => void;
|
||||
existingAuthCredential?: boolean;
|
||||
}
|
||||
|
||||
export const BoxJsonUploadSection = ({
|
||||
setPopup,
|
||||
jwtConfigData,
|
||||
isAdmin,
|
||||
onSuccess,
|
||||
existingAuthCredential,
|
||||
}: BoxJsonUploadSectionProps) => {
|
||||
const { mutate } = useSWRConfig();
|
||||
const [localJwtConfigData, setLocalJwtConfigData] = useState(jwtConfigData);
|
||||
|
||||
// Update local state when props change
|
||||
useEffect(() => {
|
||||
setLocalJwtConfigData(jwtConfigData);
|
||||
}, [jwtConfigData]);
|
||||
|
||||
const handleSuccess = () => {
|
||||
if (onSuccess) {
|
||||
onSuccess();
|
||||
}
|
||||
};
|
||||
|
||||
if (!isAdmin) {
|
||||
return (
|
||||
<div>
|
||||
<div className="flex items-start py-3 px-4 bg-yellow-50/30 dark:bg-yellow-900/5 rounded">
|
||||
<FiAlertTriangle className="text-yellow-500 h-5 w-5 mr-2 mt-0.5 flex-shrink-0" />
|
||||
<p className="text-sm">
|
||||
Curators are unable to set up the Box credentials. To add a Box
|
||||
connector, please contact an administrator.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
<div>
|
||||
<p className="text-sm mb-3">
|
||||
To connect your Box account, create a Box Platform App with JWT
|
||||
authentication, download the JSON config file, and upload it below.
|
||||
</p>
|
||||
<div className="mb-4">
|
||||
<a
|
||||
className="text-primary hover:text-primary/80 flex items-center gap-1 text-sm"
|
||||
target="_blank"
|
||||
href={`${DOCS_ADMINS_PATH}/connectors/official/box/overview`}
|
||||
rel="noreferrer"
|
||||
>
|
||||
<FiLink className="h-3 w-3" />
|
||||
View detailed setup instructions
|
||||
</a>
|
||||
</div>
|
||||
|
||||
{localJwtConfigData?.client_id && (
|
||||
<div className="mb-4">
|
||||
<div className="relative flex flex-1 items-center">
|
||||
<label
|
||||
className={cn(
|
||||
"flex h-10 items-center justify-center w-full px-4 py-2 border border-dashed rounded-md transition-colors",
|
||||
"cursor-pointer hover:bg-background-50/30 hover:border-primary dark:hover:border-primary border-background-300 dark:border-background-600"
|
||||
)}
|
||||
>
|
||||
<div className="flex items-center space-x-2">
|
||||
<FiFile className="h-4 w-4 text-text-500" />
|
||||
<span className="text-sm text-text-500">
|
||||
{truncateString(
|
||||
`Client ID: ${localJwtConfigData.client_id}`,
|
||||
50
|
||||
)}
|
||||
</span>
|
||||
</div>
|
||||
</label>
|
||||
</div>
|
||||
{isAdmin && !existingAuthCredential && (
|
||||
<div className="mt-2">
|
||||
<Button
|
||||
danger
|
||||
onClick={async () => {
|
||||
try {
|
||||
const response = await fetch(
|
||||
"/api/manage/admin/connector/box/jwt-config",
|
||||
{
|
||||
method: "DELETE",
|
||||
}
|
||||
);
|
||||
|
||||
if (response.ok) {
|
||||
mutate("/api/manage/admin/connector/box/jwt-config");
|
||||
mutate(buildSimilarCredentialInfoURL(ValidSources.Box));
|
||||
|
||||
setPopup({
|
||||
message: "Successfully deleted Box JWT config",
|
||||
type: "success",
|
||||
});
|
||||
setLocalJwtConfigData(undefined);
|
||||
handleSuccess();
|
||||
} else {
|
||||
const errorMsg = await response.text();
|
||||
setPopup({
|
||||
message: `Failed to delete JWT config - ${errorMsg}`,
|
||||
type: "error",
|
||||
});
|
||||
}
|
||||
} catch (error) {
|
||||
setPopup({
|
||||
message: `Failed to delete JWT config - ${error}`,
|
||||
type: "error",
|
||||
});
|
||||
}
|
||||
}}
|
||||
>
|
||||
Delete JWT Config
|
||||
</Button>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{!localJwtConfigData?.client_id && (
|
||||
<BoxJsonUpload setPopup={setPopup} onSuccess={handleSuccess} />
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
};
|
||||
|
||||
// Props for the Box authentication section of the connector admin page.
interface BoxCredentialSectionProps {
  // Existing admin-public Box JWT credential, if one has been created.
  boxJwtCredential?: Credential<BoxCredentialJson>;
  // Uploaded JWT config (step 1); only the ids are surfaced to the UI.
  jwtConfigData?: { client_id: string; enterprise_id: string };
  // Displays a toast-style popup; pass null to clear.
  setPopup: (popupSpec: PopupSpec | null) => void;
  // Re-fetches credential state after create/revoke.
  refreshCredentials: () => void;
  // True while any connector still references the credential (blocks revoke).
  connectorAssociated: boolean;
  // Currently logged-in user (may be null when unauthenticated).
  user: User | null;
}
|
||||
|
||||
async function handleRevokeAccess(
|
||||
connectorAssociated: boolean,
|
||||
setPopup: (popupSpec: PopupSpec | null) => void,
|
||||
existingCredential: Credential<BoxCredentialJson>,
|
||||
refreshCredentials: () => void
|
||||
) {
|
||||
if (connectorAssociated) {
|
||||
const message =
|
||||
"Cannot revoke the Box credential while any connector is still associated with the credential. " +
|
||||
"Please delete all associated connectors, then try again.";
|
||||
setPopup({
|
||||
message: message,
|
||||
type: "error",
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
const response = await adminDeleteCredential(existingCredential.id);
|
||||
if (response.ok) {
|
||||
setPopup({
|
||||
message: "Successfully revoked the Box credential!",
|
||||
type: "success",
|
||||
});
|
||||
refreshCredentials();
|
||||
} else {
|
||||
const errorMsg = await response.text();
|
||||
setPopup({
|
||||
message: `Failed to revoke Box credential - ${errorMsg}`,
|
||||
type: "error",
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
export const BoxAuthSection = ({
|
||||
boxJwtCredential,
|
||||
jwtConfigData,
|
||||
setPopup,
|
||||
refreshCredentials,
|
||||
connectorAssociated,
|
||||
user,
|
||||
}: BoxCredentialSectionProps) => {
|
||||
const [localJwtConfigData, setLocalJwtConfigData] = useState(jwtConfigData);
|
||||
const [localBoxJwtCredential, setLocalBoxJwtCredential] =
|
||||
useState(boxJwtCredential);
|
||||
|
||||
// Update local state when props change
|
||||
useEffect(() => {
|
||||
setLocalJwtConfigData(jwtConfigData);
|
||||
setLocalBoxJwtCredential(boxJwtCredential);
|
||||
}, [jwtConfigData, boxJwtCredential]);
|
||||
|
||||
if (localBoxJwtCredential) {
|
||||
return (
|
||||
<div>
|
||||
<div className="mt-4">
|
||||
<div className="py-3 px-4 bg-blue-50/30 dark:bg-blue-900/5 rounded mb-4 flex items-start">
|
||||
<FiCheck className="text-blue-500 h-5 w-5 mr-2 mt-0.5 flex-shrink-0" />
|
||||
<div className="flex-1">
|
||||
<span className="font-medium block">Authentication Complete</span>
|
||||
<p className="text-sm mt-1 text-text-500 dark:text-text-400 break-words">
|
||||
Your Box JWT credentials have been successfully uploaded and
|
||||
authenticated.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
<Button
|
||||
danger
|
||||
onClick={async () => {
|
||||
handleRevokeAccess(
|
||||
connectorAssociated,
|
||||
setPopup,
|
||||
localBoxJwtCredential,
|
||||
refreshCredentials
|
||||
);
|
||||
}}
|
||||
>
|
||||
Revoke Access
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
// If no JWT config is uploaded, show message to complete step 1 first
|
||||
if (!localJwtConfigData?.client_id) {
|
||||
return (
|
||||
<div>
|
||||
<SectionHeader>Box Authentication</SectionHeader>
|
||||
<div className="mt-4">
|
||||
<div className="flex items-start py-3 px-4 bg-yellow-50/30 dark:bg-yellow-900/5 rounded">
|
||||
<FiAlertTriangle className="text-yellow-500 h-5 w-5 mr-2 mt-0.5 flex-shrink-0" />
|
||||
<p className="text-sm">
|
||||
Please complete Step 1 by uploading the Box JWT config file before
|
||||
proceeding with authentication.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
// If JWT config is uploaded, show form to create credential with user ID
|
||||
return (
|
||||
<div>
|
||||
<div className="mt-4">
|
||||
<Formik
|
||||
initialValues={{
|
||||
box_primary_admin_user_id: "",
|
||||
}}
|
||||
validationSchema={Yup.object().shape({
|
||||
box_primary_admin_user_id: Yup.string().required(
|
||||
"Primary admin user ID is required"
|
||||
),
|
||||
})}
|
||||
onSubmit={async (values, formikHelpers) => {
|
||||
formikHelpers.setSubmitting(true);
|
||||
try {
|
||||
const response = await fetch(
|
||||
"/api/manage/admin/connector/box/jwt-credential",
|
||||
{
|
||||
method: "PUT",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
box_primary_admin_user_id: values.box_primary_admin_user_id,
|
||||
}),
|
||||
}
|
||||
);
|
||||
|
||||
if (response.ok) {
|
||||
setPopup({
|
||||
message: "Successfully created Box JWT credential",
|
||||
type: "success",
|
||||
});
|
||||
refreshCredentials();
|
||||
} else {
|
||||
const errorMsg = await response.text();
|
||||
setPopup({
|
||||
message: `Failed to create Box JWT credential - ${errorMsg}`,
|
||||
type: "error",
|
||||
});
|
||||
}
|
||||
} catch (error) {
|
||||
setPopup({
|
||||
message: `Failed to create Box JWT credential - ${error}`,
|
||||
type: "error",
|
||||
});
|
||||
} finally {
|
||||
formikHelpers.setSubmitting(false);
|
||||
}
|
||||
}}
|
||||
>
|
||||
{({ isSubmitting }) => (
|
||||
<Form>
|
||||
<TextFormField
|
||||
name="box_primary_admin_user_id"
|
||||
label="Primary Admin User ID:"
|
||||
subtext="Enter the Box user ID of an admin/owner that has access to the Box content you want to index. You can find this in the Box Admin Console or by calling the Box API."
|
||||
/>
|
||||
<div className="flex">
|
||||
<Button type="submit" disabled={isSubmitting}>
|
||||
{isSubmitting ? "Creating..." : "Create Credential"}
|
||||
</Button>
|
||||
</div>
|
||||
</Form>
|
||||
)}
|
||||
</Formik>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
@@ -10,6 +10,7 @@ import {
|
||||
GmailServiceAccountCredentialJson,
|
||||
GoogleDriveCredentialJson,
|
||||
GoogleDriveServiceAccountCredentialJson,
|
||||
BoxCredentialJson,
|
||||
} from "@/lib/connectors/credentials";
|
||||
|
||||
export const useGmailCredentials = (connector: string) => {
|
||||
@@ -73,3 +74,19 @@ export const useGoogleDriveCredentials = (connector: string) => {
|
||||
liveGDriveCredential: liveGDriveCredential,
|
||||
};
|
||||
};
|
||||
|
||||
export const useBoxCredentials = (connector: string) => {
|
||||
const { data: credentialsData } = usePublicCredentials();
|
||||
|
||||
const boxJwtCredential: Credential<BoxCredentialJson> | undefined =
|
||||
credentialsData?.find(
|
||||
(credential) =>
|
||||
credential.credential_json?.box_jwt_config &&
|
||||
credential.admin_public &&
|
||||
credential.source === connector
|
||||
);
|
||||
|
||||
return {
|
||||
liveBoxCredential: boxJwtCredential,
|
||||
};
|
||||
};
|
||||
|
||||
@@ -37,6 +37,7 @@ import discordIcon from "@public/discord.png";
|
||||
import discourseIcon from "@public/Discourse.png";
|
||||
import document360Icon from "@public/Document360.png";
|
||||
import dropboxIcon from "@public/Dropbox.png";
|
||||
import boxIcon from "@public/box.png";
|
||||
import drupalwikiIcon from "@public/DrupalWiki.png";
|
||||
import egnyteIcon from "@public/Egnyte.png";
|
||||
import firefliesIcon from "@public/Fireflies.png";
|
||||
@@ -835,6 +836,7 @@ export const DeepseekIcon = createLogoIcon(deepseekSVG);
|
||||
export const DiscourseIcon = createLogoIcon(discourseIcon);
|
||||
export const Document360Icon = createLogoIcon(document360Icon);
|
||||
export const DropboxIcon = createLogoIcon(dropboxIcon);
|
||||
export const BoxIcon = createLogoIcon(boxIcon);
|
||||
export const DrupalWikiIcon = createLogoIcon(drupalwikiIcon);
|
||||
export const EgnyteIcon = createLogoIcon(egnyteIcon);
|
||||
export const FirefliesIcon = createLogoIcon(firefliesIcon);
|
||||
|
||||
@@ -1236,6 +1236,31 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
|
||||
values: [],
|
||||
advanced_values: [],
|
||||
},
|
||||
box: {
|
||||
description: "Configure Box connector",
|
||||
values: [
|
||||
{
|
||||
type: "checkbox",
|
||||
query: "Index all accessible files?",
|
||||
label: "Include All Files",
|
||||
name: "include_all_files",
|
||||
description:
|
||||
"If checked, the connector will index all files accessible to the authenticated user.",
|
||||
optional: false,
|
||||
default: false,
|
||||
},
|
||||
{
|
||||
type: "list",
|
||||
query: "Enter folder IDs or URLs (optional):",
|
||||
label: "Folder IDs",
|
||||
name: "folder_ids",
|
||||
description:
|
||||
"Comma-separated list of Box folder IDs or URLs to index. Leave empty if 'Include All Files' is checked.",
|
||||
optional: true,
|
||||
},
|
||||
],
|
||||
advanced_values: [],
|
||||
},
|
||||
s3: {
|
||||
description: "Configure S3 connector",
|
||||
values: [
|
||||
@@ -1743,6 +1768,26 @@ export function createConnectorValidationSchema(
|
||||
},
|
||||
{} as Record<string, any>
|
||||
),
|
||||
// Box-specific validation: require either include_all_files or folder_ids
|
||||
...(connector === "box"
|
||||
? {
|
||||
folder_ids: Yup.array()
|
||||
.of(Yup.string())
|
||||
.when("include_all_files", {
|
||||
is: false,
|
||||
then: (schema) =>
|
||||
schema
|
||||
.min(
|
||||
1,
|
||||
"At least one folder ID is required when 'Include All Files' is unchecked"
|
||||
)
|
||||
.required(
|
||||
"Folder IDs are required when 'Include All Files' is unchecked"
|
||||
),
|
||||
otherwise: (schema) => schema,
|
||||
}),
|
||||
}
|
||||
: {}),
|
||||
// These are advanced settings
|
||||
indexingStart: Yup.string().nullable(),
|
||||
pruneFreq: Yup.number().min(
|
||||
|
||||
@@ -177,6 +177,20 @@ export interface DropboxCredentialJson {
|
||||
dropbox_access_token: string;
|
||||
}
|
||||
|
||||
// Credential payload for the Box connector. Fields are grouped by the
// authentication style they belong to; only one group is expected to be
// populated on a given credential.
export interface BoxCredentialJson {
  // JWT authentication (new, preferred)
  box_jwt_config?: string; // JSON string of the Box JWT config file
  box_primary_admin_user_id?: string; // Box user ID to impersonate
  authentication_method?: string; // "uploaded" for JWT
  // OAuth flow credentials (legacy, deprecated)
  access_token?: string;
  refresh_token?: string;
  // Legacy credentials (kept for backward compatibility)
  box_access_token?: string;
  box_refresh_token?: string;
  box_user_id?: string;
}
|
||||
|
||||
export interface R2CredentialJson {
|
||||
account_id: string;
|
||||
r2_access_key_id: string;
|
||||
@@ -331,6 +345,10 @@ export const credentialTemplates: Record<ValidSources, any> = {
|
||||
loopio_client_token: "",
|
||||
} as LoopioCredentialJson,
|
||||
dropbox: { dropbox_access_token: "" } as DropboxCredentialJson,
|
||||
box: {
|
||||
access_token: "",
|
||||
refresh_token: "",
|
||||
} as BoxCredentialJson,
|
||||
salesforce: {
|
||||
sf_username: "",
|
||||
sf_password: "",
|
||||
|
||||
@@ -7,6 +7,7 @@ import {
|
||||
DiscourseIcon,
|
||||
Document360Icon,
|
||||
DropboxIcon,
|
||||
BoxIcon,
|
||||
GithubIcon,
|
||||
GitlabIcon,
|
||||
BitbucketIcon,
|
||||
@@ -198,6 +199,12 @@ export const SOURCE_METADATA_MAP: SourceMap = {
|
||||
category: SourceCategory.Storage,
|
||||
docs: `${DOCS_ADMINS_PATH}/connectors/official/dropbox`,
|
||||
},
|
||||
box: {
|
||||
icon: BoxIcon,
|
||||
displayName: "Box",
|
||||
category: SourceCategory.Storage,
|
||||
docs: `${DOCS_ADMINS_PATH}/connectors/official/box`,
|
||||
},
|
||||
s3: {
|
||||
icon: S3Icon,
|
||||
displayName: "S3",
|
||||
|
||||
@@ -504,6 +504,7 @@ export enum ValidSources {
|
||||
DrupalWiki = "drupal_wiki",
|
||||
Imap = "imap",
|
||||
Bitbucket = "bitbucket",
|
||||
Box = "box",
|
||||
TestRail = "testrail",
|
||||
|
||||
// Federated Connectors
|
||||
|
||||
Reference in New Issue
Block a user