Compare commits

...

6 Commits

Author SHA1 Message Date
coreyauger
7d92362c15 review 2026-01-14 20:50:54 +08:00
coreyauger
3dc6389190 more review 2026-01-14 18:01:41 +08:00
coreyauger
a0c1438200 review changes 2026-01-13 14:40:36 +08:00
coreyauger
c65d3741c2 more review changes 2026-01-13 13:44:03 +08:00
coreyauger
1748c0c0e4 review changes 2026-01-13 11:41:17 +08:00
coreyauger
9230997069 feat: box connector 2026-01-12 13:21:53 +08:00
43 changed files with 5732 additions and 1 deletions

View File

@@ -100,6 +100,15 @@ SHAREPOINT_PERMISSION_GROUP_SYNC_FREQUENCY = int(
)
#####
# Box
#####
# Interval between Box permission group syncs, in seconds.
# Overridable via the BOX_PERMISSION_GROUP_SYNC_FREQUENCY env var; default is 30 minutes.
BOX_PERMISSION_GROUP_SYNC_FREQUENCY = int(
    os.environ.get("BOX_PERMISSION_GROUP_SYNC_FREQUENCY") or 30 * 60
)
####
# Celery Job Frequency
####

View File

@@ -0,0 +1 @@
"""Box external permissions module for syncing document permissions."""

View File

@@ -0,0 +1,170 @@
from collections.abc import Generator
from datetime import datetime
from datetime import timezone
from box_sdk_gen.client import BoxClient
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsIdsFunction
from onyx.access.models import DocExternalAccess
from onyx.access.models import ExternalAccess
from onyx.connectors.box.connector import BoxConnector
from onyx.connectors.box.models import BoxFileType
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
from onyx.db.models import ConnectorCredentialPair
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
logger = setup_logger()
def get_external_access_for_raw_box_file(
    file: BoxFileType,
    company_domain: str | None,
    retriever_box_client: BoxClient | None,
    admin_box_client: BoxClient,
) -> ExternalAccess:
    """
    Extract permissions from a Box file's collaborations.

    Box permissions are managed through collaborations, which can be:
    - User collaborations: direct access for specific users (by email)
    - Group collaborations: access for groups (by group ID)
    - Public links: shared links that may be publicly accessible

    Args:
        file: Raw Box file payload; must contain an "id" key. May contain a
            "shared_link" entry (dict or legacy URL string).
        company_domain: NOTE(review): currently unused in this function --
            confirm whether domain-wide access was intended to be derived from it.
        retriever_box_client: Fallback client, only used if admin_box_client is falsy.
        admin_box_client: Preferred client for reading collaborations.

    Returns:
        ExternalAccess with collected user emails, group IDs, and public flag.

    Raises:
        ValueError: If the file payload has no "id".
    """
    file_id = file.get("id")
    if not file_id:
        raise ValueError("No file_id found in file")
    user_emails: set[str] = set()
    group_ids: set[str] = set()
    public = False
    # Use admin client to get permissions (has broader access)
    box_client = admin_box_client or retriever_box_client
    if not box_client:
        # No usable client: degrade to empty, non-public access instead of failing.
        logger.warning(f"No Box client available for file {file_id}")
        return ExternalAccess(
            external_user_emails=set(),
            external_user_group_ids=set(),
            is_public=False,
        )
    try:
        collaborations_response = box_client.collaborations.get_file_collaborations(
            file_id=file_id
        )
        for collaboration in collaborations_response.entries:
            accessible_by = collaboration.accessible_by
            if accessible_by:
                # User collaboration: extract email/login
                # (Box user objects expose the email address as "login")
                if hasattr(accessible_by, "login") and accessible_by.login:
                    user_emails.add(accessible_by.login)
                elif hasattr(accessible_by, "email") and accessible_by.email:
                    user_emails.add(accessible_by.email)
                # Group collaboration: groups have name but no login/email.
                # NOTE(review): relies on group objects lacking a "login"
                # attribute entirely (a user object with login=None still has
                # the attribute) -- verify against the Box SDK models.
                if hasattr(accessible_by, "name") and not hasattr(
                    accessible_by, "login"
                ):
                    if hasattr(accessible_by, "id") and accessible_by.id:
                        group_ids.add(str(accessible_by.id))
            # Public link collaboration: accessible_by is None for public links
            if accessible_by is None:
                if (
                    hasattr(collaboration, "status")
                    and collaboration.status == "accepted"
                    and file.get("shared_link")
                ):
                    public = True
    except Exception as e:
        # Best-effort: a failed collaborations lookup falls through to the
        # shared-link inspection below rather than aborting the sync.
        logger.warning(
            f"Failed to get collaborations for Box file {file_id}: {e}. "
            "Returning minimal access (file owner retains access via retriever user)."
        )
    # Check for shared link (indicates potential public access)
    # Only mark as public if the shared link is actually public (access="open")
    # and not password-protected
    shared_link = file.get("shared_link")
    if shared_link:
        # shared_link can be a string (legacy) or a dict with access/password info
        if isinstance(shared_link, dict):
            access = shared_link.get("access")
            password = shared_link.get("password")
            # Only mark as public if access is "open" and not password-protected
            if access == "open" and not password:
                public = True
        elif isinstance(shared_link, str):
            # Legacy: if it's just a URL string, we can't determine access level
            # Don't assume it's public - only mark as public if we found a public
            # collaboration above
            pass
    return ExternalAccess(
        external_user_emails=user_emails,
        external_user_group_ids=group_ids,
        is_public=public,
    )
def _get_slim_doc_generator(
    cc_pair: ConnectorCredentialPair,
    box_connector: BoxConnector,
    callback: IndexingHeartbeatInterface | None = None,
) -> GenerateSlimDocumentOutput:
    """Build a slim-document generator covering the window since the last perm sync.

    The window starts at the cc_pair's last permission sync (treated as UTC),
    or at epoch zero if a sync has never completed, and ends now.
    """
    if cc_pair.last_time_perm_sync is not None:
        window_start = cc_pair.last_time_perm_sync.replace(
            tzinfo=timezone.utc
        ).timestamp()
    else:
        window_start = 0.0
    window_end = datetime.now(timezone.utc).timestamp()
    return box_connector.retrieve_all_slim_docs_perm_sync(
        start=window_start,
        end=window_end,
        callback=callback,
    )
def box_doc_sync(
    cc_pair: ConnectorCredentialPair,
    fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
    fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
    callback: IndexingHeartbeatInterface | None,
) -> Generator[DocExternalAccess, None, None]:
    """
    Sync Box file permissions to documents in the database.

    Retrieves slim documents from Box and extracts their permissions,
    yielding DocExternalAccess objects for each document with permissions.
    If a document doesn't exist yet, permissions are pre-populated
    so they're available when the document is created.
    """
    connector = BoxConnector(**cc_pair.connector.connector_specific_config)
    connector.load_credentials(cc_pair.credential.credential_json)
    batch_generator = _get_slim_doc_generator(cc_pair, connector, callback=callback)
    for batch in batch_generator:
        for slim_doc in batch:
            # Honor external stop requests and report liveness per document.
            if callback is not None:
                if callback.should_stop():
                    raise RuntimeError("box_doc_sync: Stop signal detected")
                callback.progress("box_doc_sync", 1)
            external_access = slim_doc.external_access
            if external_access is None:
                logger.warning(f"No permissions found for document {slim_doc.id}")
                continue
            yield DocExternalAccess(
                doc_id=slim_doc.id,
                external_access=external_access,
            )

View File

@@ -0,0 +1,84 @@
from collections.abc import Generator
from ee.onyx.db.external_perm import ExternalUserGroup
from onyx.connectors.box.connector import BoxConnector
from onyx.db.models import ConnectorCredentialPair
from onyx.utils.logger import setup_logger
logger = setup_logger()
def box_group_sync(
    tenant_id: str,
    cc_pair: ConnectorCredentialPair,
) -> Generator[ExternalUserGroup, None, None]:
    """
    Sync Box groups and their members.

    Fetches all groups from Box and yields an ExternalUserGroup per group,
    containing the group ID and member emails.

    Fix: both the group listing and the membership listing are paged Box
    endpoints (bounded by `limit`, default 100); the previous implementation
    read only the first page and silently dropped the rest. Both listings are
    now walked with offset-based pagination.

    Args:
        tenant_id: Tenant identifier (unused here; part of the group-sync
            function interface).
        cc_pair: Connector/credential pair with Box config and credentials.

    Raises:
        Exception: Re-raises any error from the top-level group listing;
            per-group membership errors are logged and skipped.
    """
    # Create Box connector and load credentials
    box_connector = BoxConnector(**cc_pair.connector.connector_specific_config)
    box_connector.load_credentials(cc_pair.credential.credential_json)
    box_client = box_connector.box_client
    logger.info("Starting Box group sync...")
    # Page size for both paged endpoints.
    page_limit = 100
    try:
        # Get all groups in the enterprise
        # Box API: GET /groups (paged via limit/offset)
        group_offset = 0
        while True:
            groups_response = box_client.groups.get_groups(
                limit=page_limit, offset=group_offset
            )
            group_entries = groups_response.entries or []
            for group in group_entries:
                group_id = str(group.id)
                group_name = getattr(group, "name", None) or f"Group_{group_id}"
                logger.debug(f"Processing Box group: {group_name} (ID: {group_id})")
                # Get members of this group
                # Box API: GET /groups/{group_id}/memberships (paged via limit/offset)
                try:
                    user_emails: set[str] = set()
                    membership_offset = 0
                    while True:
                        memberships_response = box_client.groups.get_group_memberships(
                            group_id=group_id,
                            limit=page_limit,
                            offset=membership_offset,
                        )
                        membership_entries = memberships_response.entries or []
                        for membership in membership_entries:
                            user = getattr(membership, "user", None)
                            if user:
                                # Extract email from user object
                                email = getattr(user, "login", None) or getattr(
                                    user, "email", None
                                )
                                if email:
                                    user_emails.add(email)
                                else:
                                    logger.warning(
                                        f"Group member {getattr(user, 'id', 'unknown')} "
                                        f"has no email/login in group {group_name}"
                                    )
                        membership_offset += len(membership_entries)
                        membership_total = getattr(
                            memberships_response, "total_count", None
                        )
                        # Stop when a page comes back empty or we have
                        # consumed total_count entries.
                        if not membership_entries or (
                            membership_total is not None
                            and membership_offset >= membership_total
                        ):
                            break
                    if user_emails:
                        logger.info(
                            f"Found {len(user_emails)} members in Box group {group_name}"
                        )
                        yield ExternalUserGroup(
                            id=group_id,
                            user_emails=list(user_emails),
                        )
                    else:
                        logger.warning(
                            f"Box group {group_name} (ID: {group_id}) has no members with emails"
                        )
                except Exception as e:
                    logger.error(
                        f"Error fetching members for Box group {group_name} (ID: {group_id}): {e}"
                    )
                    # Continue with other groups even if one fails
            group_offset += len(group_entries)
            group_total = getattr(groups_response, "total_count", None)
            if not group_entries or (
                group_total is not None and group_offset >= group_total
            ):
                break
    except Exception as e:
        logger.error(f"Error during Box group sync: {e}")
        raise

View File

@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING
from pydantic import BaseModel
from ee.onyx.configs.app_configs import BOX_PERMISSION_GROUP_SYNC_FREQUENCY
from ee.onyx.configs.app_configs import CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY
from ee.onyx.configs.app_configs import CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY
from ee.onyx.configs.app_configs import DEFAULT_PERMISSION_DOC_SYNC_FREQUENCY
@@ -16,6 +17,8 @@ from ee.onyx.configs.app_configs import SHAREPOINT_PERMISSION_DOC_SYNC_FREQUENCY
from ee.onyx.configs.app_configs import SHAREPOINT_PERMISSION_GROUP_SYNC_FREQUENCY
from ee.onyx.configs.app_configs import SLACK_PERMISSION_DOC_SYNC_FREQUENCY
from ee.onyx.configs.app_configs import TEAMS_PERMISSION_DOC_SYNC_FREQUENCY
from ee.onyx.external_permissions.box.doc_sync import box_doc_sync
from ee.onyx.external_permissions.box.group_sync import box_group_sync
from ee.onyx.external_permissions.confluence.doc_sync import confluence_doc_sync
from ee.onyx.external_permissions.confluence.group_sync import confluence_group_sync
from ee.onyx.external_permissions.github.doc_sync import github_doc_sync
@@ -134,6 +137,18 @@ _SOURCE_TO_SYNC_CONFIG: dict[DocumentSource, SyncConfig] = {
initial_index_should_sync=False,
),
),
DocumentSource.BOX: SyncConfig(
doc_sync_config=DocSyncConfig(
doc_sync_frequency=DEFAULT_PERMISSION_DOC_SYNC_FREQUENCY,
doc_sync_func=box_doc_sync,
initial_index_should_sync=True,
),
group_sync_config=GroupSyncConfig(
group_sync_frequency=BOX_PERMISSION_GROUP_SYNC_FREQUENCY,
group_sync_func=box_group_sync,
group_sync_is_cc_pair_agnostic=False,
),
),
DocumentSource.GITHUB: SyncConfig(
doc_sync_config=DocSyncConfig(
doc_sync_frequency=GITHUB_PERMISSION_DOC_SYNC_FREQUENCY,

View File

@@ -570,6 +570,9 @@ EGNYTE_CLIENT_SECRET = os.getenv("EGNYTE_CLIENT_SECRET")
LINEAR_CLIENT_ID = os.getenv("LINEAR_CLIENT_ID")
LINEAR_CLIENT_SECRET = os.getenv("LINEAR_CLIENT_SECRET")
# Box specific configs
BOX_DEVELOPER_TOKEN = os.getenv("BOX_DEVELOPER_TOKEN")
# Slack specific configs
SLACK_NUM_THREADS = int(os.getenv("SLACK_NUM_THREADS") or 8)
MAX_SLACK_QUERY_EXPANSIONS = int(os.environ.get("MAX_SLACK_QUERY_EXPANSIONS", "5"))

View File

@@ -103,6 +103,7 @@ KV_GMAIL_CRED_KEY = "gmail_app_credential"
KV_GMAIL_SERVICE_ACCOUNT_KEY = "gmail_service_account_key"
KV_GOOGLE_DRIVE_CRED_KEY = "google_drive_app_credential"
KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY = "google_drive_service_account_key"
KV_BOX_JWT_CONFIG = "box_jwt_config"
KV_GEN_AI_KEY_CHECK_TIME = "genai_api_key_last_check_time"
KV_SETTINGS_KEY = "onyx_settings"
KV_CUSTOMER_UUID_KEY = "customer_uuid"
@@ -210,6 +211,7 @@ class DocumentSource(str, Enum):
AIRTABLE = "airtable"
HIGHSPOT = "highspot"
DRUPAL_WIKI = "drupal_wiki"
BOX = "box"
IMAP = "imap"
BITBUCKET = "bitbucket"
@@ -631,6 +633,7 @@ project management, and collaboration tools into a single, customizable platform
DocumentSource.AIRTABLE: "airtable - database",
DocumentSource.HIGHSPOT: "highspot - CRM data",
DocumentSource.DRUPAL_WIKI: "drupal wiki - knowledge base content (pages, spaces, attachments)",
DocumentSource.BOX: "box - files and folders",
DocumentSource.IMAP: "imap - email data",
DocumentSource.TESTRAIL: "testrail - test case management tool for QA processes",
}

View File

View File

@@ -0,0 +1,98 @@
import json
from typing import Any
from pydantic import BaseModel
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import KV_BOX_JWT_CONFIG
from onyx.key_value_store.factory import get_kv_store
from onyx.key_value_store.interface import KvKeyNotFoundError
from onyx.server.documents.models import CredentialBase
from onyx.utils.logger import setup_logger
logger = setup_logger()
# Key for Box JWT config in credentials dict (legacy embedded-config location)
DB_CREDENTIALS_DICT_BOX_JWT_CONFIG = "box_jwt_config"
# Key for primary admin user ID in credentials dict (used for impersonation)
DB_CREDENTIALS_PRIMARY_ADMIN_USER_ID = "box_primary_admin_user_id"
# Key recording how the credential was provided
DB_CREDENTIALS_AUTHENTICATION_METHOD = "authentication_method"
# Marker value: the JWT config JSON was uploaded (stored in the KV store)
BOX_AUTHENTICATION_METHOD_UPLOADED = "uploaded"
class BoxJWTConfig(BaseModel):
    """Box JWT configuration, mirroring the JSON config file downloaded from
    the Box developer console."""

    # Raw "boxAppSettings" section of the Box config JSON (camelCase keys
    # preserved to match the file format).
    boxAppSettings: dict[str, Any]
    # Absent for user-scoped Box apps.
    enterpriseID: str | None = None

    @property
    def client_id(self) -> str:
        """OAuth client ID from the app settings."""
        return self.boxAppSettings["clientID"]

    @property
    def client_secret(self) -> str:
        """OAuth client secret from the app settings."""
        return self.boxAppSettings["clientSecret"]

    @property
    def private_key(self) -> str:
        """JWT signing key from the appAuth section."""
        app_auth = self.boxAppSettings["appAuth"]
        return app_auth["privateKey"]

    @property
    def passphrase(self) -> str | None:
        """Optional passphrase protecting the private key (None if absent)."""
        app_auth = self.boxAppSettings["appAuth"]
        return app_auth.get("passphrase")

    @property
    def public_key_id(self) -> str:
        """Identifier of the public key registered with Box."""
        app_auth = self.boxAppSettings["appAuth"]
        return app_auth["publicKeyID"]
def get_box_jwt_config() -> BoxJWTConfig:
    """Load and parse the Box JWT config from the KV store.

    Returns:
        The parsed BoxJWTConfig.

    Raises:
        KvKeyNotFoundError: If no Box JWT config has been uploaded yet.
    """
    try:
        creds_str = str(get_kv_store().load(KV_BOX_JWT_CONFIG))
    except KvKeyNotFoundError as e:
        # Fix: chain the original exception (`from e`) so the traceback shows
        # a proper cause instead of a nested "during handling of the above
        # exception" report. Parsing happens outside the try so only the KV
        # lookup can trigger this handler.
        raise KvKeyNotFoundError("Box JWT config not found in KV store") from e
    return BoxJWTConfig(**json.loads(creds_str))
def upsert_box_jwt_config(jwt_config: BoxJWTConfig) -> None:
    """Persist the Box JWT config to the KV store, encrypted at rest."""
    serialized = jwt_config.model_dump_json()
    kv_store = get_kv_store()
    kv_store.store(KV_BOX_JWT_CONFIG, serialized, encrypt=True)
def delete_box_jwt_config() -> None:
    """Delete Box JWT config from KV store.

    Propagates whatever the KV store raises for a missing key (presumably
    KvKeyNotFoundError -- confirm against the KV interface).
    """
    get_kv_store().delete(KV_BOX_JWT_CONFIG)
def build_box_jwt_creds(
    primary_admin_user_id: str | None = None,
    name: str | None = None,
) -> CredentialBase:
    """Build CredentialBase from Box JWT config stored in KV store.

    Note: JWT config (including private key) is stored encrypted in KV store,
    not in credential_json to avoid duplicating sensitive data in admin_public credentials.
    """
    # The credential payload deliberately omits the JWT config itself; the
    # connector loads it from the encrypted KV store when needed.
    credential_dict: dict[str, Any] = {}
    if primary_admin_user_id:
        credential_dict[DB_CREDENTIALS_PRIMARY_ADMIN_USER_ID] = primary_admin_user_id
    credential_dict[DB_CREDENTIALS_AUTHENTICATION_METHOD] = (
        BOX_AUTHENTICATION_METHOD_UPLOADED
    )
    display_name = name if name else "Box JWT (uploaded)"
    return CredentialBase(
        credential_json=credential_dict,
        admin_public=True,
        source=DocumentSource.BOX,
        name=display_name,
    )

View File

@@ -0,0 +1,706 @@
import copy
import re
from collections.abc import Iterator
from datetime import datetime
from functools import partial
from typing import Any
from urllib.parse import urlparse

from box_sdk_gen import BoxClient
from box_sdk_gen import BoxJWTAuth
from box_sdk_gen import JWTConfig
from box_sdk_gen.box import BoxAPIError
from box_sdk_gen.box import BoxDeveloperTokenAuth
from typing_extensions import override

from onyx.configs.app_configs import BOX_DEVELOPER_TOKEN
from onyx.configs.app_configs import GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD
from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.connectors.box.box_kv import DB_CREDENTIALS_DICT_BOX_JWT_CONFIG
from onyx.connectors.box.box_kv import DB_CREDENTIALS_PRIMARY_ADMIN_USER_ID
from onyx.connectors.box.doc_conversion import build_slim_document
from onyx.connectors.box.doc_conversion import convert_box_item_to_document
from onyx.connectors.box.doc_conversion import onyx_document_id_from_box_file
from onyx.connectors.box.doc_conversion import PermissionSyncContext
from onyx.connectors.box.file_retrieval import crawl_folders_for_files
from onyx.connectors.box.file_retrieval import get_all_files_in_folder
from onyx.connectors.box.models import BoxCheckpoint
from onyx.connectors.box.models import BoxRetrievalStage
from onyx.connectors.box.models import RetrievedBoxFile
from onyx.connectors.box.models import StageCompletion
from onyx.connectors.exceptions import ConnectorValidationError
from onyx.connectors.exceptions import CredentialExpiredError
from onyx.connectors.exceptions import InsufficientPermissionsError
from onyx.connectors.interfaces import CheckpointedConnectorWithPermSync
from onyx.connectors.interfaces import CheckpointOutput
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
from onyx.connectors.interfaces import NormalizationResult
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.interfaces import SlimConnectorWithPermSync
from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import EntityFailure
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
from onyx.utils.threadpool_concurrency import ThreadSafeDict
logger = setup_logger()
def _sanitize_error_message(error: Exception) -> str:
"""Sanitize error message to avoid leaking sensitive data (URLs, tokens, etc.)."""
import re
error_str = str(error)
# Remove URLs
error_str = re.sub(r"https?://[^\s]+", "[URL_REDACTED]", error_str)
# Remove potential tokens (long alphanumeric strings)
error_str = re.sub(r"\b[a-zA-Z0-9]{32,}\b", "[TOKEN_REDACTED]", error_str)
return error_str
def _parse_box_datetime_to_timestamp(modified_time_str: str | None) -> float | None:
"""Parse Box datetime string to Unix timestamp."""
if not modified_time_str:
return None
try:
mod_dt = datetime.fromisoformat(modified_time_str.replace("Z", "+00:00"))
return mod_dt.timestamp()
except (ValueError, AttributeError):
return None
def _extract_str_list_from_comma_str(string: str | None) -> list[str]:
"""Extract list of strings from comma-separated string."""
if not string:
return []
return [s.strip() for s in string.split(",") if s.strip()]
def _extract_ids_from_urls(urls: list[str]) -> list[str]:
"""Extract Box folder/file IDs from URLs."""
ids = []
for url in urls:
parsed = urlparse(url)
# Box URLs can be: https://app.box.com/folder/123456789
# or https://app.box.com/file/123456789
path_parts = parsed.path.strip("/").split("/")
if len(path_parts) >= 2:
ids.append(path_parts[-1])
return ids
class BoxConnector(
SlimConnectorWithPermSync,
CheckpointedConnectorWithPermSync[BoxCheckpoint],
):
def __init__(
    self,
    include_all_files: bool = False,
    folder_ids: str | list[str] | None = None,
    batch_size: int = INDEX_BATCH_SIZE,
) -> None:
    """Initialize the Box connector.

    Args:
        include_all_files: If True, index everything starting from the Box
            root folder (ID "0").
        folder_ids: Comma-separated string (or list) of Box folder IDs
            and/or Box web URLs to index. Required if include_all_files is
            False.
        batch_size: Requested indexing batch size.

    Raises:
        ConnectorValidationError: If neither include_all_files nor
            folder_ids is provided (nothing to index).
    """
    if not include_all_files and not folder_ids:
        raise ConnectorValidationError(
            "Nothing to index. Please specify either 'include_all_files=True' "
            "or provide 'folder_ids' (comma-separated list of folder IDs or URLs)."
        )
    self.include_all_files = include_all_files
    # Fix: batch_size was previously accepted but silently dropped; keep it
    # on the instance so batching logic / callers can read it.
    self.batch_size = batch_size
    # Handle both string and list inputs (frontend may send list)
    if isinstance(folder_ids, list):
        # Convert list to comma-separated string
        folder_ids_str = ",".join(str(fid).strip() for fid in folder_ids if fid)
    else:
        folder_ids_str = folder_ids or ""
    folder_id_list = _extract_str_list_from_comma_str(folder_ids_str)
    # Extract folder IDs from URLs if provided, otherwise use items as-is
    extracted_ids = []
    for item in folder_id_list:
        if item.startswith("http://") or item.startswith("https://"):
            extracted_ids.extend(_extract_ids_from_urls([item]))
        else:
            extracted_ids.append(item)
    self._requested_folder_ids = set(extracted_ids)
    self._box_client: BoxClient | None = None
    self._user_id: str | None = None
    self._creds_dict: dict[str, Any] | None = None
    # IDs of folders that have been traversed
    self._retrieved_folder_ids: set[str] = set()
    self.allow_images = False
    # NOTE(review): reuses the Google Drive size threshold config for Box --
    # consider introducing a Box-specific setting.
    self.size_threshold = GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD
def set_allow_images(self, value: bool) -> None:
    """Toggle whether image files should be processed during indexing."""
    self.allow_images = value
@property
def box_client(self) -> BoxClient:
    """Authenticated Box client; only valid after load_credentials()."""
    client = self._box_client
    if client is not None:
        return client
    raise RuntimeError(
        "Box client missing, "
        "should not call this property "
        "before calling load_credentials"
    )
@property
def user_id(self) -> str:
    """ID of the acting Box user; only valid after load_credentials()."""
    uid = self._user_id
    if uid is not None:
        return uid
    raise RuntimeError(
        "User ID missing, "
        "should not call this property "
        "before calling load_credentials"
    )
@classmethod
@override
def normalize_url(cls, url: str) -> NormalizationResult:
    """Normalize a Box URL to match the canonical Document.id format.

    Non-Box hosts and URLs without an item ID yield no normalization.
    NOTE(review): every matching URL is normalized to a /file/ URL even when
    the original path was /folder/ -- confirm Document.id is always
    file-based.
    """
    parsed = urlparse(url)
    host = parsed.netloc.lower()
    is_box_host = host.startswith("app.box.com") or host.startswith("box.com")
    if not is_box_host:
        return NormalizationResult(normalized_url=None, use_default=False)
    # The item ID is the final path segment (e.g. /file/123 or /folder/123).
    segments = parsed.path.strip("/").split("/")
    if len(segments) < 2:
        return NormalizationResult(normalized_url=None, use_default=False)
    return NormalizationResult(
        normalized_url=f"https://app.box.com/file/{segments[-1]}",
        use_default=False,
    )
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, str] | None:
    """Load Box credentials and initialize client.

    Auth paths, in priority order:
    1. BOX_DEVELOPER_TOKEN env var (testing only)
    2. JWT config, either embedded in `credentials` (legacy) or loaded from
       the KV store (preferred), optionally impersonating a primary admin user
    3. "box_developer_token" inside `credentials` (testing/backward compat)

    Returns:
        None (credentials are stored on the instance, not transformed).

    Raises:
        ConnectorValidationError: If no usable credentials are found or the
            JWT config cannot be parsed.
    """
    # Check if BOX_DEVELOPER_TOKEN is set (for TESTING only)
    if BOX_DEVELOPER_TOKEN:
        logger.info("Using BOX_DEVELOPER_TOKEN for authentication (TESTING ONLY)")
        auth = BoxDeveloperTokenAuth(token=BOX_DEVELOPER_TOKEN)
        self._box_client = BoxClient(auth=auth)
        try:
            current_user = self._box_client.users.get_user_me()
            self._user_id = current_user.id
        except Exception as e:
            # Fall back to a caller-provided id (or "me") if the lookup fails.
            logger.warning(f"Could not get current user info: {e}")
            self._user_id = credentials.get("box_user_id", "me")
        self._creds_dict = credentials
        return None
    # Support JWT authentication from uploaded config
    # JWT config may be in credentials (legacy) or loaded from KV store (preferred)
    jwt_config_json_str = credentials.get(DB_CREDENTIALS_DICT_BOX_JWT_CONFIG)
    if not jwt_config_json_str:
        # Try loading from KV store (preferred method - avoids duplicating sensitive data)
        try:
            from onyx.connectors.box.box_kv import get_box_jwt_config

            jwt_config_obj = get_box_jwt_config()
            jwt_config_json_str = jwt_config_obj.model_dump_json()
            logger.info("Loaded Box JWT config from KV store")
        except Exception:
            # If not in KV store either, continue to error below
            pass
    if jwt_config_json_str:
        logger.info("Using JWT authentication")
        # Get primary admin user ID for impersonation
        primary_admin_user_id = credentials.get(
            DB_CREDENTIALS_PRIMARY_ADMIN_USER_ID
        )
        # Create BoxJWTAuth from config json string
        try:
            jwt_config = JWTConfig.from_config_json_string(jwt_config_json_str)
            auth = BoxJWTAuth(config=jwt_config)
            logger.info("Box JWT config loaded successfully")
        except Exception as e:
            # Sanitize error message to avoid leaking sensitive data
            sanitized_error = _sanitize_error_message(e)
            logger.error(f"Failed to initialize Box BoxJWTAuth: {sanitized_error}")
            raise ConnectorValidationError(
                f"Failed to initialize Box JWT authentication: {sanitized_error}"
            )
        # If primary admin user ID is provided, use it for impersonation
        if primary_admin_user_id:
            logger.info(
                f"Using user impersonation with primary_admin_user_id: {primary_admin_user_id}"
            )
            user_auth = auth.with_user_subject(primary_admin_user_id)
            self._box_client = BoxClient(auth=user_auth)
            self._user_id = primary_admin_user_id
        else:
            # Use service account as user
            logger.info("Using Box service account (no user impersonation)")
            self._user_id = "me"
            self._box_client = BoxClient(auth=auth)
        # Verify authentication by getting user info; on success this also
        # replaces the provisional user id with the real authenticated id.
        try:
            current_user = self._box_client.users.get_user_me()
            logger.info(
                f"Box JWT authentication successful. Authenticated as user: {current_user.id} "
                f"(name: {current_user.name}, login: {getattr(current_user, 'login', 'N/A')})"
            )
            self._user_id = current_user.id
        except Exception as e:
            logger.warning(
                f"Could not get current user info: {e}. "
                f"Using user_id: {self._user_id}"
            )
            # Keep the user_id we set above
    elif "box_developer_token" in credentials:
        # Developer token authentication (for testing/backward compatibility)
        logger.info("Using developer token from credentials (TESTING ONLY)")
        auth = BoxDeveloperTokenAuth(token=credentials["box_developer_token"])
        self._box_client = BoxClient(auth=auth)
        self._user_id = credentials.get("box_user_id", "me")
    else:
        raise ConnectorValidationError(
            "Box credentials missing. Need either JWT config (box_jwt_config) "
            "or box_developer_token in credentials. "
            "Please upload JWT config JSON file via the UI."
        )
    self._creds_dict = credentials
    return None
def _update_traversed_folder_ids(self, folder_id: str) -> None:
    """Mark a folder as traversed (callback passed to the recursive crawl)."""
    self._retrieved_folder_ids.add(folder_id)
def _fetch_box_items(
    self,
    checkpoint: BoxCheckpoint,
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
) -> Iterator[RetrievedBoxFile]:
    """Fetch Box files based on checkpoint state.

    Yields files one at a time. When the underlying folder listing emits a
    pagination marker (a str), the marker is recorded on the per-user
    StageCompletion and the generator returns early so the caller can
    persist the checkpoint and resume later from that marker.

    NOTE(review): on a marker resume the window lower bound becomes
    completion.completed_until when positive -- confirm this cannot skip
    files whose modified_at is earlier than the last yielded file's.
    """
    # First call: move from START to FOLDER_FILES and seed the per-user
    # completion record.
    if checkpoint.completion_stage == BoxRetrievalStage.START:
        checkpoint.completion_stage = BoxRetrievalStage.FOLDER_FILES
        checkpoint.completion_map[self.user_id] = StageCompletion(
            stage=BoxRetrievalStage.START,
            completed_until=0,
            current_folder_id=None,
        )
    completion = checkpoint.completion_map.get(self.user_id)
    if not completion:
        # Defensive: older/foreign checkpoints may lack this user's entry.
        completion = StageCompletion(
            stage=BoxRetrievalStage.START,
            completed_until=0,
            current_folder_id=None,
        )
        checkpoint.completion_map[self.user_id] = completion
    # Determine which folders to process
    if checkpoint.completion_stage == BoxRetrievalStage.FOLDER_FILES:
        if checkpoint.folder_ids_to_retrieve is None:
            if self.include_all_files:
                # Start from root folder (ID "0")
                checkpoint.folder_ids_to_retrieve = ["0"]
                logger.info("include_all_files=True, starting from root folder '0'")
            else:
                # Sorted for deterministic ordering across resumes.
                checkpoint.folder_ids_to_retrieve = sorted(
                    self._requested_folder_ids
                )
                logger.info(
                    f"Processing specific folders: {checkpoint.folder_ids_to_retrieve}"
                )
        folder_ids = checkpoint.folder_ids_to_retrieve
    else:
        folder_ids = checkpoint.folder_ids_to_retrieve or []
    logger.info(f"Processing {len(folder_ids)} folder(s): {folder_ids}")
    # Process folders
    for folder_id in folder_ids:
        # Skip folders already fully handled (this run or restored from checkpoint).
        if folder_id in self._retrieved_folder_ids:
            continue
        # Resume from checkpoint if needed
        if completion.current_folder_id == folder_id and completion.next_marker:
            # Resume from marker - continue processing direct files in folder
            for file_or_marker in get_all_files_in_folder(
                client=self.box_client,
                folder_id=folder_id,
                user_id=self.user_id,
                start=(
                    completion.completed_until
                    if completion.completed_until > 0
                    else start
                ),
                end=end,
                marker=completion.next_marker,
            ):
                if isinstance(file_or_marker, str):
                    # This is a marker for next page
                    completion.next_marker = file_or_marker
                    return  # Checkpoint and resume later
                yield file_or_marker
                # Update completion timestamp
                modified_time = file_or_marker.box_file.get("modified_at")
                timestamp = _parse_box_datetime_to_timestamp(modified_time)
                if timestamp is not None:
                    completion.completed_until = timestamp
            # After resuming direct files, also recurse into subfolders
            # (This ensures we don't skip nested content after pagination resume)
            logger.info(
                f"Resuming recursive crawl of subfolders in folder {folder_id}"
            )
            subfolder_files = 0
            for retrieved_file in crawl_folders_for_files(
                client=self.box_client,
                parent_id=folder_id,
                user_id=self.user_id,
                traversed_parent_ids=self._retrieved_folder_ids,
                update_traversed_ids_func=self._update_traversed_folder_ids,
                start=start,
                end=end,
            ):
                subfolder_files += 1
                yield retrieved_file
            logger.info(
                f"Found {subfolder_files} files in subfolders of folder {folder_id} (resumed)"
            )
        else:
            # Start fresh folder crawl
            logger.info(f"Starting fresh crawl of folder {folder_id}")
            completion.current_folder_id = folder_id
            completion.completed_until = 0
            completion.next_marker = None
            files_in_folder = 0
            for file_or_marker in get_all_files_in_folder(
                client=self.box_client,
                folder_id=folder_id,
                user_id=self.user_id,
                start=start,
                end=end,
            ):
                if isinstance(file_or_marker, str):
                    # This is a marker for next page
                    logger.debug(
                        f"Received pagination marker for folder {folder_id}: {file_or_marker}"
                    )
                    completion.next_marker = file_or_marker
                    return  # Checkpoint and resume later
                files_in_folder += 1
                yield file_or_marker
                # Update completion timestamp
                modified_time = file_or_marker.box_file.get("modified_at")
                timestamp = _parse_box_datetime_to_timestamp(modified_time)
                if timestamp is not None:
                    completion.completed_until = timestamp
            logger.info(
                f"Found {files_in_folder} files directly in folder {folder_id}"
            )
            # Also crawl subfolders recursively
            logger.info(
                f"Starting recursive crawl of subfolders in folder {folder_id}"
            )
            subfolder_files = 0
            for retrieved_file in crawl_folders_for_files(
                client=self.box_client,
                parent_id=folder_id,
                user_id=self.user_id,
                traversed_parent_ids=self._retrieved_folder_ids,
                update_traversed_ids_func=self._update_traversed_folder_ids,
                start=start,
                end=end,
            ):
                subfolder_files += 1
                yield retrieved_file
            logger.info(
                f"Found {subfolder_files} files in subfolders of folder {folder_id}"
            )
        # Mark folder as processed
        self._retrieved_folder_ids.add(folder_id)
        completion.current_folder_id = None
        completion.next_marker = None
    checkpoint.completion_stage = BoxRetrievalStage.DONE
def _extract_docs_from_box(
    self,
    checkpoint: BoxCheckpoint,
    start: SecondsSinceUnixEpoch | None,
    end: SecondsSinceUnixEpoch | None,
    include_permissions: bool,
) -> Iterator[Document | ConnectorFailure]:
    """Retrieve and convert Box files to documents.

    Drives _fetch_box_items and converts each raw Box file payload into a
    Document (or a ConnectorFailure for per-file errors). Deduplicates by
    canonical document id using checkpoint.all_retrieved_file_ids.

    Args:
        checkpoint: Mutable checkpoint (dedup set + traversal state).
        start/end: Optional modified-time window (seconds since epoch).
        include_permissions: When True, a PermissionSyncContext is passed to
            the conversion function so external access is attached.
    """
    try:
        # Prepare conversion function
        permission_sync_context = (
            PermissionSyncContext(
                primary_user_id=self.user_id,
                box_domain=None,  # Box uses user emails directly, not domain-based access
            )
            if include_permissions
            else None
        )
        convert_func = partial(
            convert_box_item_to_document,
            self.box_client,
            self.allow_images,
            self.size_threshold,
            permission_sync_context,
            self.user_id,
        )
        # Fetch files
        logger.info(
            f"Starting to fetch Box items for user_id: {self.user_id} "
            f"(include_permissions: {include_permissions})"
        )
        files_fetched = 0
        files_converted = 0
        files_skipped = 0
        files_failed = 0
        for retrieved_file in self._fetch_box_items(
            checkpoint=checkpoint,
            start=start,
            end=end,
        ):
            files_fetched += 1
            # Retrieval-level error: surface as a ConnectorFailure and move on.
            if retrieved_file.error is not None:
                failure_stage = retrieved_file.completion_stage.value
                sanitized_error = _sanitize_error_message(retrieved_file.error)
                failure_message = (
                    f"retrieval failure during stage: {failure_stage}, "
                    f"user: {retrieved_file.user_id}, "
                    f"parent folder: {retrieved_file.parent_id}, "
                    f"error: {sanitized_error}"
                )
                logger.error(failure_message)
                yield ConnectorFailure(
                    failed_entity=EntityFailure(entity_id=failure_stage),
                    failure_message=failure_message,
                    exception=retrieved_file.error,
                )
                continue
            box_file = retrieved_file.box_file
            if not box_file:
                continue
            try:
                document_id = onyx_document_id_from_box_file(box_file)
            except KeyError:
                # Payload without an "id" cannot be indexed.
                logger.warning(
                    f"Box file missing id (stage={retrieved_file.completion_stage} "
                    f"user={retrieved_file.user_id}). Skipping."
                )
                continue
            # Check for duplicates
            if document_id in checkpoint.all_retrieved_file_ids:
                continue
            checkpoint.all_retrieved_file_ids.add(document_id)
            # Convert to document
            file_name = box_file.get("name", "unknown")
            logger.debug(f"Converting Box file to document: {file_name}")
            doc_or_failure = convert_func(box_file)
            if doc_or_failure:
                if isinstance(doc_or_failure, ConnectorFailure):
                    files_failed += 1
                    logger.warning(
                        f"Failed to convert file {file_name}: {doc_or_failure.failure_message}"
                    )
                else:
                    files_converted += 1
                    logger.debug(
                        f"Successfully converted file {file_name} to document"
                    )
                yield doc_or_failure
            else:
                # None means the converter deliberately skipped the file.
                files_skipped += 1
                logger.debug(
                    f"convert_func returned None for file {file_name} (likely skipped due to "
                    f"permissions, size, or content extraction failure)"
                )
        # Persist traversal progress back onto the checkpoint.
        checkpoint.retrieved_folder_ids = self._retrieved_folder_ids
        logger.info(
            f"Finished fetching Box items for user_id: {self.user_id}. "
            f"Summary: fetched={files_fetched}, converted={files_converted}, "
            f"skipped={files_skipped}, failed={files_failed}, "
            f"unique_file_ids={len(checkpoint.all_retrieved_file_ids)}"
        )
    except Exception as e:
        logger.exception(f"Error extracting documents from Box: {e}")
        raise
def _load_from_checkpoint(
    self,
    start: SecondsSinceUnixEpoch,
    end: SecondsSinceUnixEpoch,
    checkpoint: BoxCheckpoint,
    include_permissions: bool,
) -> CheckpointOutput[BoxCheckpoint]:
    """Entrypoint for the connector; first run is with an empty checkpoint.

    Yields documents/failures from the Box extraction and returns the updated
    checkpoint. The incoming checkpoint is never mutated; a deep copy is
    advanced and returned instead.
    """
    if self._box_client is None or self._user_id is None:
        raise RuntimeError(
            "Credentials missing, should not call this method before calling load_credentials"
        )
    logger.info(
        f"Loading from checkpoint with completion stage: {checkpoint.completion_stage}, "
        f"num retrieved ids: {len(checkpoint.all_retrieved_file_ids)}"
    )
    # Work on a private copy so the caller's checkpoint object stays untouched.
    working_checkpoint = copy.deepcopy(checkpoint)
    self._retrieved_folder_ids = working_checkpoint.retrieved_folder_ids
    yield from self._extract_docs_from_box(
        working_checkpoint, start, end, include_permissions
    )
    # Fold the folders traversed during this run back into the checkpoint.
    working_checkpoint.retrieved_folder_ids = self._retrieved_folder_ids
    logger.info(
        f"num box files retrieved: {len(working_checkpoint.all_retrieved_file_ids)}"
    )
    if working_checkpoint.completion_stage == BoxRetrievalStage.DONE:
        working_checkpoint.has_more = False
    return working_checkpoint
@override
def load_from_checkpoint(
    self,
    start: SecondsSinceUnixEpoch,
    end: SecondsSinceUnixEpoch,
    checkpoint: BoxCheckpoint,
) -> CheckpointOutput[BoxCheckpoint]:
    """Standard indexing entrypoint: extract documents WITHOUT permission data."""
    return self._load_from_checkpoint(
        start, end, checkpoint, include_permissions=False
    )
@override
def load_from_checkpoint_with_perm_sync(
    self,
    start: SecondsSinceUnixEpoch,
    end: SecondsSinceUnixEpoch,
    checkpoint: BoxCheckpoint,
) -> CheckpointOutput[BoxCheckpoint]:
    """Permission-aware indexing entrypoint: also attach external access info."""
    return self._load_from_checkpoint(
        start, end, checkpoint, include_permissions=True
    )
def _extract_slim_docs_from_box(
    self,
    checkpoint: BoxCheckpoint,
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
    callback: IndexingHeartbeatInterface | None = None,
) -> GenerateSlimDocumentOutput:
    """Extract slim documents (id + external access only) for permission syncing.

    Yields batches of at most SLIM_BATCH_SIZE slim documents. Any retrieval
    error is re-raised immediately: perm sync must not silently skip files,
    or stale permissions would be left in place.

    Raises:
        RuntimeError: if the heartbeat callback signals a stop.
    """
    from onyx.connectors.google_utils.shared_constants import SLIM_BATCH_SIZE

    slim_batch = []
    for file in self._fetch_box_items(
        checkpoint=checkpoint,
        start=start,
        end=end,
    ):
        if file.error is not None:
            raise file.error
        if doc := build_slim_document(
            self.box_client,
            file.box_file,
            PermissionSyncContext(
                primary_user_id=self.user_id,
                box_domain=None,
            ),
        ):
            slim_batch.append(doc)
            if len(slim_batch) >= SLIM_BATCH_SIZE:
                yield slim_batch
                slim_batch = []
        if callback:
            if callback.should_stop():
                raise RuntimeError(
                    "_extract_slim_docs_from_box: Stop signal detected"
                )
            callback.progress("_extract_slim_docs_from_box", 1)
    # Fix: only emit the trailing partial batch when it holds documents;
    # previously an empty final batch was always yielded.
    if slim_batch:
        yield slim_batch
def retrieve_all_slim_docs_perm_sync(
    self,
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
    callback: IndexingHeartbeatInterface | None = None,
) -> GenerateSlimDocumentOutput:
    """Retrieve all slim documents for permission syncing.

    Drives a fresh checkpoint through the slim-doc extraction until every
    retrieval stage reports DONE.
    """
    checkpoint = self.build_dummy_checkpoint()
    # Keep draining until the checkpoint reaches its terminal stage.
    while checkpoint.completion_stage != BoxRetrievalStage.DONE:
        yield from self._extract_slim_docs_from_box(
            checkpoint=checkpoint, start=start, end=end, callback=callback
        )
    logger.info("Box perm sync: Slim doc retrieval complete")
def validate_connector_settings(self) -> None:
    """Validate Box connector settings and credentials.

    Performs the cheapest authenticated call (fetch the current user) to
    confirm the loaded credentials work.

    Raises:
        ConnectorMissingCredentialError: load_credentials was never called.
        CredentialExpiredError: Box API returned 401.
        InsufficientPermissionsError: Box API returned 403.
        ConnectorValidationError: any other failure.
    """
    if self._box_client is None:
        raise ConnectorMissingCredentialError("Box credentials not loaded.")
    try:
        # Test API access by getting current user
        current_user = self._box_client.users.get_user_me()
        logger.info(f"Box connector validated for user: {current_user.name}")
    except BoxAPIError as e:
        # getattr replaces the hasattr + attribute-access pair (single lookup).
        status_code = getattr(e, "status_code", None)
        # Chain with `from e` so the original Box error is preserved as
        # __cause__ for debugging instead of being discarded.
        if status_code == 401:
            raise CredentialExpiredError(
                "Invalid or expired Box credentials (401)."
            ) from e
        elif status_code == 403:
            raise InsufficientPermissionsError(
                "Box app lacks required permissions (403). "
                "Please ensure the necessary scopes are granted."
            ) from e
        else:
            raise ConnectorValidationError(
                f"Unexpected Box error (status={status_code}): {e}"
            ) from e
    except Exception as e:
        raise ConnectorValidationError(
            f"Unexpected error during Box validation: {e}"
        ) from e
@override
def build_dummy_checkpoint(self) -> BoxCheckpoint:
    """Return a fresh checkpoint representing zero retrieval progress."""
    initial_checkpoint = BoxCheckpoint(
        completion_stage=BoxRetrievalStage.START,
        completion_map=ThreadSafeDict(),
        retrieved_folder_ids=set(),
        all_retrieved_file_ids=set(),
        has_more=True,
    )
    return initial_checkpoint
@override
def validate_checkpoint_json(self, checkpoint_json: str) -> BoxCheckpoint:
    """Validate checkpoint JSON and return checkpoint object.

    Delegates to pydantic's model_validate_json; malformed or
    schema-violating input raises pydantic's ValidationError.
    """
    return BoxCheckpoint.model_validate_json(checkpoint_json)

View File

@@ -0,0 +1,9 @@
# Item "type" discriminator values as returned by the Box API
BOX_FOLDER_TYPE = "folder"
BOX_FILE_TYPE = "file"
# Base URL used to construct a web link for files that have no shared_link
BOX_WEBLINK_BASE = "https://app.box.com/file/"
# Box API constants
BOX_API_MAX_ITEMS_PER_PAGE = 1000  # Maximum items per page in Box API pagination
# Box download constants
BOX_DOWNLOAD_CHUNK_SIZE = 8192  # Read files in 8KB chunks

View File

@@ -0,0 +1,497 @@
import io
from collections.abc import Callable
from datetime import datetime
from datetime import timezone
from typing import cast
from urllib.parse import urlparse
from urllib.parse import urlunparse
from box_sdk_gen.client import BoxClient
from pydantic import BaseModel
from onyx.access.models import ExternalAccess
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import FileOrigin
from onyx.connectors.box.constants import BOX_DOWNLOAD_CHUNK_SIZE
from onyx.connectors.box.constants import BOX_FOLDER_TYPE
from onyx.connectors.box.constants import BOX_WEBLINK_BASE
from onyx.connectors.box.models import BoxFileType
from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import Document
from onyx.connectors.models import DocumentFailure
from onyx.connectors.models import ImageSection
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.file_processing.extract_file_text import get_file_ext
from onyx.file_processing.extract_file_text import pptx_to_text
from onyx.file_processing.extract_file_text import read_docx_file
from onyx.file_processing.extract_file_text import read_pdf_file
from onyx.file_processing.extract_file_text import xlsx_to_text
from onyx.file_processing.file_types import OnyxFileExtensions
from onyx.file_processing.image_utils import store_image_and_create_section
from onyx.utils.logger import setup_logger
from onyx.utils.variable_functionality import (
fetch_versioned_implementation_with_fallback,
)
from onyx.utils.variable_functionality import noop_fallback
logger = setup_logger()
# NOTE(review): CHUNK_SIZE_BUFFER is not referenced anywhere in the visible
# portion of this module — confirm it is used elsewhere or remove it.
CHUNK_SIZE_BUFFER = 64  # extra bytes past the limit to read
def _handle_box_download_error(file_id: str, error: Exception) -> bytes:
    """Handle Box download errors, logging appropriately based on error type.

    Classifies the error as a 403/permission problem (warning) or anything
    else (error), sanitizes the message, and returns b"" so callers can
    treat the download as empty rather than fatal.
    """
    from box_sdk_gen.box import BoxAPIError
    import re

    status_code = None
    if isinstance(error, BoxAPIError):
        # SDK errors expose an HTTP status code we can inspect directly.
        status_code = getattr(error, "status_code", None)
        permission_denied = status_code == 403
    else:
        # Fall back to sniffing the error text for 403 indicators.
        lowered = str(error).lower()
        permission_denied = (
            "403" in str(error)
            or "access_denied" in lowered
            or "insufficient permission" in lowered
        )

    # Redact URLs and long token-like strings so credentials/links never
    # end up in the logs.
    sanitized = re.sub(r"https?://[^\s]+", "[URL_REDACTED]", str(error))
    sanitized = re.sub(r"\b[a-zA-Z0-9]{32,}\b", "[TOKEN_REDACTED]", sanitized)

    if permission_denied:
        logger.warning(
            f"Permission denied (403) downloading Box file {file_id}. "
            f"This may be due to file-level permissions or Box app scope limitations. "
            f"Error: {sanitized}"
        )
    else:
        logger.error(
            f"Failed to download Box file {file_id}"
            + (f" (status={status_code})" if status_code else "")
            + f": {sanitized}"
        )
    return bytes()
class PermissionSyncContext(BaseModel):
    """
    This is the information that is needed to sync permissions for a document.
    """

    # ID of the Box user the documents are retrieved as (passed to the
    # external-access resolver as the retriever identity)
    primary_user_id: str
    # Company domain forwarded as company_domain to the external-access
    # resolver; None when unknown
    box_domain: str | None = None
def onyx_document_id_from_box_file(file: BoxFileType) -> str:
    """Generate Onyx document ID from Box file.

    Uses the file's shared link when present (string, dict, or object form),
    otherwise falls back to the canonical app.box.com file URL, then
    normalizes by stripping query parameters and trailing slashes.

    Raises:
        KeyError: if the file dict has no "id".
    """
    file_id = file.get("id")
    if not file_id:
        raise KeyError("Box file missing 'id' field.")

    def _link_from_shared(raw):
        # shared_link can be a plain URL string, a dict with "url",
        # or an SDK object exposing a .url attribute.
        if isinstance(raw, str):
            return raw
        if isinstance(raw, dict):
            return raw.get("url")
        if hasattr(raw, "url"):
            return raw.url
        # Last resort: stringify whatever we got.
        return str(raw)

    shared_link = file.get("shared_link")
    link = _link_from_shared(shared_link) if shared_link else None
    if not link:
        link = f"{BOX_WEBLINK_BASE}{file_id}"

    # Normalize: drop query parameters and trailing slashes in one pass.
    parts = urlparse(link)
    normalized = parts._replace(query="", path=parts.path.rstrip("/"))
    return urlunparse(normalized)
def download_box_file(client: BoxClient, file_id: str, size_threshold: int) -> bytes:
    """
    Download the file from Box.

    Streams the content in BOX_DOWNLOAD_CHUNK_SIZE chunks. Returns b"" when
    the accumulated size exceeds size_threshold or when the download fails
    (errors are logged and swallowed by _handle_box_download_error).
    """
    download_stream = None
    try:
        # Box SDK v10 downloads files using download_file method
        # This returns a stream that we need to read
        download_stream = client.downloads.download_file(file_id=file_id)
        # Use list to collect chunks for O(n) performance instead of O(n²) with +=
        chunks: list[bytes] = []
        total_size = 0
        chunk_size = BOX_DOWNLOAD_CHUNK_SIZE
        # Read the stream in chunks
        while True:
            chunk = download_stream.read(chunk_size)
            if not chunk:
                break
            if isinstance(chunk, bytes):
                chunks.append(chunk)
                total_size += len(chunk)
            else:
                # Handle string chunks (shouldn't happen but be safe)
                chunk_bytes = chunk.encode("utf-8")
                chunks.append(chunk_bytes)
                # Fix: count encoded bytes, not characters — len() of the str
                # undercounts multi-byte characters, letting the size cap be
                # exceeded without detection.
                total_size += len(chunk_bytes)
            if total_size > size_threshold:
                logger.warning(
                    f"File {file_id} exceeds size threshold of {size_threshold}. Skipping."
                )
                return bytes()
        # Join all chunks at once for O(n) performance
        return b"".join(chunks)
    except Exception as e:
        return _handle_box_download_error(file_id, e)
    finally:
        # Ensure stream is closed on all paths (success, exception, early return)
        if download_stream is not None:
            try:
                download_stream.close()
            except Exception as close_error:
                logger.warning(
                    f"Error closing download stream for file {file_id}: {close_error}"
                )
def _download_and_extract_sections(
    file: BoxFileType,
    client: BoxClient,
    allow_images: bool,
    size_threshold: int,
) -> list[TextSection | ImageSection]:
    """Extract text and images from a Box file.

    Dispatches on file extension: image files become ImageSections (only
    when allow_images), known document formats use dedicated extractors,
    everything else falls through to the generic extract_file_text.
    Returns an empty list for folders, oversized files, failed downloads,
    or unsupported extensions.
    """
    file_id = file.get("id", "")
    file_name = file.get("name", "")
    file_type = file.get("type", "")
    # Handle shared_link as string or object (same three forms handled in
    # onyx_document_id_from_box_file: str, dict with "url", or .url attribute)
    shared_link = file.get("shared_link")
    if shared_link:
        if isinstance(shared_link, str):
            link = shared_link
        elif isinstance(shared_link, dict):
            link = shared_link.get("url")
        elif hasattr(shared_link, "url"):
            link = shared_link.url
        else:
            link = str(shared_link) if shared_link else None
    else:
        link = None
    if not link:
        link = f"{BOX_WEBLINK_BASE}{file_id}"
    # Skip folders
    if file_type == BOX_FOLDER_TYPE:
        logger.info("Skipping folder.")
        return []

    # Lazy evaluation to only download the file if necessary
    def response_call() -> bytes:
        return download_box_file(client, file_id, size_threshold)

    # Check file size before downloading anything
    file_size = file.get("size", 0)
    if file_size and file_size > size_threshold:
        logger.warning(
            f"{file_name} exceeds size threshold of {size_threshold}. Skipping."
        )
        return []
    # Get file extension for mime type detection
    file_ext = get_file_ext(file_name)
    # Handle images
    if file_ext in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"]:
        if not allow_images:
            return []
        sections: list[TextSection | ImageSection] = []
        try:
            # NOTE(review): for ".jpg" this produces media_type "image/jpg"
            # rather than the standard "image/jpeg" — confirm downstream
            # consumers accept it.
            section, embedded_id = store_image_and_create_section(
                image_data=response_call(),
                file_id=file_id,
                display_name=file_name,
                media_type=f"image/{file_ext[1:]}",
                file_origin=FileOrigin.CONNECTOR,
                link=link,
            )
            sections.append(section)
        except Exception as e:
            logger.error(f"Failed to process image {file_name}: {e}")
        return sections
    # Process based on file extension
    try:
        file_bytes = response_call()
        if not file_bytes:
            # download_box_file returns b"" on failure or size-cap breach
            logger.warning(f"Failed to download {file_name}")
            return []
        file_io = io.BytesIO(file_bytes)
        if file_ext == ".pdf":
            text, _pdf_meta, images = read_pdf_file(file_io)
            pdf_sections: list[TextSection | ImageSection] = [
                TextSection(link=link, text=text)
            ]
            # Process embedded images in the PDF only if images are allowed
            if allow_images:
                try:
                    for idx, (img_data, img_name) in enumerate(images):
                        section, embedded_id = store_image_and_create_section(
                            image_data=img_data,
                            file_id=f"{file_id}_img_{idx}",
                            display_name=img_name or f"{file_name} - image {idx}",
                            file_origin=FileOrigin.CONNECTOR,
                        )
                        pdf_sections.append(section)
                except Exception as e:
                    logger.error(f"Failed to process PDF images in {file_name}: {e}")
            return pdf_sections
        elif file_ext in [".docx", ".doc"]:
            text, _ = read_docx_file(file_io)
            return [TextSection(link=link, text=text)]
        elif file_ext == ".xlsx":
            text = xlsx_to_text(file_io, file_name=file_name)
            return [TextSection(link=link, text=text)] if text else []
        elif file_ext == ".xls":
            # Legacy Excel format - use generic extractor which can handle via unstructured API
            text = extract_file_text(file_io, file_name)
            return [TextSection(link=link, text=text)] if text else []
        elif file_ext == ".pptx":
            text = pptx_to_text(file_io, file_name=file_name)
            return [TextSection(link=link, text=text)] if text else []
        elif file_ext == ".ppt":
            # Legacy PowerPoint format - use generic extractor which can handle via unstructured API
            text = extract_file_text(file_io, file_name)
            return [TextSection(link=link, text=text)] if text else []
        elif file_ext == ".txt":
            text = file_bytes.decode("utf-8", errors="ignore")
            return [TextSection(link=link, text=text)]
        # Final attempt at extracting text using generic extractor
        if file_ext not in OnyxFileExtensions.ALL_ALLOWED_EXTENSIONS:
            logger.warning(f"Skipping file {file_name} due to extension.")
            return []
        try:
            text = extract_file_text(file_io, file_name)
            return [TextSection(link=link, text=text)]
        except Exception as e:
            logger.warning(f"Failed to extract text from {file_name}: {e}")
            return []
    except Exception as e:
        logger.error(f"Error processing file {file_name}: {e}")
        return []
def _get_external_access_for_raw_box_file(
    file: BoxFileType,
    company_domain: str | None,
    retriever_box_client: BoxClient | None,
    admin_box_client: BoxClient,
) -> ExternalAccess:
    """
    Get the external access for a raw Box file.

    Resolves the versioned (EE) implementation of
    get_external_access_for_raw_box_file at call time, falling back to a
    no-op in deployments without it, and delegates to it.
    """
    impl = fetch_versioned_implementation_with_fallback(
        "onyx.external_permissions.box.doc_sync",
        "get_external_access_for_raw_box_file",
        fallback=noop_fallback,
    )
    # cast only narrows the type for the checker; no runtime effect.
    external_access_fn = cast(
        Callable[
            [BoxFileType, str | None, BoxClient | None, BoxClient],
            ExternalAccess,
        ],
        impl,
    )
    return external_access_fn(
        file, company_domain, retriever_box_client, admin_box_client
    )
def convert_box_item_to_document(
    client: BoxClient,
    allow_images: bool,
    size_threshold: int,
    permission_sync_context: PermissionSyncContext | None,
    retriever_user_id: str,
    file: BoxFileType,
) -> Document | ConnectorFailure | None:
    """
    Convert a Box file to an Onyx Document.

    Returns None for folders, oversized files, or files from which no
    content could be extracted; returns a ConnectorFailure if an exception
    occurs during conversion. External access is attached only when
    permission_sync_context is provided.
    """
    sections: list[TextSection | ImageSection] = []
    doc_id = "unknown"
    try:
        # Skip folders
        if file.get("type") == BOX_FOLDER_TYPE:
            logger.info("Skipping folder.")
            return None
        # Check file size (value may arrive as a string; parse defensively)
        size_str = file.get("size")
        if size_str:
            try:
                size_int = int(size_str)
            except ValueError:
                logger.warning(f"Parsing string to int failed: size_str={size_str}")
            else:
                if size_int > size_threshold:
                    logger.warning(
                        f"{file.get('name')} exceeds size threshold of {size_threshold}. Skipping."
                    )
                    return None
        # Extract sections (downloads and parses the file content)
        file_name = file.get("name", "unknown")
        file_id = file.get("id", "unknown")
        logger.debug(
            f"Attempting to extract content from file: {file_name} (id: {file_id})"
        )
        sections = _download_and_extract_sections(
            file, client, allow_images, size_threshold
        )
        # If we still don't have any sections, skip this file
        if not sections:
            logger.warning(
                f"No content extracted from {file_name} (id: {file_id}). "
                f"This may be due to download permission issues, unsupported file type, "
                f"or empty file content."
            )
            return None
        doc_id = onyx_document_id_from_box_file(file)
        external_access = (
            _get_external_access_for_raw_box_file(
                file=file,
                company_domain=permission_sync_context.box_domain,
                retriever_box_client=client,
                admin_box_client=client,
            )
            if permission_sync_context
            else None
        )
        # Parse modified time to UTC datetime
        # Note: Must use exact timezone.utc object (not FixedOffset) for identity checks
        modified_time_str = file.get("modified_at")
        doc_updated_at = None
        if modified_time_str:
            try:
                parsed_dt = datetime.fromisoformat(
                    modified_time_str.replace("Z", "+00:00")
                )
                if parsed_dt.tzinfo is None:
                    # Naive timestamps are assumed to already be UTC
                    doc_updated_at = parsed_dt.replace(tzinfo=timezone.utc)
                else:
                    # Convert to UTC and recreate with exact timezone.utc object
                    # (astimezone may return FixedOffset, which fails identity checks)
                    utc_timestamp = parsed_dt.astimezone(timezone.utc).timestamp()
                    doc_updated_at = datetime.fromtimestamp(
                        utc_timestamp, tz=timezone.utc
                    )
            except (ValueError, AttributeError) as e:
                logger.warning(
                    f"Failed to parse modified_at timestamp '{modified_time_str}': {e}"
                )
        # Create the document
        return Document(
            id=doc_id,
            sections=sections,
            source=DocumentSource.BOX,
            semantic_identifier=file.get("name", ""),
            metadata={},
            doc_updated_at=doc_updated_at,
            external_access=external_access,
        )
    except Exception as e:
        # Try to get doc_id for error reporting, but don't fail if it's unavailable
        try:
            doc_id = onyx_document_id_from_box_file(file)
        except Exception:
            doc_id = "unknown"
        file_name = file.get("name", "unknown")
        error_str = f"Error converting file '{file_name}' to Document as {retriever_user_id}: {e}"
        logger.warning(error_str)
        return ConnectorFailure(
            failed_document=DocumentFailure(
                document_id=doc_id,
                document_link=(sections[0].link if sections else None),
            ),
            failed_entity=None,
            failure_message=error_str,
            exception=e,
        )
def build_slim_document(
    client: BoxClient,
    file: BoxFileType,
    permission_sync_context: PermissionSyncContext | None,
) -> SlimDocument | None:
    """Build a slim document for pruning.

    Returns None for folders. External access is resolved only when a
    permission sync context is supplied.
    """
    # Folders are never indexed, so they have no slim representation.
    if file.get("type") == BOX_FOLDER_TYPE:
        return None

    external_access = None
    if permission_sync_context:
        external_access = _get_external_access_for_raw_box_file(
            file=file,
            company_domain=permission_sync_context.box_domain,
            retriever_box_client=client,
            admin_box_client=client,
        )

    return SlimDocument(
        id=onyx_document_id_from_box_file(file),
        external_access=external_access,
    )

View File

@@ -0,0 +1,455 @@
from collections.abc import Callable
from collections.abc import Iterator
from datetime import datetime
from box_sdk_gen.client import BoxClient
from box_sdk_gen.schemas import File as BoxFile
from box_sdk_gen.schemas import Folder as BoxFolder
from onyx.connectors.box.constants import BOX_API_MAX_ITEMS_PER_PAGE
from onyx.connectors.box.models import BoxFileType
from onyx.connectors.box.models import BoxRetrievalStage
from onyx.connectors.box.models import RetrievedBoxFile
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.utils.logger import setup_logger
logger = setup_logger()
def _should_include_file_by_time(
    file_dict: BoxFileType,
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
) -> bool:
    """Check if a file should be included based on its modified time.

    Returns True when no time window is given, when the file has no (or an
    unparseable) modified_at value, or when modified_at lies within
    [start, end] (seconds since epoch).
    """
    if start is None and end is None:
        return True
    modified_time = file_dict.get("modified_at")
    if not modified_time:
        return True  # Include files without timestamps
    try:
        from datetime import timezone

        mod_dt = datetime.fromisoformat(modified_time.replace("Z", "+00:00"))
        # Fix: a naive timestamp used to be interpreted in the host's local
        # timezone by .timestamp(), making filtering server-locale-dependent.
        # Box timestamps are UTC-based, so treat a missing offset as UTC.
        if mod_dt.tzinfo is None:
            mod_dt = mod_dt.replace(tzinfo=timezone.utc)
        mod_ts = mod_dt.timestamp()
        if start is not None and mod_ts < start:
            logger.debug(
                f"Skipping file {file_dict.get('name')} - "
                f"modified {mod_ts} < start {start}"
            )
            return False
        if end is not None and mod_ts > end:
            logger.debug(
                f"Skipping file {file_dict.get('name')} - "
                f"modified {mod_ts} > end {end}"
            )
            return False
        return True
    except (ValueError, AttributeError):
        return True  # Include files with invalid timestamps
def _box_file_to_dict(file: BoxFile | BoxFolder) -> BoxFileType:
    """Convert Box SDK file/folder object to dictionary.

    Produces the plain-dict shape (BoxFileType) used throughout the
    connector so items can be handled uniformly regardless of SDK version
    quirks (datetime vs. string timestamps, enum vs. string types, etc.).
    """

    # Helper to safely convert datetime or string to ISO format
    def to_iso_string(dt_or_str):
        if dt_or_str is None:
            return None
        if isinstance(dt_or_str, str):
            return dt_or_str
        if hasattr(dt_or_str, "isoformat"):
            return dt_or_str.isoformat()
        return str(dt_or_str)

    # Helper to safely get parent ID (returns {"id": ...} or None)
    def get_parent_id(parent):
        if parent is None:
            return None
        if isinstance(parent, dict):
            return {"id": parent.get("id")} if parent.get("id") else None
        if hasattr(parent, "id"):
            return {"id": parent.id}
        return None

    return {
        "id": file.id,
        "name": file.name,
        # type may be an enum (with .value) or a plain string depending on SDK
        "type": file.type.value if hasattr(file.type, "value") else str(file.type),
        "modified_at": (
            to_iso_string(file.modified_at)
            if hasattr(file, "modified_at") and file.modified_at
            else None
        ),
        "created_at": (
            to_iso_string(file.created_at)
            if hasattr(file, "created_at") and file.created_at
            else None
        ),
        "size": file.size if hasattr(file, "size") and file.size is not None else 0,
        "parent": (
            get_parent_id(file.parent)
            if hasattr(file, "parent") and file.parent
            else None
        ),
        # shared_link is flattened to url/access/password; access may itself
        # be an enum or missing entirely
        "shared_link": (
            {
                "url": file.shared_link.url,
                "access": (
                    file.shared_link.access.value
                    if hasattr(file.shared_link, "access")
                    and hasattr(file.shared_link.access, "value")
                    else None
                ),
                "password": (
                    file.shared_link.password
                    if hasattr(file.shared_link, "password")
                    else None
                ),
            }
            if hasattr(file, "shared_link") and file.shared_link
            else None
        ),
    }
def _get_folders_in_parent(
    client: BoxClient,
    parent_id: str = "0",  # "0" is root folder in Box
) -> Iterator[BoxFileType]:
    """Yield every folder that is a direct child of the given parent.

    Pages through the Box folder-items API using opaque next_marker tokens.
    Errors are sanitized (URLs/token-like strings redacted), logged as a
    warning, and swallowed so one bad folder does not abort the crawl.
    """
    logger.info(f"Getting folders in parent {parent_id}")
    try:
        page_marker: str | None = None
        page_index = 0
        folder_count = 0
        while True:
            page_index += 1
            page = client.folders.get_folder_items(
                folder_id=parent_id,
                fields=["id", "name", "type", "modified_at", "created_at", "parent"],
                limit=BOX_API_MAX_ITEMS_PER_PAGE,
                marker=page_marker,
            )
            logger.debug(
                f"Box API page {page_index} for parent {parent_id}: "
                f"{len(page.entries)} items"
            )
            for entry in page.entries:
                if entry.type.value != "folder":
                    continue
                folder_count += 1
                logger.debug(
                    f"Found folder in parent {parent_id}: {entry.name} (id: {entry.id})"
                )
                yield _box_file_to_dict(entry)
            # Box markers are opaque tokens and must come from next_marker.
            # Using item IDs as markers can cause duplicates, skipped items,
            # or infinite loops.
            page_marker = getattr(page, "next_marker", None)
            if page_marker:
                continue
            if page.entries and len(page.entries) == BOX_API_MAX_ITEMS_PER_PAGE:
                # A full page with no next_marker should never happen; bail
                # out rather than risk duplicates or an infinite loop.
                logger.error(
                    f"Box API did not return next_marker for parent {parent_id} despite full page. "
                    f"Stopping pagination to avoid duplicates or infinite loops. "
                    f"This may indicate a Box API issue or incomplete data retrieval."
                )
            break
        logger.info(f"Found {folder_count} folders in parent {parent_id}")
    except Exception as e:
        import re

        # Sanitize error message to avoid leaking sensitive data
        # (URLs, tokens, etc.)
        sanitized = re.sub(r"https?://[^\s]+", "[URL_REDACTED]", str(e))
        sanitized = re.sub(r"\b[a-zA-Z0-9]{32,}\b", "[TOKEN_REDACTED]", sanitized)
        logger.warning(f"Error getting folders in parent {parent_id}: {sanitized}")
        # Continue on error, similar to Google Drive behavior
def _get_files_in_parent(
    client: BoxClient,
    parent_id: str = "0",
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
) -> Iterator[BoxFileType]:
    """Get all files in a parent folder.

    Non-recursive: only direct children are yielded, filtered by modified_at
    against [start, end]. Unlike _get_folders_in_parent, errors here are
    re-raised so the caller does not mark the folder as fully traversed
    after a partial listing.
    """
    logger.info(f"Getting files in parent {parent_id} (start={start}, end={end})")
    try:
        # Box API pagination: limit + opaque marker tokens from next_marker
        limit = BOX_API_MAX_ITEMS_PER_PAGE
        marker: str | None = None
        total_files = 0
        page_num = 0
        while True:
            page_num += 1
            items = client.folders.get_folder_items(
                folder_id=parent_id,
                fields=[
                    "id",
                    "name",
                    "type",
                    "modified_at",
                    "created_at",
                    "size",
                    "parent",
                    "shared_link",
                ],
                limit=limit,
                marker=marker,
            )
            logger.debug(
                f"Box API page {page_num} for parent {parent_id}: {len(items.entries)} items"
            )
            for item in items.entries:
                if item.type.value == "file":
                    file_dict = _box_file_to_dict(item)
                    # Drop files outside the requested time window
                    if not _should_include_file_by_time(file_dict, start, end):
                        continue
                    total_files += 1
                    yield file_dict
            # Box API pagination: check if there are more pages
            # The Box API response should have a next_marker field when there are more pages
            # Box markers are opaque tokens and must come from next_marker.
            # Using item IDs as markers can cause duplicates, skipped items, or infinite loops.
            next_marker = getattr(items, "next_marker", None)
            if next_marker:
                # Use the API-provided next_marker token for the next page
                marker = next_marker
            elif items.entries and len(items.entries) == limit:
                # Box API should always provide next_marker when there are more pages.
                # If it doesn't, we cannot safely continue pagination.
                logger.error(
                    f"Box API did not return next_marker for parent {parent_id} despite full page. "
                    f"Stopping pagination to avoid duplicates or infinite loops. "
                    f"This may indicate a Box API issue or incomplete data retrieval."
                )
                break
            else:
                break
        logger.info(f"Found {total_files} files in parent {parent_id}")
    except Exception as e:
        # Sanitize error message to avoid leaking sensitive data (URLs, tokens, etc.)
        import re

        error_str = str(e)
        # Remove URLs
        error_str = re.sub(r"https?://[^\s]+", "[URL_REDACTED]", error_str)
        # Remove potential tokens (long alphanumeric strings)
        error_str = re.sub(r"\b[a-zA-Z0-9]{32,}\b", "[TOKEN_REDACTED]", error_str)
        logger.error(
            f"Error getting files in parent {parent_id}: {error_str}. "
            f"Re-raising to prevent folder from being marked as traversed."
        )
        # Re-raise the exception so the caller can handle it and avoid marking
        # the folder as traversed after a failed/partial retrieval
        raise
def crawl_folders_for_files(
    client: BoxClient,
    parent_id: str,
    user_id: str,
    traversed_parent_ids: set[str],
    update_traversed_ids_func: Callable[[str], None],
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
) -> Iterator[RetrievedBoxFile]:
    """
    Recursively crawl folders to get all files.
    This function starts crawling from any folder.

    Folders already in traversed_parent_ids are skipped (file listing only —
    their subfolders are still recursed into). Listing errors are yielded as
    RetrievedBoxFile objects with error set, and the folder is NOT marked as
    traversed so it can be retried later.
    """
    logger.debug(f"Crawling folder {parent_id}")
    if parent_id not in traversed_parent_ids:
        try:
            files_yielded = 0
            for file_dict in _get_files_in_parent(
                client=client,
                parent_id=parent_id,
                start=start,
                end=end,
            ):
                logger.debug(f"Found file: {file_dict.get('name')}")
                yield RetrievedBoxFile(
                    box_file=file_dict,
                    user_id=user_id,
                    parent_id=parent_id,
                    completion_stage=BoxRetrievalStage.FOLDER_FILES,
                )
                files_yielded += 1
            # Mark folder as traversed only after successfully processing all files
            # (even if no files were found, to avoid re-processing empty folders)
            # Only mark as traversed if we completed without exceptions
            update_traversed_ids_func(parent_id)
            logger.debug(
                f"Successfully traversed folder {parent_id}, found {files_yielded} files"
            )
        except Exception as e:
            # Sanitize error message to avoid leaking sensitive data (URLs, tokens, etc.)
            import re

            error_str = str(e)
            # Remove URLs
            error_str = re.sub(r"https?://[^\s]+", "[URL_REDACTED]", error_str)
            # Remove potential tokens (long alphanumeric strings)
            error_str = re.sub(r"\b[a-zA-Z0-9]{32,}\b", "[TOKEN_REDACTED]", error_str)
            logger.error(
                f"Error getting files in parent {parent_id}: {error_str}. "
                f"Folder will not be marked as traversed and may be retried in future crawls."
            )
            # Do NOT mark folder as traversed when file listing aborts on error
            # This allows the folder to be retried in future crawls
            yield RetrievedBoxFile(
                box_file={},
                user_id=user_id,
                parent_id=parent_id,
                completion_stage=BoxRetrievalStage.FOLDER_FILES,
                error=e,
            )
    else:
        logger.debug(f"Skipping folder {parent_id} (already traversed)")
    # Recursively process subfolders (runs even for already-traversed parents)
    for folder_dict in _get_folders_in_parent(client=client, parent_id=parent_id):
        folder_id = folder_dict.get("id")
        if folder_id:
            logger.debug(f"Recursively crawling subfolder: {folder_dict.get('name')}")
            yield from crawl_folders_for_files(
                client=client,
                parent_id=folder_id,
                user_id=user_id,
                traversed_parent_ids=traversed_parent_ids,
                update_traversed_ids_func=update_traversed_ids_func,
                start=start,
                end=end,
            )
def get_all_files_in_folder(
    client: BoxClient,
    folder_id: str = "0",
    user_id: str = "me",
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
    marker: str | None = None,
) -> Iterator[RetrievedBoxFile | str]:
    """
    Get all files in a folder (non-recursive).
    Returns RetrievedBoxFile objects or a marker string for pagination.

    When the API reports another page via next_marker, that marker string is
    yielded (for checkpoint resumption) and iteration stops; callers resume
    by passing it back in as `marker`. Errors are yielded as a
    RetrievedBoxFile with error set and an empty box_file.
    """
    logger.info(
        f"Getting files in folder {folder_id} (user: {user_id}, "
        f"start={start}, end={end}, marker={marker})"
    )
    try:
        limit = BOX_API_MAX_ITEMS_PER_PAGE
        current_marker = marker
        total_files = 0
        page_num = 0
        while True:
            page_num += 1
            items = client.folders.get_folder_items(
                folder_id=folder_id,
                fields=[
                    "id",
                    "name",
                    "type",
                    "modified_at",
                    "created_at",
                    "size",
                    "parent",
                    "shared_link",
                ],
                limit=limit,
                marker=current_marker,
            )
            logger.info(
                f"Box API returned {len(items.entries)} items for folder {folder_id} "
                f"(page {page_num}, marker={current_marker})"
            )
            for item in items.entries:
                logger.debug(
                    f"Found item in folder {folder_id}: type={item.type.value}, "
                    f"name={item.name if hasattr(item, 'name') else 'N/A'}"
                )
                if item.type.value == "file":
                    file_dict = _box_file_to_dict(item)
                    # Apply time filter
                    if not _should_include_file_by_time(file_dict, start, end):
                        continue
                    total_files += 1
                    logger.debug(f"Yielding file: {file_dict.get('name')}")
                    yield RetrievedBoxFile(
                        box_file=file_dict,
                        user_id=user_id,
                        parent_id=folder_id,
                        completion_stage=BoxRetrievalStage.FOLDER_FILES,
                    )
            # Box API pagination: check if there are more pages
            # The Box API response should have a next_marker field when there are more pages
            # Box markers are opaque tokens and must come from next_marker.
            # Using item IDs as markers can cause duplicates, skipped items, or infinite loops.
            next_marker = getattr(items, "next_marker", None)
            if next_marker:
                # Use the API-provided next_marker token for the next page
                current_marker = next_marker
                logger.debug(
                    f"More pages available for folder {folder_id}, next_marker: {current_marker}"
                )
                yield current_marker  # Yield marker for checkpoint resumption
                break
            elif items.entries and len(items.entries) == limit:
                # Box API should always provide next_marker when there are more pages.
                # If it doesn't, we cannot safely continue pagination.
                logger.error(
                    f"Box API did not return next_marker for folder {folder_id} despite full page. "
                    f"Stopping pagination to avoid duplicates or infinite loops. "
                    f"This may indicate a Box API issue or incomplete data retrieval."
                )
                # Don't yield a marker - we can't safely continue
                break
            else:
                # No more pages
                break
        logger.info(f"Found {total_files} files in folder {folder_id}")
    except Exception as e:
        # Sanitize error message to avoid leaking sensitive data (URLs, tokens, etc.)
        error_str = str(e)
        # Remove potential URLs and tokens from error message
        import re

        # Remove URLs
        error_str = re.sub(r"https?://[^\s]+", "[URL_REDACTED]", error_str)
        # Remove potential tokens (long alphanumeric strings)
        error_str = re.sub(r"\b[a-zA-Z0-9]{32,}\b", "[TOKEN_REDACTED]", error_str)
        logger.error(f"Error getting all files in folder {folder_id}: {error_str}")
        yield RetrievedBoxFile(
            box_file={},
            user_id=user_id,
            parent_id=folder_id,
            completion_stage=BoxRetrievalStage.FOLDER_FILES,
            error=e,
        )

View File

@@ -0,0 +1,88 @@
from enum import Enum
from typing import Any
from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic import field_serializer
from pydantic import field_validator
from onyx.connectors.interfaces import ConnectorCheckpoint
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.utils.threadpool_concurrency import ThreadSafeDict
# Alias for a raw Box file payload: an untyped JSON-style dict
# (empty dict is used as a placeholder when retrieval fails).
BoxFileType = dict[str, Any]
class BoxRetrievalStage(str, Enum):
    """Stages of retrieval for Box connector.

    Inherits from ``str`` so stage values serialize cleanly in checkpoints.
    """

    START = "start"  # nothing retrieved yet
    FOLDER_FILES = "folder_files"  # enumerating files inside folders
    DONE = "done"  # retrieval finished
class StageCompletion(BaseModel):
    """
    Tracks progress through the retrieval process for a user.

    completed_until: Timestamp of the latest file retrieved or error yielded.
    current_folder_id: Folder currently being processed (for resumption).
    next_marker: Pagination marker for resuming from a specific page.
    """

    # Current retrieval stage for this user (start / folder_files / done).
    stage: BoxRetrievalStage
    # Epoch seconds of the most recent file (or error) yielded for this user.
    completed_until: SecondsSinceUnixEpoch
    # Folder in progress when the checkpoint was taken; defaults to None.
    current_folder_id: str | None = None
    # Opaque Box pagination token; None means start from the first page.
    next_marker: str | None = None
class RetrievedBoxFile(BaseModel):
    """
    Represents a file retrieved from Box.

    If an error occurs during retrieval, the error field is set
    and will be propagated as a ConnectorFailure.
    """

    # The stage at which this file was retrieved
    completion_stage: BoxRetrievalStage
    # The file that was retrieved (raw Box API payload; empty dict when
    # this object carries an error instead of a file)
    box_file: BoxFileType
    # The ID of the user that the file was retrieved by
    user_id: str
    # The id of the parent folder of the file
    parent_id: str | None = None
    # Any unexpected error that occurred while retrieving the file.
    error: Exception | None = None

    # Exception is not a pydantic-native type, so arbitrary types must be allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)
class BoxCheckpoint(ConnectorCheckpoint):
    """Checkpoint for Box connector retrieval state.

    Serialized/deserialized across indexing runs, so the thread-safe
    ``completion_map`` needs custom (de)serialization hooks below.
    """

    # Folder IDs whose contents have been fully retrieved.
    retrieved_folder_ids: set[str]
    # Overall stage of the retrieval run.
    completion_stage: BoxRetrievalStage
    # Per-user progress, keyed by user ID; thread-safe since multiple
    # workers may update progress concurrently.
    completion_map: ThreadSafeDict[str, StageCompletion]
    # IDs of every file retrieved so far (pydantic copies mutable defaults
    # per instance, so the shared-default pitfall does not apply here).
    all_retrieved_file_ids: set[str] = set()
    # Folders still pending retrieval; None if not yet enumerated.
    folder_ids_to_retrieve: list[str] | None = None

    @field_serializer("completion_map")
    def serialize_completion_map(
        self, completion_map: ThreadSafeDict[str, StageCompletion], _info: Any
    ) -> dict[str, StageCompletion]:
        # Use copy() method to get a thread-safe snapshot instead of accessing _dict directly
        # This maintains thread safety and avoids exposing mutable internal state
        return completion_map.copy()

    @field_validator("completion_map", mode="before")
    def validate_completion_map(cls, v: Any) -> ThreadSafeDict[str, StageCompletion]:
        # Raise instead of assert: asserts are stripped under `python -O`,
        # which would silently skip this validation. Pydantic converts both
        # ValueError and AssertionError into ValidationError, so callers see
        # the same exception type as before.
        if not isinstance(v, (dict, ThreadSafeDict)):
            raise ValueError(
                f"completion_map must be a dict or ThreadSafeDict, "
                f"got {type(v).__name__}"
            )
        return ThreadSafeDict(
            {k: StageCompletion.model_validate(val) for k, val in v.items()}
        )

View File

@@ -0,0 +1,73 @@
"""Utility functions for Box connector."""
import json
from typing import Any
def parse_box_jwt_config(env_str: str) -> dict[str, Any]:
    """
    Parse a Box JWT configuration JSON string (e.g. from an environment
    variable) into a dictionary.

    Tolerates two common mangling patterns:
      * double-escaped JSON (``\\"`` sequences plus wrapping quotes), and
      * doubly-encoded JSON (a JSON string whose content is itself JSON).
    Also converts literal ``\\n`` sequences inside the private key into real
    newlines.

    Args:
        env_str: The JSON string from environment variables (may be double-escaped)

    Returns:
        Parsed JWT config dictionary

    Raises:
        json.JSONDecodeError: If the string cannot be parsed as JSON.
        TypeError: If the config or a required nested section is not a dict.
        ValueError: If the required 'boxAppSettings' field is missing.
    """
    # Attempt a straight parse first; on failure, strip the extra escaping
    # layer (escaped quotes + surrounding quotes) and retry.
    try:
        parsed: Any = json.loads(env_str)
    except json.JSONDecodeError:
        cleaned = env_str.replace('\\"', '"').strip('"')
        parsed = json.loads(cleaned)

    # A doubly-encoded config parses to a string on the first pass;
    # decode one more time in that case.
    if isinstance(parsed, str):
        try:
            parsed = json.loads(parsed)
        except json.JSONDecodeError:
            raise json.JSONDecodeError(
                "Double-parsed JSON returned a string that is not valid JSON",
                parsed,
                0,
            )

    # Structural validation, in the same order as before.
    if not isinstance(parsed, dict):
        raise TypeError(
            f"Expected Box JWT config to be a dict, got {type(parsed).__name__}"
        )
    if "boxAppSettings" not in parsed:
        raise ValueError("Box JWT config missing required 'boxAppSettings' field")

    app_settings = parsed["boxAppSettings"]
    if not isinstance(app_settings, dict):
        raise TypeError(
            f"Expected boxAppSettings to be a dict, got {type(app_settings).__name__}"
        )

    # Normalize the private key: literal \n sequences -> real newlines.
    if "appAuth" in app_settings:
        app_auth = app_settings["appAuth"]
        if not isinstance(app_auth, dict):
            raise TypeError(
                f"Expected appAuth to be a dict, got {type(app_auth).__name__}"
            )
        key_material = app_auth.get("privateKey", "")
        if key_material and "\\n" in key_material:
            app_auth["privateKey"] = key_material.replace("\\n", "\n")

    return parsed

View File

@@ -208,6 +208,10 @@ CONNECTOR_CLASS_MAP = {
module_path="onyx.connectors.bitbucket.connector",
class_name="BitbucketConnector",
),
DocumentSource.BOX: ConnectorMapping(
module_path="onyx.connectors.box.connector",
class_name="BoxConnector",
),
DocumentSource.TESTRAIL: ConnectorMapping(
module_path="onyx.connectors.testrail.connector",
class_name="TestRailConnector",

View File

@@ -482,6 +482,37 @@ def cleanup_google_drive_credentials(db_session: Session) -> None:
db_session.commit()
def cleanup_box_jwt_credentials(db_session: Session) -> None:
    """Clean up Box JWT credentials that reference the deleted JWT config.

    This function properly handles deletion of related connector/document pairs
    to avoid foreign key constraint violations.
    """
    from onyx.connectors.box.box_kv import (
        BOX_AUTHENTICATION_METHOD_UPLOADED,
        DB_CREDENTIALS_AUTHENTICATION_METHOD,
    )

    for cred in fetch_credentials_by_source(
        db_session=db_session, document_source=DocumentSource.BOX
    ):
        # Only credentials created from an uploaded JWT config are removed.
        auth_method = (cred.credential_json or {}).get(
            DB_CREDENTIALS_AUTHENTICATION_METHOD
        )
        if auth_method != BOX_AUTHENTICATION_METHOD_UPLOADED:
            continue
        # force=True also removes related connector/document pairs, avoiding
        # foreign key constraint violations during deletion.
        _delete_credential_internal(
            credential=cred,
            credential_id=cred.id,
            db_session=db_session,
            force=True,
        )
def delete_service_account_credentials(
user: User | None, db_session: Session, source: DocumentSource
) -> None:

View File

@@ -37,6 +37,10 @@ from onyx.configs.constants import MilestoneRecordType
from onyx.configs.constants import ONYX_METADATA_FILENAME
from onyx.configs.constants import OnyxCeleryPriority
from onyx.configs.constants import OnyxCeleryTask
from onyx.connectors.box.box_kv import build_box_jwt_creds
from onyx.connectors.box.box_kv import delete_box_jwt_config
from onyx.connectors.box.box_kv import get_box_jwt_config
from onyx.connectors.box.box_kv import upsert_box_jwt_config
from onyx.connectors.exceptions import ConnectorValidationError
from onyx.connectors.factory import validate_ccpair_for_user
from onyx.connectors.google_utils.google_auth import (
@@ -90,6 +94,7 @@ from onyx.db.connector_credential_pair import get_connector_credential_pairs_for
from onyx.db.connector_credential_pair import (
get_connector_credential_pairs_for_user_parallel,
)
from onyx.db.credentials import cleanup_box_jwt_credentials
from onyx.db.credentials import cleanup_gmail_credentials
from onyx.db.credentials import cleanup_google_drive_credentials
from onyx.db.credentials import create_credential
@@ -118,6 +123,8 @@ from onyx.key_value_store.interface import KvKeyNotFoundError
from onyx.redis.redis_pool import get_redis_client
from onyx.server.documents.models import AuthStatus
from onyx.server.documents.models import AuthUrl
from onyx.server.documents.models import BoxJWTConfig
from onyx.server.documents.models import BoxJWTCredentialRequest
from onyx.server.documents.models import ConnectorBase
from onyx.server.documents.models import ConnectorCredentialPairIdentifier
from onyx.server.documents.models import ConnectorFileInfo
@@ -393,6 +400,78 @@ def upsert_gmail_service_account_credential(
return ObjectCreationIdResponse(id=credential.id)
@router.get("/admin/connector/box/jwt-config")
def check_box_jwt_config_exist(
    _: User = Depends(current_admin_user),
) -> dict[str, str]:
    """Check if Box JWT config exists."""
    try:
        config = get_box_jwt_config()
    except KvKeyNotFoundError:
        # No config stored yet -> 404 so the UI can prompt for an upload.
        raise HTTPException(status_code=404, detail="Box JWT config not found")
    return {
        "client_id": config.client_id,
        "enterprise_id": config.enterpriseID or "Not set",
    }
@router.put("/admin/connector/box/jwt-config")
def upsert_box_jwt_config_endpoint(
    jwt_config: BoxJWTConfig, _: User = Depends(current_admin_user)
) -> StatusResponse:
    """Upload Box JWT config JSON."""
    try:
        upsert_box_jwt_config(jwt_config)
    except ValueError as bad_config:
        # Validation failures surface as a 400 with the original message.
        raise HTTPException(status_code=400, detail=str(bad_config))
    else:
        return StatusResponse(
            success=True, message="Successfully saved Box JWT config"
        )
@router.delete("/admin/connector/box/jwt-config")
def delete_box_jwt_config_endpoint(
    _: User = Depends(current_admin_user),
    db_session: Session = Depends(get_session),
) -> StatusResponse:
    """Delete Box JWT config."""
    try:
        delete_box_jwt_config()
        # Remove any credentials that referenced the now-deleted JWT config.
        cleanup_box_jwt_credentials(db_session=db_session)
    except KvKeyNotFoundError as missing:
        raise HTTPException(status_code=400, detail=str(missing))
    return StatusResponse(
        success=True, message="Successfully deleted Box JWT config"
    )
@router.put("/admin/connector/box/jwt-credential")
def upsert_box_jwt_credential(
    jwt_credential_request: BoxJWTCredentialRequest,
    user: User | None = Depends(current_admin_user),
    db_session: Session = Depends(get_session),
) -> ObjectCreationIdResponse:
    """Special API which allows the creation of a credential for Box JWT.
    Combines the input with the saved JWT config to create an entry in the
    `Credential` table."""
    try:
        credential_base = build_box_jwt_creds(
            primary_admin_user_id=jwt_credential_request.box_primary_admin_user_id,
            name="Box JWT (uploaded)",
        )
    except KvKeyNotFoundError as missing:
        # No JWT config uploaded yet -> client error, not a server error.
        raise HTTPException(status_code=400, detail=str(missing))

    # Drop stale/duplicate Box JWT credentials before inserting the new one.
    # cleanup_box_jwt_credentials also deletes related connector/document pairs.
    cleanup_box_jwt_credentials(db_session=db_session)

    # NOTE(review): an earlier comment claimed `user=None` here, but the
    # admin user is passed through — confirm whether this credential is
    # meant to be personal or shared.
    new_credential = create_credential(
        credential_data=credential_base, user=user, db_session=db_session
    )
    return ObjectCreationIdResponse(id=new_credential.id)
@router.get("/admin/connector/google-drive/check-auth/{credential_id}")
def check_drive_tokens(
credential_id: int,

View File

@@ -565,6 +565,52 @@ class GoogleServiceAccountCredentialRequest(BaseModel):
google_primary_admin: str | None = None # email of user to impersonate
class BoxJWTConfig(BaseModel):
    """Box JWT configuration from JSON file."""

    boxAppSettings: dict[str, Any]
    enterpriseID: str | None = None

    model_config = {"extra": "allow"}  # Allow extra fields in case Box adds more

    def model_post_init(self, __context: Any) -> None:
        """Validate required nested keys after model initialization."""
        settings = self.boxAppSettings
        if not isinstance(settings, dict):
            raise ValueError(
                f"boxAppSettings must be a dict, got {type(settings).__name__}"
            )
        # Required top-level fields inside boxAppSettings, in check order.
        for field_name in ("clientID", "clientSecret", "appAuth"):
            if field_name not in settings:
                raise ValueError(
                    f"boxAppSettings missing required '{field_name}' field"
                )
        app_auth = settings["appAuth"]
        if not isinstance(app_auth, dict):
            raise ValueError(
                f"boxAppSettings.appAuth must be a dict, got {type(app_auth).__name__}"
            )
        # Fields required inside appAuth for JWT signing.
        for field_name in ("privateKey", "publicKeyID"):
            if field_name not in app_auth:
                raise ValueError(
                    f"boxAppSettings.appAuth missing required '{field_name}' field"
                )
class BoxJWTCredentialRequest(BaseModel):
    """Request body for creating a Box JWT credential from the stored config."""

    box_primary_admin_user_id: str | None = None  # user ID to impersonate
class FileUploadResponse(BaseModel):
file_paths: list[str]
file_names: list[str]

View File

@@ -91,6 +91,8 @@ botocore==1.39.11
# s3transfer
botocore-stubs==1.40.74
# via boto3-stubs
boxsdk==10.3.0
# via onyx
braintrust==0.3.9
# via onyx
brotli==1.2.0
@@ -896,6 +898,7 @@ regex==2025.11.3
requests==2.32.5
# via
# atlassian-python-api
# boxsdk
# braintrust
# cohere
# dropbox
@@ -945,6 +948,7 @@ requests-oauthlib==1.3.1
# onyx
requests-toolbelt==1.0.0
# via
# boxsdk
# jira
# langsmith
# python-gitlab

View File

@@ -0,0 +1,718 @@
"""
Script to set up Box test environment for connector tests.
This script:
1. Reads Box credentials and user IDs from .test.env
2. Creates the required folder structure
3. Creates test files with proper naming and content
4. Sets up sharing/permissions between users
5. Updates consts_and_utils.py with actual folder and user IDs
Usage:
cd backend
python scripts/setup_box_test_env.py
"""
import json
import os
import re
import sys
from pathlib import Path
from typing import Any
# Add backend to path before importing onyx modules
# (the script is run as `python scripts/setup_box_test_env.py` from backend/,
# so parent.parent resolves to the backend directory)
backend_path = Path(__file__).parent.parent
sys.path.insert(0, str(backend_path))
from box_sdk_gen import BoxClient # noqa: E402
from box_sdk_gen import BoxJWTAuth # noqa: E402
from box_sdk_gen import JWTConfig # noqa: E402
from box_sdk_gen.managers.folders import CreateFolderParent # noqa: E402
from box_sdk_gen.schemas import File # noqa: E402
from box_sdk_gen.schemas import Folder # noqa: E402
from onyx.connectors.box.utils import parse_box_jwt_config # noqa: E402
def load_env_vars() -> None:
    """Load environment variables from .test.env into ``os.environ``.

    Lines are ``KEY=VALUE``; blank lines, ``#`` comments, and lines without
    ``=`` are skipped. Surrounding double quotes on values are stripped.

    Raises:
        FileNotFoundError: If .test.env does not exist under the backend dir.
    """
    env_file = backend_path / ".test.env"
    if not env_file.exists():
        raise FileNotFoundError(f".test.env file not found at {env_file}")
    with open(env_file, "r") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, value = line.split("=", 1)
            # Fix: strip whitespace around key/value so "KEY = value" does not
            # register a padded key, then drop optional surrounding quotes.
            os.environ[key.strip()] = value.strip().strip('"')
def get_box_client(user_key: str = "admin") -> tuple[BoxClient, str]:
    """Get Box client for a specific user.
    Uses the same JWT config for all users, impersonating via user ID.

    Args:
        user_key: One of "admin", "test_user_1", "test_user_2", "test_user_3";
            unknown keys fall back to the admin user ID env var.

    Returns:
        Tuple of (authenticated BoxClient, Box user ID the client acts as).

    Raises:
        ValueError: If BOX_JWT_CONFIG_JSON_STR is missing or cannot be parsed.
    """
    # Always use the same JWT config
    jwt_config_str = os.environ.get("BOX_JWT_CONFIG_JSON_STR")
    if not jwt_config_str:
        raise ValueError("BOX_JWT_CONFIG_JSON_STR not found in .test.env")
    # Get the user ID for impersonation
    user_id_map = {
        "admin": "BOX_PRIMARY_ADMIN_USER_ID",
        "test_user_1": "BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_1",
        "test_user_2": "BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_2",
        "test_user_3": "BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_3",
    }
    primary_admin_id = os.environ.get(
        user_id_map.get(user_key, "BOX_PRIMARY_ADMIN_USER_ID")
    )
    # Parse and normalize the JWT config string
    jwt_config_dict = parse_box_jwt_config(jwt_config_str)
    # Re-serialize to ensure proper JSON format
    normalized_jwt_config_str = json.dumps(jwt_config_dict)
    # Use from_config_json_string (as used in connector)
    try:
        jwt_config = JWTConfig.from_config_json_string(normalized_jwt_config_str)
    except Exception as e:
        raise ValueError(
            f"Failed to parse JWT config: {e}. Please check your BOX_JWT_CONFIG_JSON_STR format."
        )
    auth = BoxJWTAuth(config=jwt_config)
    # Use primary admin user ID for impersonation if provided
    if primary_admin_id:
        user_auth = auth.with_user_subject(primary_admin_id)
        client = BoxClient(auth=user_auth)
        user_id = primary_admin_id
    else:
        # No user configured: act as the service account and look up its own ID.
        client = BoxClient(auth=auth)
        # Get user ID
        user = client.users.get_user_me()
        user_id = user.id
    return client, user_id
def create_folder(client: BoxClient, name: str, parent_id: str = "0") -> Folder:
    """Create a folder in Box.

    If the folder already exists (Box returns a 409 / item_name_in_use), this
    tries three fallbacks in order to resolve the existing folder:
    conflict info in the error body, listing the parent folder, then search.

    Args:
        client: Authenticated Box client.
        name: Folder name to create.
        parent_id: Parent folder ID; "0" is the Box root folder.

    Returns:
        The created (or pre-existing) Folder.

    Raises:
        ValueError: If the folder exists but cannot be resolved automatically.
    """
    print(f"Creating folder '{name}' in parent {parent_id}...")
    try:
        from box_sdk_gen.box.errors import BoxAPIError
        folder = client.folders.create_folder(
            name=name,
            parent=CreateFolderParent(id=parent_id),
        )
        print(f" ✓ Created folder '{name}' with ID: {folder.id}")
        return folder
    except BoxAPIError as e:
        # Handle folder already exists (409)
        error_msg = str(e)
        error_code = getattr(e, "code", None)
        if "409" in error_msg or error_code == "item_name_in_use":
            # Fallback 1: try to get the existing folder ID from the error body
            try:
                if hasattr(e, "response") and hasattr(e.response, "body"):
                    body = e.response.body
                    if isinstance(body, dict):
                        context_info = body.get("context_info", {})
                        conflicts = context_info.get("conflicts", [])
                        if conflicts:
                            # Conflicts can be a list or dict
                            if isinstance(conflicts, list) and len(conflicts) > 0:
                                folder_id = conflicts[0].get("id")
                            elif isinstance(conflicts, dict):
                                folder_id = conflicts.get("id")
                            else:
                                folder_id = None
                            if folder_id:
                                folder = client.folders.get_folder_by_id(folder_id)
                                print(
                                    f" Folder '{name}' already exists (ID: {folder_id})"
                                )
                                return folder
            except Exception:
                pass # Will try listing approach below
            # Fallback 2: find the existing folder by listing the parent folder
            try:
                # List items in parent folder to find the folder by name
                items_response = client.folders.get_folder_items(parent_id)
                if hasattr(items_response, "entries"):
                    for item in items_response.entries:
                        # Check if this item matches the folder name
                        item_name = getattr(item, "name", None)
                        if item_name == name:
                            # Check if it's a folder (not a file)
                            item_type = getattr(item, "type", None)
                            # Box SDK Gen uses type.value or type enum
                            if hasattr(item_type, "value"):
                                item_type_str = item_type.value
                            else:
                                item_type_str = str(item_type)
                            if (
                                item_type_str == "folder"
                                or "folder" in item_type_str.lower()
                            ):
                                folder_id = item.id
                                folder = client.folders.get_folder_by_id(folder_id)
                                print(
                                    f" Folder '{name}' already exists (ID: {folder_id})"
                                )
                                return folder
            except Exception:
                # If listing also fails, we'll try one more approach
                pass
            # Fallback 3 (last resort): try to search for the folder
            try:
                # Use search to find the folder
                search_results = client.search.search(
                    query=name,
                    type="folder",
                    ancestor_folders=[parent_id],
                )
                if hasattr(search_results, "entries"):
                    for item in search_results.entries:
                        if getattr(item, "name", None) == name:
                            folder_id = item.id
                            folder = client.folders.get_folder_by_id(folder_id)
                            print(
                                f" Folder '{name}' already exists (ID: {folder_id})"
                            )
                            return folder
            except Exception:
                pass
            # If we still can't get the folder, inform the user
            print(
                f" ⚠️ Folder '{name}' already exists but could not retrieve it automatically"
            )
            print(" You may need to delete it manually or use a different name")
            raise ValueError(
                f"Folder '{name}' already exists. Please delete it manually or use a different name."
            )
        raise
    except Exception as e:
        print(f" ✗ Error creating folder '{name}': {e}")
        raise
def upload_file(client: BoxClient, name: str, content: str, parent_id: str) -> File:
    """Upload a file to Box.

    Small files (< 20MB) go through the regular upload endpoint; larger
    files use chunked upload. A 409 "already exists" conflict resolves to
    the existing file when its ID can be recovered from the error body,
    otherwise a placeholder File is returned so the setup script can continue.

    Args:
        client: Authenticated Box client.
        name: File name to create.
        content: UTF-8 text content of the file.
        parent_id: Parent folder ID.

    Returns:
        The uploaded (or pre-existing / placeholder) File.
    """
    print(f" Uploading file '{name}'...")
    try:
        import io
        from box_sdk_gen.box.errors import BoxAPIError
        file_content = content.encode("utf-8")
        file_size = len(file_content)
        file_io = io.BytesIO(file_content)
        # Use uploads.upload_file for small files (< 20MB)
        # Use chunked_uploads.upload_big_file for large files (>= 20MB)
        from box_sdk_gen.managers.uploads import UploadFileAttributes
        from box_sdk_gen.managers.uploads import UploadFileAttributesParentField
        if file_size < 20 * 1024 * 1024:  # 20MB threshold
            # Small file - use regular upload
            try:
                file_result = client.uploads.upload_file(
                    attributes=UploadFileAttributes(
                        name=name,
                        parent=UploadFileAttributesParentField(id=parent_id),
                    ),
                    file=file_io,
                )
                # upload_file returns Files object which contains entries list
                if hasattr(file_result, "entries") and file_result.entries:
                    uploaded_file = file_result.entries[0]
                else:
                    uploaded_file = file_result
            except BoxAPIError as e:
                # Handle file already exists (409) - check error message/code
                error_msg = str(e)
                error_code = getattr(e, "code", None)
                if "409" in error_msg or error_code == "item_name_in_use":
                    # Try to extract the existing file's ID from the error body.
                    try:
                        # Fix: the response-body access must sit INSIDE the
                        # hasattr guard. Previously the guard body was `pass`
                        # and `e.response.body` ran unconditionally, defeating
                        # the check (any AttributeError was silently masked by
                        # the inner `except Exception`).
                        if hasattr(e, "response") and hasattr(e.response, "body"):
                            body = e.response.body
                            if isinstance(body, dict):
                                context_info = body.get("context_info", {})
                                conflicts = context_info.get("conflicts", {})
                                # Conflicts can be a dict or a list of dicts
                                # (same handling as create_folder).
                                file_id = None
                                if isinstance(conflicts, dict):
                                    file_id = conflicts.get("id")
                                elif isinstance(conflicts, list) and conflicts:
                                    file_id = conflicts[0].get("id")
                                if file_id:
                                    uploaded_file = client.files.get_file_by_id(file_id)
                                    print(
                                        f" File '{name}' already exists (ID: {file_id})"
                                    )
                                    return uploaded_file
                    except Exception:
                        pass
                    # If we can't get the file ID, just skip with a message
                    print(f" File '{name}' already exists, skipping upload")
                    # Return a dummy file object - the script will continue
                    from box_sdk_gen.schemas import File
                    return File(id="existing", name=name, type="file")
                raise
        else:
            # Large file - use chunked upload
            uploaded_file = client.chunked_uploads.upload_big_file(
                file=file_io,
                file_name=name,
                file_size=file_size,
                parent_folder_id=parent_id,
            )
        file_id = uploaded_file.id if hasattr(uploaded_file, "id") else "unknown"
        print(f" ✓ Uploaded '{name}' with ID: {file_id}")
        return uploaded_file
    except Exception as e:
        print(f" ✗ Error uploading '{name}': {e}")
        raise
def share_folder(
    client: BoxClient, folder_id: str, user_id: str, role: str = "viewer"
) -> None:
    """Share a folder with a user by creating a collaboration.

    Best-effort: failures (other than "already a collaborator") are printed
    with manual-fix instructions rather than raised.

    Args:
        client: Authenticated Box client (must have access to the folder).
        folder_id: Folder to share.
        user_id: Box user to grant access to.
        role: Collaboration role name; unknown names fall back to "viewer".
    """
    print(f" Sharing folder {folder_id} with user {user_id} as {role}...")
    try:
        from box_sdk_gen import (
            CreateCollaborationAccessibleBy,
            CreateCollaborationAccessibleByTypeField,
            CreateCollaborationItem,
            CreateCollaborationItemTypeField,
            CreateCollaborationRole,
        )
        # Map role string to CreateCollaborationRole enum
        role_map = {
            "viewer": CreateCollaborationRole.VIEWER,
            "editor": CreateCollaborationRole.EDITOR,
            "co-owner": CreateCollaborationRole.CO_OWNER,
            "previewer": CreateCollaborationRole.PREVIEWER,
            "uploader": CreateCollaborationRole.UPLOADER,
            "previewer-uploader": CreateCollaborationRole.PREVIEWER_UPLOADER,
            "viewer-uploader": CreateCollaborationRole.VIEWER_UPLOADER,
        }
        collaboration_role = role_map.get(role.lower(), CreateCollaborationRole.VIEWER)
        # Create the collaboration
        collaboration = client.user_collaborations.create_collaboration(
            item=CreateCollaborationItem(
                type=CreateCollaborationItemTypeField.FOLDER,
                id=folder_id,
            ),
            accessible_by=CreateCollaborationAccessibleBy(
                type=CreateCollaborationAccessibleByTypeField.USER,
                id=user_id,
            ),
            role=collaboration_role,
        )
        print(
            f" ✓ Successfully shared folder {folder_id} with user {user_id} as {role}"
        )
        if hasattr(collaboration, "id"):
            print(f" Collaboration ID: {collaboration.id}")
    except Exception as e:
        error_msg = str(e)
        error_code = None
        if hasattr(e, "code"):
            error_code = e.code
        elif hasattr(e, "response") and hasattr(e.response, "status_code"):
            error_code = str(e.response.status_code)
        # Check if collaboration already exists (409 conflict or user_already_collaborator)
        if (
            error_code == "409"
            or "409" in error_msg
            or "already exists" in error_msg.lower()
            or "user_already_collaborator" in error_msg.lower()
            or getattr(e, "code", None) == "user_already_collaborator"
        ):
            print(
                f" Collaboration already exists for folder {folder_id} and user {user_id}"
            )
        else:
            # Unexpected failure: print manual remediation steps, do not raise.
            print(f" ✗ Warning: Could not share folder: {e}")
            print(f" Error code: {error_code}")
            print(" You may need to share this folder manually via Box UI:")
            print(f" - Folder ID: {folder_id}")
            print(f" - User ID: {user_id}")
            print(f" - Role: {role}")
def remove_user_access(client: BoxClient, folder_id: str, user_id: str) -> None:
    """Remove a user's access to a folder by deleting their collaboration.

    Lists the folder's collaborations, finds the one whose ``accessible_by``
    matches ``user_id``, and deletes it. Best-effort: failures are printed,
    not raised.

    Args:
        client: Authenticated Box client.
        folder_id: Folder to remove access from.
        user_id: Box user whose collaboration should be deleted.
    """
    print(f" Removing access for user {user_id} from folder {folder_id}...")
    try:
        # First, get all collaborations for the folder
        collaborations_response = client.list_collaborations.get_folder_collaborations(
            folder_id
        )
        # Find the collaboration for this user
        collaboration_to_delete = None
        if hasattr(collaborations_response, "entries"):
            for collab in collaborations_response.entries:
                accessible_by = getattr(collab, "accessible_by", None)
                if accessible_by:
                    collab_user_id = getattr(accessible_by, "id", None)
                    if collab_user_id == user_id:
                        collaboration_to_delete = collab
                        break
        if collaboration_to_delete:
            # Delete the collaboration
            collaboration_id = getattr(collaboration_to_delete, "id", None)
            if collaboration_id:
                client.user_collaborations.delete_collaboration_by_id(collaboration_id)
                print(
                    f" ✓ Removed access for user {user_id} from folder {folder_id}"
                )
            else:
                print(" ⚠️ Found collaboration but no ID available")
        else:
            print(
                f" User {user_id} does not have explicit access to folder {folder_id}"
            )
    except Exception as e:
        # Fix: removed dead `str(e)` statement whose result was discarded.
        error_code = None
        if hasattr(e, "code"):
            error_code = e.code
        elif hasattr(e, "response") and hasattr(e.response, "status_code"):
            error_code = str(e.response.status_code)
        print(f" ✗ Warning: Could not remove user access: {e}")
        print(f" Error code: {error_code}")
        print(" You may need to remove access manually via Box UI")
def create_file_structure(
    client: BoxClient, parent_id: str, file_ids: list[int]
) -> None:
    """Create the test files ``file_<id>.txt`` inside a folder.

    File IDs present in SPECIAL_FILE_ID_TO_CONTENT_MAP get their predefined
    content; all others get a simple stub body.
    """
    # Import here to avoid module-level import after non-import statements (E402)
    try:
        from tests.daily.connectors.box.consts_and_utils import (
            SPECIAL_FILE_ID_TO_CONTENT_MAP as _SPECIAL_MAP,
        )
    except Exception as import_err:
        raise ImportError(
            f"Failed to import SPECIAL_FILE_ID_TO_CONTENT_MAP from consts_and_utils: {import_err}. "
            "This is required for special test file content. Please fix the import error."
        ) from import_err

    for fid in file_ids:
        body = _SPECIAL_MAP.get(fid, f"This is file {fid}")
        upload_file(client, f"file_{fid}.txt", body, parent_id)
def setup_box_test_environment() -> dict[str, Any]:
    """Set up the complete Box test environment.

    Creates the folder/file structure used by the daily Box connector tests,
    configures collaborations (sharing) between admin and test users, makes
    one folder public via a shared link, and returns the IDs of everything
    created.

    Returns:
        Dict with keys ``admin_user_id``, ``test_user_ids`` (user key ->
        Box user ID for each configured test user), and ``folder_ids``
        (constant name -> created Box folder ID).
    """
    # Import test constants here to avoid E402 and ensure sys.path has been adjusted
    from tests.daily.connectors.box.consts_and_utils import (
        ADMIN_FILE_IDS,
        ADMIN_FOLDER_3_FILE_IDS,
        FOLDER_1_1_FILE_IDS,
        FOLDER_1_2_FILE_IDS,
        FOLDER_1_FILE_IDS,
        FOLDER_2_1_FILE_IDS,
        FOLDER_2_2_FILE_IDS,
        FOLDER_2_FILE_IDS,
        FOLDER_3_FILE_IDS,
        SECTIONS_FILE_IDS,
        TEST_USER_1_FILE_IDS,
        TEST_USER_2_FILE_IDS,
        TEST_USER_3_FILE_IDS,
    )
    print("=" * 80)
    print("Setting up Box test environment...")
    print("=" * 80)
    # Load environment variables
    load_env_vars()
    # Get parent folder ID from env, default to root ("0")
    parent_folder_id = os.environ.get("BOX_TEST_PARENT_FOLDER_ID", "0")
    if parent_folder_id == "0":
        print("\n⚠️ Creating test structure in ROOT folder (ID: 0)")
        print(
            " To use a different folder, set BOX_TEST_PARENT_FOLDER_ID in .test.env"
        )
    else:
        print(f"\nCreating test structure in folder ID: {parent_folder_id}")
    # Get admin client
    admin_client, admin_user_id = get_box_client("admin")
    print(f"\nAdmin user ID: {admin_user_id}")
    # Get test user IDs (if configured); missing users are skipped gracefully.
    test_user_ids = {}
    for user_key in ["test_user_1", "test_user_2", "test_user_3"]:
        try:
            _, user_id = get_box_client(user_key)
            test_user_ids[user_key] = user_id
            print(f"{user_key} ID: {user_id}")
        except Exception as e:
            print(f"{user_key} not configured: {e}")
    # Store created folder IDs
    folder_ids = {}
    print("\n" + "=" * 80)
    print("Creating folder structure...")
    print("=" * 80)
    # Create root-level files
    print("\nCreating root-level files...")
    create_file_structure(admin_client, parent_folder_id, ADMIN_FILE_IDS)
    create_file_structure(admin_client, parent_folder_id, TEST_USER_1_FILE_IDS)
    if test_user_ids.get("test_user_2"):
        create_file_structure(admin_client, parent_folder_id, TEST_USER_2_FILE_IDS)
    if test_user_ids.get("test_user_3"):
        create_file_structure(admin_client, parent_folder_id, TEST_USER_3_FILE_IDS)
    # Create Folder 1 and subfolders
    print("\nCreating Folder 1 structure...")
    folder_1 = create_folder(admin_client, "Folder 1", parent_folder_id)
    folder_ids["FOLDER_1_ID"] = folder_1.id
    create_file_structure(admin_client, folder_1.id, FOLDER_1_FILE_IDS)
    folder_1_1 = create_folder(admin_client, "Folder 1-1", folder_1.id)
    folder_ids["FOLDER_1_1_ID"] = folder_1_1.id
    create_file_structure(admin_client, folder_1_1.id, FOLDER_1_1_FILE_IDS)
    folder_1_2 = create_folder(admin_client, "Folder 1-2", folder_1.id)
    folder_ids["FOLDER_1_2_ID"] = folder_1_2.id
    create_file_structure(admin_client, folder_1_2.id, FOLDER_1_2_FILE_IDS)
    # Create Folder 2 and subfolders
    print("\nCreating Folder 2 structure...")
    folder_2 = create_folder(admin_client, "Folder 2", parent_folder_id)
    folder_ids["FOLDER_2_ID"] = folder_2.id
    create_file_structure(admin_client, folder_2.id, FOLDER_2_FILE_IDS)
    folder_2_1 = create_folder(admin_client, "Folder 2-1", folder_2.id)
    folder_ids["FOLDER_2_1_ID"] = folder_2_1.id
    create_file_structure(admin_client, folder_2_1.id, FOLDER_2_1_FILE_IDS)
    folder_2_2 = create_folder(admin_client, "Folder 2-2", folder_2.id)
    folder_ids["FOLDER_2_2_ID"] = folder_2_2.id
    create_file_structure(admin_client, folder_2_2.id, FOLDER_2_2_FILE_IDS)
    # Create Folder 3
    print("\nCreating Folder 3...")
    folder_3 = create_folder(admin_client, "Folder 3", parent_folder_id)
    folder_ids["FOLDER_3_ID"] = folder_3.id
    create_file_structure(admin_client, folder_3.id, FOLDER_3_FILE_IDS)
    # Create Admin's Folder 3 (separate folder for sharing test)
    print("\nCreating Admin's Folder 3...")
    admin_folder_3 = create_folder(admin_client, "Admin Folder 3", parent_folder_id)
    folder_ids["ADMIN_FOLDER_3_ID"] = admin_folder_3.id
    create_file_structure(admin_client, admin_folder_3.id, ADMIN_FOLDER_3_FILE_IDS)
    # Create Sections folder
    print("\nCreating Sections folder...")
    sections_folder = create_folder(admin_client, "Sections Folder", parent_folder_id)
    folder_ids["SECTIONS_FOLDER_ID"] = sections_folder.id
    create_file_structure(admin_client, sections_folder.id, SECTIONS_FILE_IDS)
    # Set up sharing/permissions
    print("\n" + "=" * 80)
    print("Setting up sharing and permissions...")
    print("=" * 80)
    if test_user_ids.get("test_user_1"):
        user_1_id = test_user_ids["test_user_1"]
        print(f"\nSetting up permissions for test_user_1 ({user_1_id})...")
        # Share Folder 1 with test_user_1
        share_folder(admin_client, folder_1.id, user_1_id, "viewer")
        # Share Admin's Folder 3 with test_user_1
        share_folder(admin_client, admin_folder_3.id, user_1_id, "viewer")
        # Note: Individual file sharing would need to be done separately if needed
    # Set up permissions for test_user_3
    if test_user_ids.get("test_user_3"):
        user_3_id = test_user_ids["test_user_3"]
        print(f"\nSetting up permissions for test_user_3 ({user_3_id})...")
        # Share Folder 1-2 (public folder) with test_user_3 so they can access public files
        share_folder(admin_client, folder_1_2.id, user_3_id, "viewer")
        # Note: test_user_3's own files are in the root, which they should have access to
        # via their own account, but we don't need to explicitly share those
    # Explicitly restrict test_user_3 from ADMIN_FOLDER_3
    # This ensures the test_restricted_access test is useful
    if test_user_ids.get("test_user_3"):
        user_3_id = test_user_ids["test_user_3"]
        print(f"\nRestricting test_user_3 ({user_3_id}) from ADMIN_FOLDER_3...")
        remove_user_access(admin_client, admin_folder_3.id, user_3_id)
    if test_user_ids.get("test_user_2"):
        user_2_id = test_user_ids["test_user_2"]
        print(f"\nSetting up permissions for test_user_2 ({user_2_id})...")
        # Share Folder 1 with test_user_2
        share_folder(admin_client, folder_1.id, user_2_id, "viewer")
        # Share Folder 2-1 with test_user_2
        share_folder(admin_client, folder_2_1.id, user_2_id, "viewer")
    # Make Folder 1-2 public (if needed) — best-effort, failure is non-fatal.
    print("\nMaking Folder 1-2 public...")
    try:
        # Try to update folder shared link settings
        from box_sdk_gen.managers.folders import (
            UpdateFolderByIdSharedLink,
            UpdateFolderByIdSharedLinkAccessField,
        )
        admin_client.folders.update_folder_by_id(
            folder_id=folder_1_2.id,
            shared_link=UpdateFolderByIdSharedLink(
                access=UpdateFolderByIdSharedLinkAccessField.OPEN
            ),
        )
        print(" ✓ Folder 1-2 is now public")
    except Exception as e:
        print(f" ✗ Warning: Could not make folder public: {e}")
        print(" (This is optional - folder can be shared manually via Box UI)")
        print(f" To make it public manually: Folder ID {folder_1_2.id}")
    # Compile results
    results = {
        "admin_user_id": admin_user_id,
        "test_user_ids": test_user_ids,
        "folder_ids": folder_ids,
    }
    print("\n" + "=" * 80)
    print("Setup complete!")
    print("=" * 80)
    print("\nCreated folder IDs:")
    for key, value in folder_ids.items():
        print(f" {key}: {value}")
    print(f"\nAdmin User ID: {admin_user_id}")
    if test_user_ids:
        print("\nTest User IDs:")
        for key, value in test_user_ids.items():
            print(f" {key}: {value}")
    return results
def update_consts_file(results: dict[str, Any]) -> None:
    """
    Rewrite tests/daily/connectors/box/consts_and_utils.py with real IDs.

    Replaces the placeholder folder IDs, admin user ID, and test user IDs in
    the constants file with the values produced by setup_box_test_environment.

    Args:
        results: Output of setup_box_test_environment, containing the keys
            "folder_ids", "admin_user_id", and "test_user_ids".
    """
    consts_file = (
        backend_path / "tests" / "daily" / "connectors" / "box" / "consts_and_utils.py"
    )
    print("\n" + "=" * 80)
    print("Updating consts_and_utils.py...")
    print("=" * 80)
    with open(consts_file, "r") as f:
        content = f.read()
    # Update folder IDs. A single regex per constant handles any current
    # value (placeholder or stale ID from a previous run), so there is no
    # need to enumerate specific placeholder strings.
    folder_ids = results["folder_ids"]
    for key, value in folder_ids.items():
        # re.escape keeps the constant name literal inside the pattern.
        pattern = f'{re.escape(key)} = "[^"]*"'
        new_content = re.sub(pattern, f'{key} = "{value}"', content)
        if new_content != content:
            content = new_content
            print(f" Updated {key} = {value}")
    # Update the admin user ID
    admin_user_id = results["admin_user_id"]
    content = re.sub(
        r'ADMIN_USER_ID = "[^"]*"',
        f'ADMIN_USER_ID = "{admin_user_id}"',
        content,
    )
    print(f" Updated ADMIN_USER_ID = {admin_user_id}")
    # Update test user IDs (only the users that were actually created)
    test_user_ids = results["test_user_ids"]
    user_id_map = {
        "test_user_1": "TEST_USER_1_ID",
        "test_user_2": "TEST_USER_2_ID",
        "test_user_3": "TEST_USER_3_ID",
    }
    for user_key, const_name in user_id_map.items():
        if user_key in test_user_ids:
            content = re.sub(
                f'{const_name} = "[^"]*"',
                f'{const_name} = "{test_user_ids[user_key]}"',
                content,
            )
            print(f" Updated {const_name} = {test_user_ids[user_key]}")
    with open(consts_file, "w") as f:
        f.write(content)
    print("\n✓ Updated consts_and_utils.py with actual IDs")
if __name__ == "__main__":
    # Script entry point: build the Box test environment, then write the
    # generated IDs back into the test constants file.
    try:
        setup_results = setup_box_test_environment()
        update_consts_file(setup_results)
        banner = "=" * 80
        print("\n" + banner)
        print("✅ Box test environment setup complete!")
        print(banner)
        print("\nYou can now run the tests with:")
        print(" pytest -v -s backend/tests/daily/connectors/box/")
    except Exception as exc:
        import traceback

        print(f"\n❌ Error setting up test environment: {exc}")
        traceback.print_exc()
        sys.exit(1)

View File

@@ -0,0 +1,287 @@
# Box Connector Test Suite
## Overview
The Box connector test suite contains comprehensive integration tests for the Box connector. These tests validate that the connector properly:
- Authenticates with the Box API using JWT authentication
- Retrieves files and folders from Box
- Handles folder scoping and filtering
- Generates properly structured Onyx `Document` objects
- Handles batch processing and checkpointing
- Manages permissions and access control
- Supports nested folder traversal
- Handles error cases gracefully
## Prerequisites
1. **Box Enterprise Account**: You need a Box enterprise account with admin access
2. **Box JWT Application**: A Box application configured with JWT authentication
3. **Test Users**: At least one test user (test_user_1) is required for permission tests
4. **Python Environment**: Backend dependencies installed (see `backend/requirements`)
5. **Read Permissions**: The Box application must have read permissions to download files
## Setting Up Box JWT Application
### 1. Create a Box Application
1. Go to the [Box Developer Console](https://developer.box.com/)
2. Navigate to **My Apps** → **Create New App**
3. Select **Custom App** → **Server Authentication (with JWT)**
4. Give your app a name (e.g., "Onyx Box Connector Tests")
### 2. Configure Application Settings
1. In your app settings, go to the **Configuration** tab
2. **Important**: Enable **Read** permissions in the application scopes
- This is required for the connector to download files
- **Note**: For running the test setup script (`setup_box_test_env.py`), you will also need **Write** permissions to create folders and files. However, for normal connector operation (indexing files), only **Read** permissions are required.
3. Note your **Client ID** and **Client Secret** (you'll need these later)
### 3. Generate and Download JWT Configuration
1. In the **Configuration** tab, scroll to **Add and Manage Public Keys**
2. Click **Generate a Public/Private Keypair**
3. Download the **JSON configuration file** - this is your `config.json`
- This file contains all the necessary authentication information
- **Keep this file secure** - it contains sensitive credentials
### 4. Set Up User Access
1. In your Box enterprise admin console, go to **Users and Groups**
2. Create test users (at least `test_user_1`, optionally `test_user_2` and `test_user_3`)
3. Note the **User IDs** for each test user (you'll need these for impersonation)
### 5. Authorize the Application
1. In the Box Developer Console, go to your app's **Authorization** tab
2. Click **Review and Submit** to submit your app for authorization
3. Once authorized, you can use the JWT authentication
## Environment Variables
The test suite requires the following environment variables in `backend/.test.env`:
### Required (Admin User)
- **`BOX_JWT_CONFIG_JSON_STR`**: The JWT configuration JSON string
- This is the content of the `config.json` file you downloaded
- It should be a JSON string (may need to be escaped for the .env file)
- Example format: `{"boxAppSettings": {...}, "enterpriseID": "..."}`
- **`BOX_PRIMARY_ADMIN_USER_ID`**: The Box user ID of the admin user
- This is used for user impersonation
- Find this in the Box admin console or via the Box API
### Optional (Test Users)
For full test coverage, you can also configure test user IDs for impersonation:
- **`BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_1`**: User ID for test_user_1 (required for permission tests)
- **`BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_2`**: User ID for test_user_2 (optional)
- **`BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_3`**: User ID for test_user_3 (optional)
**Note**: The same JWT config (`BOX_JWT_CONFIG_JSON_STR`) is used for all users. Box JWT authentication supports user impersonation, so you only need to provide different user IDs. Each user ID is used to impersonate that user when making API calls.
### Example `.test.env` File
```bash
# Box JWT Configuration (same config used for all users via impersonation)
BOX_JWT_CONFIG_JSON_STR="{\"boxAppSettings\":{...},\"enterpriseID\":\"...\"}"
# User IDs for impersonation
BOX_PRIMARY_ADMIN_USER_ID="12345678"
# Test User 1 (required for permission tests)
BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_1="12345679"
# Test User 2 (optional)
BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_2=""
# Test User 3 (optional)
BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_3=""
```
## Setting Up the Test Environment
### Automated Setup (Recommended)
We provide a script that automatically creates the required folder structure, test files, and permissions:
```bash
cd backend
python scripts/setup_box_test_env.py
```
This script will:
1. Read credentials from `.test.env`
2. Create the required folder structure (Folder 1, Folder 2, Folder 3, etc.)
3. Create test files with proper naming (`file_0.txt`, `file_1.txt`, etc.)
4. Set up sharing and permissions between users
5. Update `consts_and_utils.py` with actual folder and user IDs
**Note**: The script requires write permissions in your Box account. Make sure your JWT application has write access enabled.
### Manual Setup
If you prefer to set up manually, you'll need to:
1. Create the following folder structure in your Box account:
```
Root/
├── file_0.txt through file_4.txt (admin files)
├── file_5.txt through file_9.txt (test_user_1 files)
├── Folder 1/
│ ├── file_25.txt through file_29.txt
│ ├── Folder 1-1/
│ │ └── file_30.txt through file_34.txt
│ └── Folder 1-2/ (public folder)
│ └── file_35.txt through file_39.txt
├── Folder 2/
│ ├── file_45.txt through file_49.txt
│ ├── Folder 2-1/
│ │ └── file_50.txt through file_54.txt
│ └── Folder 2-2/
│ └── file_55.txt through file_59.txt
├── Folder 3/
│ └── file_62.txt through file_64.txt
└── Sections Folder/
└── file_61.txt (special content)
```
2. Create files with naming pattern: `file_{id}.txt` with content: `This is file {id}`
3. Set up sharing permissions as defined in `consts_and_utils.py` (see `ACCESS_MAPPING`)
4. Update `consts_and_utils.py` with actual folder IDs and user IDs
## Running the Tests
### Prerequisites
Before running tests, ensure:
1. Your `.test.env` file is configured with valid credentials
2. The test environment has been set up (either via script or manually)
3. You're in the `backend/` directory
### Run All Box Connector Tests
```bash
cd backend
pytest -v -s tests/daily/connectors/box/
```
### Run Specific Test Files
```bash
# Run basic connector tests
pytest -v -s tests/daily/connectors/box/test_basic.py
# Run permission tests
pytest -v -s tests/daily/connectors/box/test_permissions.py
# Run permission sync tests
pytest -v -s tests/daily/connectors/box/test_perm_sync.py
```
### Run Specific Test Functions
```bash
# Run a specific test
pytest -v -s tests/daily/connectors/box/test_basic.py::test_include_all_files
# Run tests matching a pattern
pytest -v -s tests/daily/connectors/box/ -k "permission"
```
### Run Tests Without Skipped Tests
Some tests are marked with `@pytest.mark.skip` if they require additional setup:
```bash
# Run all tests, excluding skipped ones (default behavior)
pytest -v -s tests/daily/connectors/box/
# To run skipped tests, you need to remove the @pytest.mark.skip decorator from the test functions
# or use pytest's marker filtering (skipped tests are not included by default)
```
## Test Structure
### Test Files
- **`test_basic.py`**: Basic connector functionality tests
- Folder traversal
- File retrieval
- Folder scoping
- Checkpointing
- Size thresholds
- **`test_permissions.py`**: Permission and access control tests
- User access mapping
- Public file access
- Restricted access
- Collaboration permissions
- Shared folders
- **`test_perm_sync.py`**: Permission synchronization tests
- Permission extraction
- Access control validation
- **`test_box_basic.py`**: Basic initialization tests (currently skipped)
### Test Constants
The `consts_and_utils.py` file contains:
- File ID ranges for different test scenarios
- Folder IDs (should match actual Box folder IDs)
- User IDs (should match actual Box user IDs)
- Access mapping (defines which users can access which files)
- Helper functions for assertions and document loading
**Important**: After running the setup script or manual setup, the folder IDs and user IDs in `consts_and_utils.py` should be updated with actual values from your Box account.
## Troubleshooting
### Authentication Errors
- **"Failed to initialize Box JWT authentication"**
- Verify your `BOX_JWT_CONFIG_JSON_STR` is correctly formatted
- Ensure the JSON string is properly escaped in `.test.env`
- Check that the JWT application is authorized
- **"User ID missing"**
- Verify `BOX_PRIMARY_ADMIN_USER_ID` is set correctly
- Ensure the user ID exists in your Box enterprise
### Permission Errors
- **"Insufficient permissions"**
- Ensure your Box JWT application has **Write** permissions enabled
- Check that the application is authorized in your Box enterprise
- Verify user impersonation is working correctly
### File Not Found Errors
- **"File not found" or "Folder not found"**
- Run the setup script to create the test environment
- Verify folder IDs in `consts_and_utils.py` match actual Box folder IDs
- Check that files were created with the correct naming pattern
### Test Failures
- **Tests fail with "expected file IDs not found"**
- Ensure the test environment was set up correctly
- Verify file naming matches the pattern: `file_{id}.txt`
- Check that file content matches: `This is file {id}`
- Run the setup script again to recreate the environment
## Additional Resources
- [Box Developer Documentation](https://developer.box.com/)
- [Box Python SDK Documentation](https://github.com/box/box-python-sdk-gen)
- [Box JWT Authentication Guide](https://developer.box.com/guides/authentication/jwt/jwt-setup/)
## Notes
- The test environment creates a significant number of files and folders. Consider using a dedicated Box enterprise or test account.
- Some tests require multiple users for full coverage. At minimum, `test_user_1` is required for permission tests.
- The setup script handles existing folders and files gracefully (detects and reuses them when possible), but if you run it multiple times without cleaning up, it may create duplicate files with the same names in the same folders. For a clean test environment, delete the test folders between runs or use a fresh test account.
- File IDs in the tests are placeholders. The actual file IDs in Box will be different, but the connector uses file names for matching.

View File

@@ -0,0 +1 @@
"""Tests for the Box connector."""

View File

@@ -0,0 +1,139 @@
"""Test fixtures for Box connector tests."""
import json
import os
import resource
from collections.abc import Callable
import pytest
from onyx.connectors.box.box_kv import BOX_AUTHENTICATION_METHOD_UPLOADED
from onyx.connectors.box.box_kv import DB_CREDENTIALS_AUTHENTICATION_METHOD
from onyx.connectors.box.box_kv import DB_CREDENTIALS_DICT_BOX_JWT_CONFIG
from onyx.connectors.box.box_kv import DB_CREDENTIALS_PRIMARY_ADMIN_USER_ID
from onyx.connectors.box.connector import BoxConnector
from onyx.connectors.box.utils import parse_box_jwt_config
from tests.load_env_vars import load_env_vars
# Load environment variables at the module level
load_env_vars()
# Maps each logical test-user key to the environment variable that holds that
# user's Box user ID (the same JWT config is shared; only the impersonated
# user ID differs per user).
_USER_TO_PRIMARY_ADMIN_USER_ID_MAP = {
    "admin": "BOX_PRIMARY_ADMIN_USER_ID",
    "test_user_1": "BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_1",
    "test_user_2": "BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_2",
    "test_user_3": "BOX_PRIMARY_ADMIN_USER_ID_TEST_USER_3",
}
def get_credentials_from_env(user_key: str) -> dict:
    """
    Build Box JWT credentials from environment variables.

    The same JWT config serves every user; impersonation is controlled by the
    per-user Box user ID looked up via _USER_TO_PRIMARY_ADMIN_USER_ID_MAP.

    Args:
        user_key (str): Key identifying the user (e.g. "admin", "test_user_1")

    Returns:
        dict: Credentials with the JWT config and, when available, the
            primary admin user ID used for impersonation.
    """
    # Parse and re-serialize the shared JWT config so it is normalized JSON.
    jwt_config_string = json.dumps(
        parse_box_jwt_config(os.environ["BOX_JWT_CONFIG_JSON_STR"])
    )
    creds: dict = {
        DB_CREDENTIALS_DICT_BOX_JWT_CONFIG: jwt_config_string,
        DB_CREDENTIALS_AUTHENTICATION_METHOD: BOX_AUTHENTICATION_METHOD_UPLOADED,
    }
    # Attach the impersonation user ID when one is configured for this key.
    env_var_name = _USER_TO_PRIMARY_ADMIN_USER_ID_MAP.get(user_key)
    user_id = os.environ.get(env_var_name) if env_var_name else None
    if user_id:
        creds[DB_CREDENTIALS_PRIMARY_ADMIN_USER_ID] = user_id
    return creds
@pytest.fixture
def box_jwt_connector_factory() -> Callable[..., BoxConnector]:
    """
    Fixture producing a factory for JWT-authenticated Box connectors.

    Mirrors google_drive_service_acct_connector_factory, but for Box JWT auth.

    Note: with include_all_files=True the factory scopes indexing to the test
    parent folder (BOX_TEST_PARENT_FOLDER_ID) instead of the Box account root,
    so tests never crawl every file in the account.
    """

    def _make_connector(
        user_key: str = "admin",
        include_all_files: bool = False,
        folder_ids: str | None = None,
    ) -> BoxConnector:
        print(f"Creating BoxConnector with JWT credentials for user: {user_key}")
        scoped_parent = os.environ.get("BOX_TEST_PARENT_FOLDER_ID")
        if include_all_files and scoped_parent:
            # Swap include_all_files for an explicit scope on the test parent
            # folder so the entire account is never indexed during tests.
            print(
                f"Scoping include_all_files to test parent folder: {scoped_parent}"
            )
            connector = BoxConnector(
                include_all_files=False,
                folder_ids=scoped_parent,
            )
        else:
            connector = BoxConnector(
                include_all_files=include_all_files,
                folder_ids=folder_ids,
            )
        connector.load_credentials(get_credentials_from_env(user_key))
        return connector

    return _make_connector
@pytest.fixture
def box_connector() -> BoxConnector:
    """Provide a plain Box connector (all files, no folder scoping) for tests."""
    connector = BoxConnector(include_all_files=True, folder_ids=None)
    return connector
@pytest.fixture(scope="session", autouse=True)
def set_resource_limits() -> None:
    """
    Raise the open-file soft limit for the test session when it is too low.

    As with the Google Drive tests, the SDK may be aggressive about using
    file descriptors, so we bump RLIMIT_NOFILE up to a comfortable floor.
    """
    MINIMUM_SOFT_LIMIT = 2048
    soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
    # Never exceed the hard limit — an unprivileged process cannot raise it.
    target = min(MINIMUM_SOFT_LIMIT, hard_limit)
    print(
        f"Open file limit: soft={soft_limit} hard={hard_limit} "
        f"soft_required={MINIMUM_SOFT_LIMIT}"
    )
    if soft_limit < target:
        print(f"Raising open file limit: {soft_limit} -> {target}")
        resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard_limit))
        soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
        print(f"New open file limit: soft={soft_limit} hard={hard_limit}")
    return

View File

@@ -0,0 +1,300 @@
import time
from collections.abc import Sequence
from onyx.connectors.box.connector import BoxConnector
from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import Document
from onyx.connectors.models import TextSection
from tests.daily.connectors.utils import load_all_docs_from_checkpoint_connector
from tests.daily.connectors.utils import load_everything_from_checkpoint_connector
# File ID ranges for different test scenarios
# These should match actual file IDs in the test Box account
# NOTE: these are logical test IDs baked into file names ("file_{id}.txt"),
# not Box's own numeric file IDs.
ALL_FILES = list(range(0, 60))
ROOT_FOLDER_FILES = list(range(0, 10))
ADMIN_FILE_IDS = list(range(0, 5))
ADMIN_FOLDER_3_FILE_IDS = list(range(65, 70)) # This folder is shared with test_user_1
TEST_USER_1_FILE_IDS = list(range(5, 10))
TEST_USER_2_FILE_IDS = list(range(10, 15))
TEST_USER_3_FILE_IDS = list(range(15, 20))
FOLDER_1_FILE_IDS = list(range(25, 30))
FOLDER_1_1_FILE_IDS = list(range(30, 35))
FOLDER_1_2_FILE_IDS = list(range(35, 40)) # This folder is public
FOLDER_2_FILE_IDS = list(range(45, 50))
FOLDER_2_1_FILE_IDS = list(range(50, 55))
FOLDER_2_2_FILE_IDS = list(range(55, 60))
SECTIONS_FILE_IDS = [61]
FOLDER_3_FILE_IDS = list(range(62, 65))
DOWNLOAD_REVOKED_FILE_ID = 21
# "Public" files are the public folder's contents plus two individually
# shared files.
PUBLIC_FOLDER_RANGE = FOLDER_1_2_FILE_IDS
PUBLIC_FILE_IDS = list(range(55, 57))
PUBLIC_RANGE = PUBLIC_FOLDER_RANGE + PUBLIC_FILE_IDS
# Box folder IDs (these should match actual folder IDs in test account)
# The setup script (setup_box_test_env.py) rewrites these with real values.
FOLDER_1_ID = "360287594085" # Replace with actual folder ID
FOLDER_1_1_ID = "360286151062" # Replace with actual folder ID
FOLDER_1_2_ID = "360285966218" # Replace with actual folder ID
FOLDER_2_ID = "360288222616" # Replace with actual folder ID
FOLDER_2_1_ID = "360287577597" # Replace with actual folder ID
FOLDER_2_2_ID = "360286012378" # Replace with actual folder ID
FOLDER_3_ID = "360285724765" # Replace with actual folder ID
ADMIN_FOLDER_3_ID = "360286714903" # Admin's Folder 3 (shared with test_user_1)
SECTIONS_FOLDER_ID = "360288138769" # Replace with actual folder ID
# Box folder URLs
FOLDER_1_URL = f"https://app.box.com/folder/{FOLDER_1_ID}"
FOLDER_1_1_URL = f"https://app.box.com/folder/{FOLDER_1_1_ID}"
FOLDER_1_2_URL = f"https://app.box.com/folder/{FOLDER_1_2_ID}"
FOLDER_2_URL = f"https://app.box.com/folder/{FOLDER_2_ID}"
FOLDER_2_1_URL = f"https://app.box.com/folder/{FOLDER_2_1_ID}"
FOLDER_2_2_URL = f"https://app.box.com/folder/{FOLDER_2_2_ID}"
FOLDER_3_URL = f"https://app.box.com/folder/{FOLDER_3_ID}"
ADMIN_FOLDER_3_URL = f"https://app.box.com/folder/{ADMIN_FOLDER_3_ID}"
SECTIONS_FOLDER_URL = f"https://app.box.com/folder/{SECTIONS_FOLDER_ID}"
RESTRICTED_ACCESS_FOLDER_ID = "123456797" # Replace with actual folder ID
RESTRICTED_ACCESS_FOLDER_URL = (
    f"https://app.box.com/folder/{RESTRICTED_ACCESS_FOLDER_ID}"
)
# User IDs (these should match actual Box user IDs)
ADMIN_USER_ID = "13089353657" # Replace with actual user ID
TEST_USER_1_ID = "48129700105" # Replace with actual user ID
TEST_USER_2_ID = "48129680809" # Replace with actual user ID
TEST_USER_3_ID = "48129580359" # Replace with actual user ID
# Dictionary for access permissions
# All users have access to their own files as well as public files
# Maps a Box user ID to the full list of logical file IDs that user is
# expected to be able to retrieve when indexing as that user.
ACCESS_MAPPING: dict[str, list[int]] = {
    # Admin has access to everything in the test parent folder
    ADMIN_USER_ID: (
        ADMIN_FILE_IDS
        + ADMIN_FOLDER_3_FILE_IDS
        + FOLDER_1_FILE_IDS
        + FOLDER_1_1_FILE_IDS
        + FOLDER_1_2_FILE_IDS
        + FOLDER_2_FILE_IDS
        + FOLDER_2_1_FILE_IDS
        + FOLDER_2_2_FILE_IDS
        + FOLDER_3_FILE_IDS
        + SECTIONS_FILE_IDS
        # Admin can also see all test user files in the test parent folder
        + TEST_USER_1_FILE_IDS
        + TEST_USER_2_FILE_IDS
        + TEST_USER_3_FILE_IDS
    ),
    TEST_USER_1_ID: (
        TEST_USER_1_FILE_IDS
        # This user has access to folder 1
        + FOLDER_1_FILE_IDS
        + FOLDER_1_1_FILE_IDS
        + FOLDER_1_2_FILE_IDS
        # This user has been given shared access to Admin's Folder 3
        + ADMIN_FOLDER_3_FILE_IDS
        # This user has been given shared access to files 0 and 1 in Admin's root
        + list(range(0, 2))
        # When scoped to test parent folder, user can see all subfolders
        # So they can also see FOLDER_3 and other folders
        + FOLDER_3_FILE_IDS
        + SECTIONS_FILE_IDS
        # They can also see files 2-4, 10-19 from other users' folders
        # because they have access to the test parent folder
        + list(range(2, 5))
        + list(range(10, 20))
        + FOLDER_2_FILE_IDS
        + FOLDER_2_1_FILE_IDS
        + FOLDER_2_2_FILE_IDS
    ),
    TEST_USER_2_ID: (
        TEST_USER_2_FILE_IDS
        # Group 1 includes this user, giving access to folder 1
        + FOLDER_1_FILE_IDS
        + FOLDER_1_1_FILE_IDS
        # This folder is public
        + FOLDER_1_2_FILE_IDS
        # Folder 2-1 is shared with this user
        + FOLDER_2_1_FILE_IDS
        # This user has been given shared access to files 45 and 46 in folder 2
        + list(range(45, 47))
    ),
    # When include_all_files=True is scoped to test parent folder,
    # all users can see all subfolders (Box behavior when user has access to parent folder)
    TEST_USER_3_ID: (
        TEST_USER_3_FILE_IDS
        + FOLDER_1_FILE_IDS
        + FOLDER_1_1_FILE_IDS
        + FOLDER_1_2_FILE_IDS
        + FOLDER_2_FILE_IDS
        + FOLDER_2_1_FILE_IDS
        + FOLDER_2_2_FILE_IDS
        + FOLDER_3_FILE_IDS
        + SECTIONS_FILE_IDS
        + ADMIN_FILE_IDS
        + TEST_USER_1_FILE_IDS
        + TEST_USER_2_FILE_IDS
        + ADMIN_FOLDER_3_FILE_IDS
    ),
}
# Files whose content deviates from the "This is file {id}" template
# (e.g. the sectioned document used by the sections tests).
SPECIAL_FILE_ID_TO_CONTENT_MAP: dict[int, str] = {
    61: (
        "Title\n"
        "This is a Box document with sections - "
        "Section 1\n"
        "Section 1 content - "
        "Sub-Section 1-1\n"
        "Sub-Section 1-1 content - "
        "Sub-Section 1-2\n"
        "Sub-Section 1-2 content - "
        "Section 2\n"
        "Section 2 content"
    ),
}
# Naming/content templates the setup script uses when creating test files.
file_name_template = "file_{}.txt"
file_text_template = "This is file {}"
# Each test family uses its own file-name prefix so concurrently created
# fixtures from different test types cannot interfere with each other.
_VALID_PREFIX = "file_"
def filter_invalid_prefixes(names: set[str]) -> set[str]:
    """Return only the names that carry the shared test-file prefix."""
    valid_names = set()
    for candidate in names:
        if candidate.startswith(_VALID_PREFIX):
            valid_names.add(candidate)
    return valid_names
def print_discrepancies(
    expected: set[str],
    retrieved: set[str],
) -> None:
    """When the sets differ, dump both plus the extra/missing entries."""
    if expected == retrieved:
        return
    print(sorted(expected))
    print(sorted(retrieved))
    print("Extra:")
    print(sorted(retrieved - expected))
    print("Missing:")
    print(sorted(expected - retrieved))
def _get_expected_file_content(file_id: int) -> str:
    """Return the expected text body for the file with the given logical ID."""
    # Special-cased files override the generic "This is file {id}" template.
    special_content = SPECIAL_FILE_ID_TO_CONTENT_MAP.get(file_id)
    if special_content is not None:
        return special_content
    return file_text_template.format(file_id)
def id_to_name(file_id: int) -> str:
    """Map a logical test-file ID to its expected file name."""
    expected_name = file_name_template.format(file_id)
    return expected_name
def assert_expected_docs_in_retrieved_docs(
    retrieved_docs: list[Document],
    expected_file_ids: Sequence[int],
) -> None:
    """
    Assert that the retrieved documents exactly match the expected file IDs.

    NOTE: This asserts for an exact match after filtering to valid prefixes.
    It filters retrieved docs to those with the valid prefix, then asserts
    that the expected file names and texts exactly match the filtered
    retrieved docs, and finally that each name carries its own text (so
    swapped contents cannot pass the set comparisons).

    Raises:
        AssertionError: on duplicate documents, on any name/text set
            mismatch, or on content attached to the wrong file name.
    """
    expected_file_names = {id_to_name(file_id) for file_id in expected_file_ids}
    expected_file_texts = {
        _get_expected_file_content(file_id) for file_id in expected_file_ids
    }
    # Sorting is only for stable, readable debug output below.
    retrieved_docs.sort(key=lambda x: x.semantic_identifier)
    for doc in retrieved_docs:
        print(f"retrieved doc: doc.semantic_identifier={doc.semantic_identifier}")
    # Filter out invalid prefixes to prevent different tests from interfering with each other
    valid_retrieved_docs = [
        doc
        for doc in retrieved_docs
        if doc.semantic_identifier.startswith(_VALID_PREFIX)
    ]
    # Check for duplicate semantic identifiers before building mapping
    # (a dict would silently drop duplicates, hiding connector bugs).
    semantic_identifiers = [doc.semantic_identifier for doc in valid_retrieved_docs]
    seen_identifiers = set()
    duplicates = []
    for identifier in semantic_identifiers:
        if identifier in seen_identifiers:
            duplicates.append(identifier)
        seen_identifiers.add(identifier)
    if duplicates:
        raise AssertionError(
            f"Found duplicate semantic_identifiers in retrieved docs: {duplicates}. "
            f"This indicates a bug in the connector that returns the same document multiple times."
        )
    # Create mapping from file name to file text to detect mismatches
    retrieved_name_to_text: dict[str, str] = {}
    for doc in valid_retrieved_docs:
        # Join all text sections the same way the expected content is built.
        text = " - ".join(
            [
                section.text
                for section in doc.sections
                if isinstance(section, TextSection) and section.text is not None
            ]
        )
        retrieved_name_to_text[doc.semantic_identifier] = text
    valid_retrieved_file_names = set(retrieved_name_to_text.keys())
    valid_retrieved_texts = set(retrieved_name_to_text.values())
    # Check file names
    print_discrepancies(
        expected=expected_file_names,
        retrieved=valid_retrieved_file_names,
    )
    assert expected_file_names == valid_retrieved_file_names
    # Check file texts
    print_discrepancies(
        expected=expected_file_texts,
        retrieved=valid_retrieved_texts,
    )
    assert expected_file_texts == valid_retrieved_texts
    # Verify that each file name has the correct corresponding text
    # (This prevents swapped or mismatched file content per name from passing)
    for file_id in expected_file_ids:
        expected_name = id_to_name(file_id)
        expected_text = _get_expected_file_content(file_id)
        if expected_name in retrieved_name_to_text:
            retrieved_text = retrieved_name_to_text[expected_name]
            assert retrieved_text == expected_text, (
                f"File {expected_name} has incorrect content. "
                f"Expected: {expected_text}, Got: {retrieved_text}"
            )
def load_all_docs(connector: BoxConnector) -> list[Document]:
    """Run the connector over all time and collect every successful Document."""
    end_time = time.time()
    return load_all_docs_from_checkpoint_connector(connector, 0, end_time)
def load_all_docs_with_failures(
    connector: BoxConnector,
) -> list[Document | ConnectorFailure]:
    """Run the connector over all time, keeping ConnectorFailures alongside Documents."""
    end_time = time.time()
    return load_everything_from_checkpoint_connector(connector, 0, end_time)

View File

@@ -0,0 +1,374 @@
"""Basic tests for Box connector."""
import time
from collections.abc import Callable
from unittest.mock import MagicMock
from unittest.mock import patch
import pytest
from onyx.connectors.box.connector import BoxConnector
from tests.daily.connectors.box.consts_and_utils import ADMIN_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import ADMIN_FOLDER_3_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import (
assert_expected_docs_in_retrieved_docs,
)
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_1_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_2_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_ID
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_URL
from tests.daily.connectors.box.consts_and_utils import FOLDER_2_1_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import FOLDER_2_2_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import FOLDER_2_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import FOLDER_2_ID
from tests.daily.connectors.box.consts_and_utils import FOLDER_2_URL
from tests.daily.connectors.box.consts_and_utils import FOLDER_3_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import load_all_docs
from tests.daily.connectors.box.consts_and_utils import SECTIONS_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import TEST_USER_1_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import TEST_USER_2_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import TEST_USER_3_FILE_IDS
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_include_all_files(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """include_all_files=True should index everything reachable from the root."""
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=True,
        folder_ids=None,
    )
    docs = load_all_docs(connector)
    # Every file in the test parent folder (admin + all users + all folders).
    expected = [
        *ADMIN_FILE_IDS,
        *ADMIN_FOLDER_3_FILE_IDS,
        *TEST_USER_1_FILE_IDS,
        *TEST_USER_2_FILE_IDS,
        *TEST_USER_3_FILE_IDS,
        *FOLDER_1_FILE_IDS,
        *FOLDER_1_1_FILE_IDS,
        *FOLDER_1_2_FILE_IDS,
        *FOLDER_2_FILE_IDS,
        *FOLDER_2_1_FILE_IDS,
        *FOLDER_2_2_FILE_IDS,
        *FOLDER_3_FILE_IDS,  # Folder 3 is part of the test structure
        *SECTIONS_FILE_IDS,
    ]
    assert_expected_docs_in_retrieved_docs(
        retrieved_docs=docs,
        expected_file_ids=expected,
    )
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_specific_folders(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Scoping by a comma-separated list of folder IDs should work."""
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=False,
        folder_ids=",".join((FOLDER_1_ID, FOLDER_2_ID)),
    )
    docs = load_all_docs(connector)
    # Folder 1 and Folder 2, recursively including all their subfolders.
    expected = [
        *FOLDER_1_FILE_IDS,
        *FOLDER_1_1_FILE_IDS,
        *FOLDER_1_2_FILE_IDS,
        *FOLDER_2_FILE_IDS,
        *FOLDER_2_1_FILE_IDS,
        *FOLDER_2_2_FILE_IDS,
    ]
    assert_expected_docs_in_retrieved_docs(
        retrieved_docs=docs,
        expected_file_ids=expected,
    )
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_folder_urls(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Folder IDs given as full Box URLs should be parsed into IDs correctly."""
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=False,
        folder_ids=",".join((FOLDER_1_URL, FOLDER_2_URL)),
    )
    docs = load_all_docs(connector)
    # Folder 1 and Folder 2, recursively including all their subfolders.
    expected = [
        *FOLDER_1_FILE_IDS,
        *FOLDER_1_1_FILE_IDS,
        *FOLDER_1_2_FILE_IDS,
        *FOLDER_2_FILE_IDS,
        *FOLDER_2_1_FILE_IDS,
        *FOLDER_2_2_FILE_IDS,
    ]
    assert_expected_docs_in_retrieved_docs(
        retrieved_docs=docs,
        expected_file_ids=expected,
    )
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_mixed_folder_ids_and_urls(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """A comma-separated mix of raw IDs and Box URLs is accepted."""
    sources = [FOLDER_1_ID, FOLDER_2_URL]
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=False,
        folder_ids=",".join(sources),
    )
    docs = load_all_docs(connector)
    # Both entries resolve to folder trees; recursion includes subfolders.
    expected = [
        *FOLDER_1_FILE_IDS,
        *FOLDER_1_1_FILE_IDS,
        *FOLDER_1_2_FILE_IDS,
        *FOLDER_2_FILE_IDS,
        *FOLDER_2_1_FILE_IDS,
        *FOLDER_2_2_FILE_IDS,
    ]
    assert_expected_docs_in_retrieved_docs(
        retrieved_docs=docs,
        expected_file_ids=expected,
    )
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_single_folder(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """A single folder ID indexes that folder and its subfolders."""
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=False,
        folder_ids=FOLDER_1_ID,
    )
    docs = load_all_docs(connector)
    # Folder 1 has two subfolders (1-1, 1-2); all three sets are expected.
    expected = [*FOLDER_1_FILE_IDS, *FOLDER_1_1_FILE_IDS, *FOLDER_1_2_FILE_IDS]
    assert_expected_docs_in_retrieved_docs(
        retrieved_docs=docs,
        expected_file_ids=expected,
    )
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_nested_folders(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Recursive traversal reaches deeply nested subfolders."""
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=False,
        folder_ids=FOLDER_2_ID,
    )
    docs = load_all_docs(connector)
    # Folder 2's nested subfolders (2-1 and 2-2) must also be indexed.
    expected = [*FOLDER_2_FILE_IDS, *FOLDER_2_1_FILE_IDS, *FOLDER_2_2_FILE_IDS]
    assert_expected_docs_in_retrieved_docs(
        retrieved_docs=docs,
        expected_file_ids=expected,
    )
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_size_threshold(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """
    Test that size_threshold is applied correctly.

    Since all test files are small (< 1KB), this verifies the threshold
    doesn't block all files rather than testing exclusion of large files.
    """
    from tests.daily.connectors.box.consts_and_utils import FOLDER_1_URL

    # Test with a reasonable size threshold (16KB) - test files are small text files
    connector_with_threshold = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=False,
        folder_ids=FOLDER_1_URL,
    )
    # Set the threshold before loading so it applies to the whole run.
    connector_with_threshold.size_threshold = 16384  # 16KB
    retrieved_docs = load_all_docs(connector_with_threshold)
    # All test files are < 1KB, so none should be filtered out by a 16KB
    # threshold. A single assertion suffices here: the original duplicated
    # this check via a derived name-set, which is non-empty iff the doc
    # list is non-empty.
    assert len(retrieved_docs) > 0, (
        f"With 16KB threshold, should retrieve files from {FOLDER_1_URL}. "
        f"Got {len(retrieved_docs)} documents."
    )
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_checkpoint_resumption(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Test checkpointing and resuming from checkpoint.

    Drives load_from_checkpoint through several rounds, feeding the
    checkpoint returned by each round back into the next, and asserts
    that the checkpoint's set of retrieved file IDs grows.
    """
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=True,
        folder_ids=None,
    )
    # Create initial checkpoint
    checkpoint = connector.build_dummy_checkpoint()
    assert checkpoint is not None
    assert checkpoint.has_more is True
    # Load some documents
    from onyx.connectors.connector_runner import CheckpointOutputWrapper

    start_time = 0
    end_time = time.time()
    # Load first batch and get updated checkpoint.
    # Baseline count lets us later prove the checkpoint actually advanced.
    first_checkpoint_file_count = len(checkpoint.all_retrieved_file_ids)
    doc_batch_generator = CheckpointOutputWrapper[BoxConnector]()(
        connector.load_from_checkpoint(start_time, end_time, checkpoint)
    )
    first_batch_docs = []
    # The wrapper yields (document, failure, next_checkpoint) triples;
    # handle whichever element of each triple is present.
    for document, failure, next_checkpoint in doc_batch_generator:
        if failure is not None:
            raise RuntimeError(f"Failed to load documents: {failure}")
        if document is not None:
            first_batch_docs.append(document)
        if next_checkpoint is not None:
            # Always keep the most recent checkpoint for the next round.
            checkpoint = next_checkpoint
    # Load a few more batches to verify checkpointing works
    all_docs = first_batch_docs.copy()
    max_iterations = 2  # Test a few batches to verify checkpointing
    iteration_count = 0
    while checkpoint.has_more and iteration_count < max_iterations:
        iteration_count += 1
        doc_batch_generator = CheckpointOutputWrapper[BoxConnector]()(
            connector.load_from_checkpoint(start_time, end_time, checkpoint)
        )
        batch_docs = []
        for document, failure, next_checkpoint in doc_batch_generator:
            if failure is not None:
                raise RuntimeError(f"Failed to load documents: {failure}")
            if document is not None:
                batch_docs.append(document)
            if next_checkpoint is not None:
                checkpoint = next_checkpoint
        all_docs.extend(batch_docs)
        if checkpoint.has_more:
            # Checkpoint should be updated with more file IDs
            assert len(checkpoint.all_retrieved_file_ids) > first_checkpoint_file_count
    # Verify we got documents and checkpointing is working
    assert len(all_docs) > 0, "Should have retrieved at least some documents"
    assert (
        len(checkpoint.all_retrieved_file_ids) > first_checkpoint_file_count
    ), "Checkpoint should be updated with retrieved file IDs"
def test_connector_validation(
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """validate_connector_settings() succeeds for a well-configured connector."""
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=True,
        folder_ids=None,
    )
    # A misconfigured connector would raise here; returning is the pass signal.
    connector.validate_connector_settings()
def test_connector_initialization() -> None:
    """A connector configured for all files starts with no requested folders."""
    connector = BoxConnector(
        include_all_files=True,
        folder_ids=None,
    )
    assert connector is not None
    assert connector.include_all_files is True
    # No folder_ids given, so the requested-folder set must be empty.
    assert len(connector._requested_folder_ids) == 0
def test_connector_initialization_with_folder_ids() -> None:
    """Comma-separated folder IDs are parsed into the requested-folder set."""
    connector = BoxConnector(
        include_all_files=False,
        folder_ids="123,456",
    )
    assert connector is not None
    assert connector.include_all_files is False
    # Both comma-separated entries must land in the parsed set.
    assert {"123", "456"} <= connector._requested_folder_ids
def test_connector_initialization_fails_without_config() -> None:
    """Neither include_all_files nor folder_ids -> validation error at init."""
    from onyx.connectors.exceptions import ConnectorValidationError

    with pytest.raises(ConnectorValidationError):
        BoxConnector(
            include_all_files=False,
            folder_ids=None,
        )

View File

@@ -0,0 +1,87 @@
"""Error handling tests for Box connector."""
from collections.abc import Callable
from unittest.mock import patch
import pytest
from onyx.connectors.box.connector import BoxConnector
from onyx.connectors.exceptions import ConnectorValidationError
def test_connector_with_invalid_folder_id(
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Test that connector handles invalid folder IDs gracefully.

    Accepts either outcome: an empty result list, or a Box 404-style error
    raised at load time. Any other failure mode fails the test.
    """
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=False,
        folder_ids="999999999999",  # Invalid folder ID
    )
    # Should not raise during initialization
    assert connector is not None
    # Loading documents should handle the error gracefully
    from tests.daily.connectors.box.consts_and_utils import load_all_docs

    with patch(
        "onyx.file_processing.extract_file_text.get_unstructured_api_key",
        return_value=None,
    ):
        try:
            docs = load_all_docs(connector)
            # If no error, should return empty list for invalid folder ID
            assert isinstance(docs, list)
            # Assert that invalid folder ID returns empty result
            assert (
                len(docs) == 0
            ), f"Expected empty result for invalid folder ID, but got {len(docs)} documents"
        except Exception as e:
            # If error is raised, it should be a specific Box API error.
            # Matching on the lowercased message covers both raw 404s and
            # wrapped "not found" errors from intermediate layers.
            error_msg = str(e).lower()
            assert (
                "404" in error_msg
                or "not found" in error_msg
                or "not_found" in error_msg
            ), f"Unexpected error type: {error_msg}"
def test_connector_with_malformed_url(
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """A non-Box URL still yields a best-effort folder-ID extraction."""
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=False,
        folder_ids="https://invalid-url.com/folder/123",
    )
    assert connector is not None
    # The trailing path segment "123" is extracted even from a foreign URL.
    assert "123" in connector._requested_folder_ids
def test_connector_with_empty_folder_ids_string() -> None:
    """An empty folder_ids string must be rejected at construction time."""
    with pytest.raises(ConnectorValidationError):
        BoxConnector(
            include_all_files=False,
            folder_ids="",
        )
def test_connector_with_whitespace_folder_ids() -> None:
    """Whitespace-only folder_ids entries are tolerated at construction.

    The string " , , " is truthy, so it passes the __init__ truthiness
    check; the blank entries are filtered out, leaving the connector with
    no folders to process — acceptable behavior.
    """
    connector = BoxConnector(
        include_all_files=False,
        folder_ids=" , , ",
    )
    assert connector is not None

View File

@@ -0,0 +1,98 @@
#!/usr/bin/env python
"""Utility to generate mapping from test file IDs to actual Box file IDs."""
import os
from onyx.connectors.box.connector import BoxConnector
from tests.daily.connectors.box.conftest import get_credentials_from_env
from tests.daily.connectors.box.consts_and_utils import ADMIN_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import file_name_template
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_1_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_2_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import FOLDER_2_1_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import FOLDER_2_2_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import FOLDER_2_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import FOLDER_3_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import load_all_docs
def generate_test_id_to_box_id_mapping() -> dict[int, str]:
    """
    Generate a mapping from test file IDs to actual Box file IDs.

    This is useful for writing tests that need to verify specific files
    are accessible to specific users.

    Returns:
        dict: Mapping from test file ID (int) to Box file URL (str)
    """
    # Set up the connector with real credentials
    # For tests, scope to test parent folder instead of root
    test_parent_folder_id = os.environ.get("BOX_TEST_PARENT_FOLDER_ID")
    if test_parent_folder_id:
        connector = BoxConnector(
            include_all_files=False,
            folder_ids=test_parent_folder_id,
        )
    else:
        connector = BoxConnector(
            include_all_files=True,
            folder_ids=None,
        )
    # Load credentials
    connector.load_credentials(get_credentials_from_env("admin"))
    # Get all documents from the connector
    docs = load_all_docs(connector)
    # Create a mapping from test file ID to actual Box file URL
    test_id_to_box_id: dict[int, str] = {}
    # Process all documents retrieved from Box
    for doc in docs:
        # Check if this document's name matches our test file naming pattern (file_X.txt)
        # NOTE(review): file_name_template.format("").split("_")[0] derives the
        # template's prefix before the first underscore (presumably "file") —
        # confirm against file_name_template in consts_and_utils.
        if not doc.semantic_identifier.startswith(
            file_name_template.format("").split("_")[0]
        ):
            continue
        try:
            # Extract the test file ID from the filename (file_X.txt -> X)
            file_id_str = doc.semantic_identifier.split("_")[1].split(".")[0]
            test_file_id = int(file_id_str)
            # Store the mapping from test ID to actual Box file URL
            # Box document IDs are URLs
            test_id_to_box_id[test_file_id] = doc.id
        except (ValueError, IndexError):
            # Skip files that don't follow our naming convention
            continue
    # Print the mapping for all defined test file ID ranges
    all_test_ranges = {
        "ADMIN_FILE_IDS": ADMIN_FILE_IDS,
        "FOLDER_1_FILE_IDS": FOLDER_1_FILE_IDS,
        "FOLDER_1_1_FILE_IDS": FOLDER_1_1_FILE_IDS,
        "FOLDER_1_2_FILE_IDS": FOLDER_1_2_FILE_IDS,
        "FOLDER_2_FILE_IDS": FOLDER_2_FILE_IDS,
        "FOLDER_2_1_FILE_IDS": FOLDER_2_1_FILE_IDS,
        "FOLDER_2_2_FILE_IDS": FOLDER_2_2_FILE_IDS,
        "FOLDER_3_FILE_IDS": FOLDER_3_FILE_IDS,
    }
    # Print the mapping for each test range
    for range_name, file_ids in all_test_ranges.items():
        print(f"\n{range_name}:")
        for test_id in file_ids:
            # "NOT_FOUND" marks test IDs that no retrieved document matched.
            box_id = test_id_to_box_id.get(test_id, "NOT_FOUND")
            print(f" {test_id} -> {box_id}")
    return test_id_to_box_id
if __name__ == "__main__":
    # Allow running this script directly to generate mappings;
    # the function prints a test-ID -> Box-URL table for each range.
    generate_test_id_to_box_id_mapping()

View File

@@ -0,0 +1,290 @@
"""Permission sync tests for Box connector."""
import copy
import json
from collections.abc import Callable
from unittest.mock import MagicMock
from unittest.mock import patch
from ee.onyx.external_permissions.box.doc_sync import box_doc_sync
from onyx.connectors.box.connector import BoxConnector
from onyx.db.models import ConnectorCredentialPair
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from tests.daily.connectors.box.consts_and_utils import ACCESS_MAPPING
from tests.daily.connectors.box.consts_and_utils import PUBLIC_RANGE
def _build_connector(
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> BoxConnector:
    """Return an admin-scoped connector that indexes every accessible file."""
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=True,
        folder_ids=None,
    )
    return connector
def test_box_perm_sync_with_real_data(
    box_jwt_connector_factory: Callable[..., BoxConnector],
    set_ee_on: None,
) -> None:
    """
    Test box_doc_sync with real data from the test Box account.

    This test uses the real connector to make actual API calls to Box
    and verifies the permission structure returned.
    """
    # Create a mock cc_pair that will use our real connector
    # For tests, scope to test parent folder instead of root
    import os

    from tests.daily.connectors.box.conftest import get_credentials_from_env

    test_parent_folder_id = os.environ.get("BOX_TEST_PARENT_FOLDER_ID")
    mock_cc_pair = MagicMock(spec=ConnectorCredentialPair)
    mock_cc_pair.connector = MagicMock()
    if test_parent_folder_id:
        mock_cc_pair.connector.connector_specific_config = {
            "include_all_files": False,
            "folder_ids": test_parent_folder_id,
        }
    else:
        mock_cc_pair.connector.connector_specific_config = {
            "include_all_files": True,
            "folder_ids": None,
        }
    mock_cc_pair.credential_id = 1
    # Use real credentials from environment
    mock_cc_pair.credential.credential_json = get_credentials_from_env("admin")
    mock_cc_pair.last_time_perm_sync = None
    # Create a mock heartbeat
    mock_heartbeat = MagicMock(spec=IndexingHeartbeatInterface)
    mock_heartbeat.should_stop.return_value = False
    # Load box_id_mapping.json if it exists
    mapping_file = os.path.join(os.path.dirname(__file__), "box_id_mapping.json")
    url_to_id_mapping: dict[str, int] = {}
    if os.path.exists(mapping_file):
        with open(mapping_file, "r") as f:
            box_id_mapping = json.load(f)
        # Invert the mapping to get URL -> ID
        url_to_id_mapping = {url: int(id) for id, url in box_id_mapping.items()}
    # Use the connector directly without mocking Box API calls
    # Create a connector factory that respects the test scoping
    def connector_factory(**kwargs: object) -> BoxConnector:
        # Use the connector_specific_config from mock_cc_pair to respect test scoping
        config = mock_cc_pair.connector.connector_specific_config
        return box_jwt_connector_factory(
            user_key="admin",
            include_all_files=config.get("include_all_files", True),
            folder_ids=config.get("folder_ids", None),
        )

    # Patch the BoxConnector class used inside doc_sync so the sync builds
    # our scoped test connector instead of constructing its own.
    with patch(
        "ee.onyx.external_permissions.box.doc_sync.BoxConnector",
        side_effect=connector_factory,
    ):
        # Call the function under test
        mock_fetch_all_docs_fn = MagicMock(return_value=[])
        mock_fetch_all_docs_ids_fn = MagicMock(return_value=[])
        doc_access_generator = box_doc_sync(
            mock_cc_pair,
            mock_fetch_all_docs_fn,
            mock_fetch_all_docs_ids_fn,
            mock_heartbeat,
        )
        # Materialize inside the patch context: the generator performs its
        # API calls lazily as it is consumed.
        doc_access_list = list(doc_access_generator)
    # Verify we got some results
    assert len(doc_access_list) > 0
    print(f"Found {len(doc_access_list)} documents with permissions")
    # Map documents to their permissions
    doc_to_user_id_mapping: dict[str, set[str]] = {}
    doc_to_raw_result_mapping: dict[str, set[str]] = {}
    public_doc_ids: set[str] = set()
    for doc_access in doc_access_list:
        doc_id = doc_access.doc_id
        # make sure they are new sets to avoid mutating the original
        doc_to_user_id_mapping[doc_id] = copy.deepcopy(
            doc_access.external_access.external_user_emails
        )
        doc_to_raw_result_mapping[doc_id] = copy.deepcopy(
            doc_access.external_access.external_user_emails
        )
        # Box uses user emails directly, not groups like Google Drive
        # But we may have group IDs that need to be resolved
        for group_id in doc_access.external_access.external_user_group_ids:
            # For Box, group IDs might need to be resolved to user emails
            # This would require additional group sync functionality
            doc_to_raw_result_mapping[doc_id].add(group_id)
        if doc_access.external_access.is_public:
            public_doc_ids.add(doc_id)
    # Check permissions based on box_id_mapping.json and ACCESS_MAPPING
    # For each document URL that exists in our mapping
    checked_files = 0
    for doc_id, user_ids_with_access in doc_to_user_id_mapping.items():
        # Skip URLs that aren't in our mapping, we don't want new stuff to interfere
        # with the test.
        if doc_id not in url_to_id_mapping:
            continue
        file_numeric_id = url_to_id_mapping.get(doc_id)
        if file_numeric_id is None:
            raise ValueError(f"File {doc_id} not found in box_id_mapping.json")
        checked_files += 1
        # Check which users should have access to this file according to ACCESS_MAPPING
        # Note: ACCESS_MAPPING uses user IDs (e.g., "13089353657"), but Box permissions
        # return user emails (e.g., "admin@onyx-test.com"). We need to verify access
        # by checking that the expected number of users have access, rather than
        # exact email matching (which would require a user ID to email mapping).
        expected_user_count = 0
        for user_id, file_ids in ACCESS_MAPPING.items():
            if file_numeric_id in file_ids:
                expected_user_count += 1
        # Verify the permissions match
        if file_numeric_id in PUBLIC_RANGE:
            # Public files should be marked as public
            assert (
                doc_id in public_doc_ids
            ), f"File {doc_id} (ID: {file_numeric_id}) should be public but is not in the public_doc_ids set"
            # Public files may have additional user access, so we just verify it's marked public
        else:
            # Non-public files should have at least the expected number of users with access
            # Note: We can't do exact email matching without a user ID to email mapping,
            # but we can verify that files have the expected level of access
            # Check both user emails and group IDs (files may have group-only permissions)
            has_user_access = len(user_ids_with_access) > 0
            has_group_access = len(doc_to_raw_result_mapping[doc_id]) > len(
                user_ids_with_access
            )
            assert has_user_access or has_group_access, (
                f"File {doc_id} (ID: {file_numeric_id}) should have some access "
                f"(user emails or group IDs) but has none. "
                f"User emails: {user_ids_with_access}, "
                f"Raw result (includes groups): {doc_to_raw_result_mapping[doc_id]}"
            )
            # Verify that the number of users with access is at least the expected count
            # (some files may have additional access beyond what's in ACCESS_MAPPING)
            assert len(user_ids_with_access) >= expected_user_count, (
                f"File {doc_id} (ID: {file_numeric_id}) should have access for at least "
                f"{expected_user_count} user(s) according to ACCESS_MAPPING, "
                f"but only {len(user_ids_with_access)} user(s) have access. "
                f"Users with access: {user_ids_with_access}. "
                f"Raw result: {doc_to_raw_result_mapping[doc_id]}"
            )
            # Log the access for debugging (helps identify permission issues)
            if len(user_ids_with_access) != expected_user_count:
                print(
                    f"Note: File {doc_id} (ID: {file_numeric_id}) has {len(user_ids_with_access)} "
                    f"users with access, expected {expected_user_count} from ACCESS_MAPPING. "
                    f"This may be due to additional sharing or group permissions."
                )
    if checked_files > 0:
        print(f"Checked permissions for {checked_files} files from box_id_mapping.json")
    else:
        # Fail the test if no files were checked - this indicates either:
        # 1. box_id_mapping.json is missing, or
        # 2. No doc_ids from the sync matched the mapping (potential sync issue)
        raise AssertionError(
            "No files checked. This test requires box_id_mapping.json to exist and "
            "doc_ids from box_doc_sync to match entries in the mapping. "
            "Run test_map_test_ids.py to generate box_id_mapping.json."
        )
def test_slim_document_generation(
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Slim documents (used for permission sync) can be generated."""
    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=True,
        folder_ids=None,
    )
    # Cap iterations so a broken generator cannot loop forever.
    safety_limit = 1000
    collected_batches = []
    for batch_number, batch in enumerate(
        connector.retrieve_all_slim_docs_perm_sync(), start=1
    ):
        collected_batches.append(batch)
        if batch_number >= safety_limit:
            raise RuntimeError(
                f"Test hit safety limit of {safety_limit} iterations. "
                "This suggests an infinite loop."
            )
    # Should get some slim documents
    assert len(collected_batches) > 0
    # Each batch should contain slim documents with mandatory IDs;
    # external access may or may not be present depending on whether
    # permissions were fetched.
    for batch in collected_batches:
        assert len(batch) > 0
        for slim_doc in batch:
            assert slim_doc.id is not None
def test_permission_sync_checkpointing(
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Permission-sync slim-doc retrieval completes across checkpoint batches."""
    import time

    connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=True,
        folder_ids=None,
    )
    # retrieve_all_slim_docs_perm_sync handles checkpointing internally.
    slim_doc_generator = connector.retrieve_all_slim_docs_perm_sync(
        start=0,
        end=time.time(),
        callback=None,
    )
    # Guard against a checkpoint that never advances.
    safety_limit = 1000
    collected_batches = []
    for batch in slim_doc_generator:
        collected_batches.append(batch)
        if len(collected_batches) >= safety_limit:
            raise RuntimeError(
                f"Test hit safety limit of {safety_limit} iterations. "
                "This suggests an infinite loop or checkpoint not updating properly."
            )
    # At least one batch, and at least one document overall.
    assert len(collected_batches) > 0
    total_docs = sum(len(batch) for batch in collected_batches)
    assert total_docs > 0, "Should have retrieved at least one slim document"

View File

@@ -0,0 +1,286 @@
"""Permission and access tests for Box connector."""
from collections.abc import Callable
from unittest.mock import MagicMock
from unittest.mock import patch
from box_sdk_gen.box import BoxAPIError
from onyx.connectors.box.connector import BoxConnector
from tests.daily.connectors.box.consts_and_utils import ACCESS_MAPPING
from tests.daily.connectors.box.consts_and_utils import ADMIN_FOLDER_3_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import ADMIN_FOLDER_3_URL
from tests.daily.connectors.box.consts_and_utils import ADMIN_USER_ID
from tests.daily.connectors.box.consts_and_utils import (
assert_expected_docs_in_retrieved_docs,
)
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_1_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_2_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_FILE_IDS
from tests.daily.connectors.box.consts_and_utils import FOLDER_1_URL
from tests.daily.connectors.box.consts_and_utils import load_all_docs
from tests.daily.connectors.box.consts_and_utils import TEST_USER_1_ID
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_user_access_mapping(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Each user sees the files ACCESS_MAPPING grants them."""

    def _docs_for(user_key: str) -> list:
        # Helper: load everything visible to the given test user.
        conn = box_jwt_connector_factory(
            user_key=user_key,
            include_all_files=True,
            folder_ids=None,
        )
        return load_all_docs(conn)

    # Admin should have access to everything in the mapping.
    admin_expected = list(ACCESS_MAPPING[ADMIN_USER_ID])
    assert_expected_docs_in_retrieved_docs(
        retrieved_docs=_docs_for("admin"),
        expected_file_ids=admin_expected,
    )
    # test_user_1 has a narrower grant.
    user1_expected = list(ACCESS_MAPPING[TEST_USER_1_ID])
    assert_expected_docs_in_retrieved_docs(
        retrieved_docs=_docs_for("test_user_1"),
        expected_file_ids=user1_expected,
    )
    # Verify that user1's expected files are a subset of admin's expected files
    # (When scoped to test parent folder, all users can see all subfolders)
    assert set(user1_expected).issubset(set(admin_expected))
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_public_files(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Test that public files are accessible to all users.

    Compares the file names visible to admin (full scope) and to
    test_user_3 (scoped to the public folder FOLDER_1_2) against the
    expected public-file name set derived from PUBLIC_RANGE.
    """
    from tests.daily.connectors.box.consts_and_utils import PUBLIC_RANGE
    from tests.daily.connectors.box.consts_and_utils import id_to_name

    # Test with admin
    admin_connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=True,
        folder_ids=None,
    )
    admin_docs = load_all_docs(admin_connector)
    admin_file_names = {doc.semantic_identifier for doc in admin_docs}
    # Test with test_user_3 (most restricted user)
    # Use FOLDER_1_2 which is public and accessible to all users
    from tests.daily.connectors.box.consts_and_utils import FOLDER_1_2_URL

    user3_connector = box_jwt_connector_factory(
        user_key="test_user_3",
        include_all_files=False,
        folder_ids=FOLDER_1_2_URL,
    )
    user3_docs = load_all_docs(user3_connector)
    user3_file_names = {doc.semantic_identifier for doc in user3_docs}
    # Verify that public files are accessible to both users
    # PUBLIC_RANGE includes FOLDER_1_2_FILE_IDS (public folder) and PUBLIC_FILE_IDS
    # test_user_3 only has access to FOLDER_1_2, so we verify that subset
    expected_public_file_names = {id_to_name(file_id) for file_id in PUBLIC_RANGE}
    # Set intersection keeps only the public files each user actually saw.
    admin_public_files = admin_file_names & expected_public_file_names
    user3_public_files = user3_file_names & expected_public_file_names
    # Verify test_user_3 has access to the public folder files
    from tests.daily.connectors.box.consts_and_utils import FOLDER_1_2_FILE_IDS

    expected_folder_1_2_names = {id_to_name(file_id) for file_id in FOLDER_1_2_FILE_IDS}
    # test_user_3 should have access to all files in the public folder
    assert expected_folder_1_2_names.issubset(user3_public_files), (
        f"test_user_3 should have access to all files in public folder FOLDER_1_2. "
        f"Expected: {expected_folder_1_2_names}, Got: {user3_public_files}"
    )
    # Admin should also have access to the public folder files
    assert expected_folder_1_2_names.issubset(admin_public_files), (
        f"Admin should have access to all files in public folder FOLDER_1_2. "
        f"Expected: {expected_folder_1_2_names}, Got: {admin_public_files}"
    )
    # At least some public files should exist
    assert len(user3_public_files) > 0, (
        f"test_user_3 should have access to at least some public files. "
        f"Got: {user3_public_files}"
    )
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_restricted_access(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Test files with restricted access.

    Admin must be able to read ADMIN_FOLDER_3; test_user_3 must not.
    A lack of access may surface either as an empty result or as a
    404-style error from Box — both are accepted.
    """
    # Test with admin - should have access
    admin_connector = box_jwt_connector_factory(
        user_key="admin",
        include_all_files=False,
        folder_ids=ADMIN_FOLDER_3_URL,
    )
    admin_docs = load_all_docs(admin_connector)
    assert len(admin_docs) > 0
    # Test with test_user_3 - should not have access to admin's folder 3 (ADMIN_FOLDER_3)
    # The setup script explicitly removes test_user_3's access to ensure this test is useful
    user3_connector = box_jwt_connector_factory(
        user_key="test_user_3",
        include_all_files=False,
        folder_ids=ADMIN_FOLDER_3_URL,
    )
    # When a user doesn't have access, Box returns a 404 error
    try:
        user3_docs = load_all_docs(user3_connector)
        assert len(user3_docs) == 0, (
            f"test_user_3 should not have access to ADMIN_FOLDER_3, "
            f"but retrieved {len(user3_docs)} files. "
            f"Run setup script to ensure test_user_3's access is removed."
        )
    except BoxAPIError as e:
        # 404 error indicates no access (expected behavior)
        # NOTE(review): assumes BoxAPIError exposes status_code; getattr
        # guards against SDK versions where it does not.
        status_code = getattr(e, "status_code", None)
        if status_code != 404:
            raise
    except Exception as e:
        # For non-BoxAPIError exceptions, check if it's a wrapped 404
        # This handles cases where BoxAPIError might be wrapped
        error_msg = str(e).lower()
        if (
            "404" not in error_msg
            and "not found" not in error_msg
            and "not_found" not in error_msg
        ):
            raise
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_collaboration_permissions(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """A folder shared via collaboration is visible to the collaborator."""
    # Admin's folder 3 is shared with test_user_1 through a collaboration.
    connector = box_jwt_connector_factory(
        user_key="test_user_1",
        include_all_files=False,
        folder_ids=ADMIN_FOLDER_3_URL,
    )
    docs = load_all_docs(connector)
    assert_expected_docs_in_retrieved_docs(
        retrieved_docs=docs,
        expected_file_ids=ADMIN_FOLDER_3_FILE_IDS,
    )
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_shared_folders(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """A folder shared with a user's group is fully visible to that user."""
    # Folder 1 is shared with test_user_2 via group membership.
    connector = box_jwt_connector_factory(
        user_key="test_user_2",
        include_all_files=False,
        folder_ids=FOLDER_1_URL,
    )
    docs = load_all_docs(connector)
    # Recursive traversal covers folder 1 plus both of its subfolders.
    expected = [*FOLDER_1_FILE_IDS, *FOLDER_1_1_FILE_IDS, *FOLDER_1_2_FILE_IDS]
    assert_expected_docs_in_retrieved_docs(
        retrieved_docs=docs,
        expected_file_ids=expected,
    )
@patch(
    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
    return_value=None,
)
def test_user_specific_access(
    mock_get_api_key: MagicMock,
    box_jwt_connector_factory: Callable[..., BoxConnector],
) -> None:
    """Test that users can only access their own files and shared files.

    Positive case: test_user_3 can read the public folder FOLDER_1_2.
    Negative case: the same user cannot read ADMIN_FOLDER_3 — either an
    empty result or a Box 404-style error is accepted as "no access".
    """
    # Test with test_user_3 (most restricted)
    # test_user_3 should have access to public folder FOLDER_1_2
    # but should NOT have access to ADMIN_FOLDER_3 (restricted)
    from tests.daily.connectors.box.consts_and_utils import FOLDER_1_2_URL

    user3_connector = box_jwt_connector_factory(
        user_key="test_user_3",
        include_all_files=False,
        folder_ids=FOLDER_1_2_URL,
    )
    user3_docs = load_all_docs(user3_connector)
    # test_user_3 should have access to public folder FOLDER_1_2
    # Verify they can access the public files in that folder
    expected_file_ids = FOLDER_1_2_FILE_IDS  # Public folder files
    assert_expected_docs_in_retrieved_docs(
        retrieved_docs=user3_docs,
        expected_file_ids=expected_file_ids,
    )
    # Verify test_user_3 does NOT have access to ADMIN_FOLDER_3
    user3_restricted_connector = box_jwt_connector_factory(
        user_key="test_user_3",
        include_all_files=False,
        folder_ids=ADMIN_FOLDER_3_URL,
    )
    try:
        restricted_docs = load_all_docs(user3_restricted_connector)
        # If no exception, verify no documents were retrieved
        assert len(restricted_docs) == 0, (
            f"test_user_3 should NOT have access to ADMIN_FOLDER_3, "
            f"but retrieved {len(restricted_docs)} files: {[doc.semantic_identifier for doc in restricted_docs]}"
        )
    except BoxAPIError as e:
        # If a BoxAPIError is raised with 404, that means test_user_3
        # doesn't have access, which is what we want. The test passes.
        # NOTE(review): assumes BoxAPIError exposes status_code; getattr
        # guards against SDK versions where it does not.
        status_code = getattr(e, "status_code", None)
        if status_code != 404:
            # Unexpected status code, re-raise it
            raise
    except Exception as e:
        # For non-BoxAPIError exceptions, check if it's a wrapped 404
        # This handles cases where BoxAPIError might be wrapped
        error_msg = str(e).lower()
        if (
            "404" not in error_msg
            and "not found" not in error_msg
            and "not_found" not in error_msg
        ):
            # Unexpected error, re-raise it
            raise

View File

@@ -201,6 +201,10 @@ LOG_ONYX_MODEL_INTERACTIONS=False
# LINEAR_CLIENT_ID=
# LINEAR_CLIENT_SECRET=
# Box testing
# WARNING: If set, BOX_DEVELOPER_TOKEN overrides JWT authentication and uses developer token instead. FOR TESTING ONLY.
# BOX_DEVELOPER_TOKEN=
## Miscellaneous
# ONYX_QUERY_HISTORY_TYPE=
# CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS=

View File

@@ -115,6 +115,7 @@ backend = [
"zulip==0.8.2",
"hubspot-api-client==11.1.0",
"asana==5.0.8",
"boxsdk==10.3.0",
"dropbox==12.0.2",
"shapely==2.0.6",
"stripe==10.12.0",

15
uv.lock generated
View File

@@ -528,6 +528,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/5e/e0/4bfaa72002bbe7befb96e8af8e56e7398b58ef981941577818b1a671e7f7/botocore_stubs-1.40.74-py3-none-any.whl", hash = "sha256:4c215592a8c26f66e0af773b513f1a34437da2a6d0f53a04928bbba1b131c935", size = 66541, upload-time = "2025-11-14T21:23:24.697Z" },
]
[[package]]
name = "boxsdk"
version = "10.3.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "requests" },
{ name = "requests-toolbelt" },
]
sdist = { url = "https://files.pythonhosted.org/packages/df/25/d859cc617d832506e80327a277b0e0cc7d1114d66e966fdab8b218ffaf17/boxsdk-10.3.0.tar.gz", hash = "sha256:5b8ec0e2ed70160e16fe2fc1240d3896c88d50bd30796b021e95cfbe977b3444", size = 272690, upload-time = "2025-12-19T11:31:15.369Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/05/af/fec6a530efdfc3d7739d821cdcb63de7c9979954fa21ef6d16d0b678c8ed/boxsdk-10.3.0-py3-none-any.whl", hash = "sha256:3f65792834315177765c096402e35f43400c4c99c9b6e82f9ac40c8de3da4767", size = 574729, upload-time = "2025-12-19T11:31:13.575Z" },
]
[[package]]
name = "braintrust"
version = "0.3.9"
@@ -3465,6 +3478,7 @@ backend = [
{ name = "beautifulsoup4" },
{ name = "boto3" },
{ name = "boto3-stubs", extra = ["s3"] },
{ name = "boxsdk" },
{ name = "braintrust" },
{ name = "celery" },
{ name = "chardet" },
@@ -3619,6 +3633,7 @@ requires-dist = [
{ name = "black", marker = "extra == 'dev'", specifier = "==25.1.0" },
{ name = "boto3", marker = "extra == 'backend'", specifier = "==1.39.11" },
{ name = "boto3-stubs", extras = ["s3"], marker = "extra == 'backend'", specifier = "==1.39.11" },
{ name = "boxsdk", marker = "extra == 'backend'", specifier = "==10.3.0" },
{ name = "braintrust", marker = "extra == 'backend'", specifier = "==0.3.9" },
{ name = "brotli", specifier = ">=1.2.0" },
{ name = "celery", marker = "extra == 'backend'", specifier = "==5.5.1" },

BIN
web/public/box.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.5 KiB

View File

@@ -35,9 +35,11 @@ import {
} from "@/lib/connectors/connectors";
import Modal from "@/refresh-components/Modal";
import { GmailMain } from "@/app/admin/connectors/[connector]/pages/gmail/GmailPage";
import { BoxMain } from "@/app/admin/connectors/[connector]/pages/box/BoxPage";
import {
useGmailCredentials,
useGoogleDriveCredentials,
useBoxCredentials,
} from "@/app/admin/connectors/[connector]/pages/utils/hooks";
import { Formik } from "formik";
import NavigationRow from "@/app/admin/connectors/[connector]/NavigationRow";
@@ -195,11 +197,13 @@ export default function AddConnector({
// Hooks for Google Drive and Gmail credentials
const { liveGDriveCredential } = useGoogleDriveCredentials(connector);
const { liveGmailCredential } = useGmailCredentials(connector);
const { liveBoxCredential } = useBoxCredentials(connector);
// Check if credential is activated
const credentialActivated =
(connector === "google_drive" && liveGDriveCredential) ||
(connector === "gmail" && liveGmailCredential) ||
(connector === "box" && liveBoxCredential) ||
currentCredential;
// Check if there are no credentials
@@ -434,7 +438,8 @@ export default function AddConnector({
const credential =
currentCredential ||
liveGDriveCredential ||
liveGmailCredential;
liveGmailCredential ||
liveBoxCredential;
const linkCredentialResponse = await linkCredential(
response.id,
credential?.id!,
@@ -516,6 +521,8 @@ export default function AddConnector({
{connector == ValidSources.Gmail ? (
<GmailMain />
) : connector == ValidSources.Box ? (
<BoxMain />
) : (
<>
<ModifyCredential
@@ -638,6 +645,7 @@ export default function AddConnector({
currentCredential ||
liveGDriveCredential ||
liveGmailCredential ||
liveBoxCredential ||
null
}
/>

View File

@@ -0,0 +1,118 @@
"use client";
import React from "react";
import { ErrorCallout } from "@/components/ErrorCallout";
import { LoadingAnimation } from "@/components/Loading";
import { usePopup } from "@/components/admin/connectors/Popup";
import { CCPairBasicInfo, ValidSources } from "@/lib/types";
import { Credential, BoxCredentialJson } from "@/lib/connectors/credentials";
import { BoxAuthSection, BoxJsonUploadSection } from "./Credential";
import { usePublicCredentials, useBasicConnectorStatus } from "@/lib/hooks";
import Title from "@/components/ui/title";
import { useUser } from "@/components/user/UserProvider";
import useSWR from "swr";
import { errorHandlingFetcher } from "@/lib/fetcher";
import { buildSimilarCredentialInfoURL } from "@/app/admin/connector/[ccPairId]/lib";
/**
 * Top-level admin page for setting up the Box connector.
 *
 * Renders a two-step flow: (1) upload the Box JWT config JSON, then
 * (2, admins only, once a client_id exists) create the JWT credential.
 * Handles the loading and error states of the three data sources it
 * depends on (JWT config, connector statuses, public credentials).
 */
export const BoxMain = () => {
  const { isAdmin, user } = useUser();
  const { popup, setPopup } = usePopup();
  // Server-side JWT config; only client_id / enterprise_id are exposed here.
  const {
    data: jwtConfigData,
    isLoading: isJwtConfigLoading,
    error: isJwtConfigError,
  } = useSWR<{ client_id: string; enterprise_id: string }>(
    "/api/manage/admin/connector/box/jwt-config",
    errorHandlingFetcher
  );
  const {
    data: connectorIndexingStatuses,
    isLoading: isConnectorIndexingStatusesLoading,
    error: connectorIndexingStatusesError,
  } = useBasicConnectorStatus();
  const {
    data: credentialsData,
    isLoading: isCredentialsLoading,
    error: credentialsError,
    refreshCredentials,
  } = usePublicCredentials();
  const handleRefresh = () => {
    refreshCredentials();
  };
  // Show the spinner while any of the three fetches is still in flight
  // (and has neither data nor an error yet).
  if (
    (!jwtConfigData && isJwtConfigLoading && !isJwtConfigError) ||
    (!connectorIndexingStatuses && isConnectorIndexingStatusesLoading) ||
    (!credentialsData && isCredentialsLoading)
  ) {
    return (
      <div className="mx-auto">
        <LoadingAnimation text="" />
      </div>
    );
  }
  if (isJwtConfigError) {
    return <ErrorCallout errorTitle="Failed to load Box JWT config." />;
  }
  if (credentialsError || !credentialsData) {
    return <ErrorCallout errorTitle="Failed to load credentials." />;
  }
  if (connectorIndexingStatusesError || !connectorIndexingStatuses) {
    return <ErrorCallout errorTitle="Failed to load connectors." />;
  }
  // The Box JWT credential (if any) among the public credentials.
  const boxJwtCredential: Credential<BoxCredentialJson> | undefined =
    credentialsData.find(
      (credential) =>
        credential.credential_json?.box_jwt_config &&
        credential.source === "box"
    );
  const boxConnectorIndexingStatuses: CCPairBasicInfo[] =
    connectorIndexingStatuses.filter(
      (connectorIndexingStatus) => connectorIndexingStatus.source === "box"
    );
  // Used to block credential revocation while a connector still uses it.
  const connectorExists = boxConnectorIndexingStatuses.length > 0;
  const hasUploadedJwtConfig = Boolean(jwtConfigData?.client_id);
  return (
    <>
      {popup}
      <Title className="mb-2 mt-6 ml-auto mr-auto">
        Step 1: Provide your Box JWT Config
      </Title>
      <BoxJsonUploadSection
        setPopup={setPopup}
        jwtConfigData={jwtConfigData}
        isAdmin={isAdmin}
        onSuccess={handleRefresh}
        existingAuthCredential={Boolean(boxJwtCredential)}
      />
      {/* Step 2 only appears for admins after a JWT config has been uploaded. */}
      {isAdmin && hasUploadedJwtConfig && (
        <>
          <Title className="mb-2 mt-6 ml-auto mr-auto">
            Step 2: Create Credential
          </Title>
          <BoxAuthSection
            setPopup={setPopup}
            refreshCredentials={handleRefresh}
            boxJwtCredential={boxJwtCredential}
            jwtConfigData={jwtConfigData}
            connectorAssociated={connectorExists}
            user={user}
          />
        </>
      )}
    </>
  );
};

View File

@@ -0,0 +1,540 @@
"use client";
import { PopupSpec } from "@/components/admin/connectors/Popup";
import React, { useState, useEffect } from "react";
import { useSWRConfig } from "swr";
import * as Yup from "yup";
import { TextFormField, SectionHeader } from "@/components/Field";
import { Form, Formik } from "formik";
import { User, ValidSources } from "@/lib/types";
import Button from "@/refresh-components/buttons/Button";
import { Credential, BoxCredentialJson } from "@/lib/connectors/credentials";
import { buildSimilarCredentialInfoURL } from "@/app/admin/connector/[ccPairId]/lib";
import { FiFile, FiCheck, FiLink, FiAlertTriangle } from "react-icons/fi";
import { cn, truncateString } from "@/lib/utils";
import { adminDeleteCredential } from "@/lib/credential";
import { DOCS_ADMINS_PATH } from "@/lib/constants";
/**
 * Drag-and-drop / click upload widget for the Box JWT config JSON file.
 *
 * Validates the JSON locally (must contain `boxAppSettings` with `clientID`
 * and `appAuth`), then PUTs the raw JSON string to the backend. On success
 * it revalidates the jwt-config SWR key and invokes `onSuccess`.
 */
export const BoxJsonUpload = ({
  setPopup,
  onSuccess,
}: {
  setPopup: (popupSpec: PopupSpec | null) => void;
  onSuccess?: () => void;
}) => {
  const { mutate } = useSWRConfig();
  const [isUploading, setIsUploading] = useState(false);
  const [fileName, setFileName] = useState<string | undefined>();
  const [isDragging, setIsDragging] = useState(false);
  // Read the file as text, validate its structure, then upload it.
  const handleFileUpload = async (file: File) => {
    setIsUploading(true);
    setFileName(file.name);
    const reader = new FileReader();
    reader.onload = async (loadEvent) => {
      if (!loadEvent?.target?.result) {
        setIsUploading(false);
        return;
      }
      const credentialJsonStr = loadEvent.target.result as string;
      // Validate Box JWT config structure before sending anything to the API.
      try {
        const jwtConfigJson = JSON.parse(credentialJsonStr);
        if (!jwtConfigJson.boxAppSettings) {
          throw new Error(
            "Invalid Box JWT config: missing 'boxAppSettings' field"
          );
        }
        if (!jwtConfigJson.boxAppSettings.clientID) {
          throw new Error(
            "Invalid Box JWT config: missing 'boxAppSettings.clientID'"
          );
        }
        if (!jwtConfigJson.boxAppSettings.appAuth) {
          throw new Error(
            "Invalid Box JWT config: missing 'boxAppSettings.appAuth'"
          );
        }
      } catch (e) {
        // Covers both JSON.parse failures and the structural checks above.
        setPopup({
          message: `Invalid Box JWT config file - ${e}`,
          type: "error",
        });
        setIsUploading(false);
        return;
      }
      try {
        // The raw JSON string is forwarded unmodified as the request body.
        const response = await fetch(
          "/api/manage/admin/connector/box/jwt-config",
          {
            method: "PUT",
            headers: {
              "Content-Type": "application/json",
            },
            body: credentialJsonStr,
          }
        );
        if (response.ok) {
          setPopup({
            message: "Successfully uploaded Box JWT config",
            type: "success",
          });
          // Revalidate so dependent components see the new config.
          mutate("/api/manage/admin/connector/box/jwt-config");
          if (onSuccess) {
            onSuccess();
          }
        } else {
          const errorMsg = await response.text();
          setPopup({
            message: `Failed to upload Box JWT config - ${errorMsg}`,
            type: "error",
          });
        }
      } catch (error) {
        setPopup({
          message: `Failed to upload Box JWT config - ${error}`,
          type: "error",
        });
      } finally {
        setIsUploading(false);
      }
    };
    reader.onerror = () => {
      setPopup({
        message: "Failed to read file. Please try again.",
        type: "error",
      });
      setIsUploading(false);
    };
    reader.onabort = () => {
      setPopup({
        message: "File read was aborted. Please try again.",
        type: "error",
      });
      setIsUploading(false);
    };
    reader.readAsText(file);
  };
  // Drag-and-drop handlers; dragging is ignored while an upload is running.
  const handleDragEnter = (e: React.DragEvent<HTMLLabelElement>) => {
    e.preventDefault();
    e.stopPropagation();
    if (!isUploading) {
      setIsDragging(true);
    }
  };
  const handleDragLeave = (e: React.DragEvent<HTMLLabelElement>) => {
    e.preventDefault();
    e.stopPropagation();
    setIsDragging(false);
  };
  const handleDragOver = (e: React.DragEvent<HTMLLabelElement>) => {
    e.preventDefault();
    e.stopPropagation();
  };
  const handleDrop = (e: React.DragEvent<HTMLLabelElement>) => {
    e.preventDefault();
    e.stopPropagation();
    setIsDragging(false);
    if (isUploading) return;
    const files = e.dataTransfer.files;
    if (files.length > 0) {
      const file = files[0];
      // Accept by MIME type or .json extension (some browsers report no type).
      if (
        file !== undefined &&
        (file.type === "application/json" || file.name.endsWith(".json"))
      ) {
        handleFileUpload(file);
      } else {
        setPopup({
          message: "Please upload a JSON file",
          type: "error",
        });
      }
    }
  };
  return (
    <div className="flex flex-col mt-4">
      <div className="flex items-center">
        <div className="relative flex flex-1 items-center">
          {/* The label doubles as the drop zone and the click target for the
              visually-hidden file input below. */}
          <label
            className={cn(
              "flex h-10 items-center justify-center w-full px-4 py-2 border border-dashed rounded-md transition-colors",
              isUploading
                ? "opacity-70 cursor-not-allowed border-background-400 bg-background-50/30"
                : isDragging
                  ? "bg-background-50/50 border-primary dark:border-primary"
                  : "cursor-pointer hover:bg-background-50/30 hover:border-primary dark:hover:border-primary border-background-300 dark:border-background-600"
            )}
            onDragEnter={handleDragEnter}
            onDragLeave={handleDragLeave}
            onDragOver={handleDragOver}
            onDrop={handleDrop}
          >
            <div className="flex items-center space-x-2">
              {isUploading ? (
                <div className="h-4 w-4 border-t-2 border-b-2 border-primary rounded-full animate-spin"></div>
              ) : (
                <FiFile className="h-4 w-4 text-text-500" />
              )}
              <span className="text-sm text-text-500">
                {isUploading
                  ? `Uploading ${truncateString(fileName || "file", 50)}...`
                  : isDragging
                    ? "Drop JSON file here"
                    : truncateString(
                        fileName || "Select or drag Box JWT config file...",
                        50
                      )}
              </span>
            </div>
            <input
              className="sr-only"
              type="file"
              accept=".json"
              disabled={isUploading}
              onChange={(event) => {
                if (!event.target.files?.length) {
                  return;
                }
                const file = event.target.files[0];
                if (file === undefined) {
                  return;
                }
                handleFileUpload(file);
              }}
            />
          </label>
        </div>
      </div>
    </div>
  );
};
/** Props for {@link BoxJsonUploadSection}. */
interface BoxJsonUploadSectionProps {
  // Popup dispatcher for success/error toasts.
  setPopup: (popupSpec: PopupSpec | null) => void;
  // Currently uploaded JWT config (if any); only public identifiers.
  jwtConfigData?: { client_id: string; enterprise_id: string };
  // Non-admins get a read-only notice instead of the upload UI.
  isAdmin: boolean;
  // Invoked after a successful upload or deletion of the config.
  onSuccess?: () => void;
  // True when a Box JWT credential already exists; hides the delete button.
  existingAuthCredential?: boolean;
}
/**
 * Step-1 section: shows the uploaded JWT config's client ID (with an admin
 * delete button while no credential depends on it), or the upload widget
 * when no config exists. Non-admins see a notice instead.
 */
export const BoxJsonUploadSection = ({
  setPopup,
  jwtConfigData,
  isAdmin,
  onSuccess,
  existingAuthCredential,
}: BoxJsonUploadSectionProps) => {
  const { mutate } = useSWRConfig();
  // Mirror the prop locally so deletion can clear the display immediately,
  // before the SWR revalidation round-trips.
  const [localJwtConfigData, setLocalJwtConfigData] = useState(jwtConfigData);
  // Update local state when props change
  useEffect(() => {
    setLocalJwtConfigData(jwtConfigData);
  }, [jwtConfigData]);
  const handleSuccess = () => {
    if (onSuccess) {
      onSuccess();
    }
  };
  if (!isAdmin) {
    return (
      <div>
        <div className="flex items-start py-3 px-4 bg-yellow-50/30 dark:bg-yellow-900/5 rounded">
          <FiAlertTriangle className="text-yellow-500 h-5 w-5 mr-2 mt-0.5 flex-shrink-0" />
          <p className="text-sm">
            Curators are unable to set up the Box credentials. To add a Box
            connector, please contact an administrator.
          </p>
        </div>
      </div>
    );
  }
  return (
    <div>
      <p className="text-sm mb-3">
        To connect your Box account, create a Box Platform App with JWT
        authentication, download the JSON config file, and upload it below.
      </p>
      <div className="mb-4">
        <a
          className="text-primary hover:text-primary/80 flex items-center gap-1 text-sm"
          target="_blank"
          href={`${DOCS_ADMINS_PATH}/connectors/official/box/overview`}
          rel="noreferrer"
        >
          <FiLink className="h-3 w-3" />
          View detailed setup instructions
        </a>
      </div>
      {/* Config already uploaded: show the client ID read-only. */}
      {localJwtConfigData?.client_id && (
        <div className="mb-4">
          <div className="relative flex flex-1 items-center">
            <label
              className={cn(
                "flex h-10 items-center justify-center w-full px-4 py-2 border border-dashed rounded-md transition-colors",
                "cursor-pointer hover:bg-background-50/30 hover:border-primary dark:hover:border-primary border-background-300 dark:border-background-600"
              )}
            >
              <div className="flex items-center space-x-2">
                <FiFile className="h-4 w-4 text-text-500" />
                <span className="text-sm text-text-500">
                  {truncateString(
                    `Client ID: ${localJwtConfigData.client_id}`,
                    50
                  )}
                </span>
              </div>
            </label>
          </div>
          {/* Deletion is only offered while no credential depends on the config. */}
          {isAdmin && !existingAuthCredential && (
            <div className="mt-2">
              <Button
                danger
                onClick={async () => {
                  try {
                    const response = await fetch(
                      "/api/manage/admin/connector/box/jwt-config",
                      {
                        method: "DELETE",
                      }
                    );
                    if (response.ok) {
                      // Revalidate both the config and credential caches.
                      mutate("/api/manage/admin/connector/box/jwt-config");
                      mutate(buildSimilarCredentialInfoURL(ValidSources.Box));
                      setPopup({
                        message: "Successfully deleted Box JWT config",
                        type: "success",
                      });
                      setLocalJwtConfigData(undefined);
                      handleSuccess();
                    } else {
                      const errorMsg = await response.text();
                      setPopup({
                        message: `Failed to delete JWT config - ${errorMsg}`,
                        type: "error",
                      });
                    }
                  } catch (error) {
                    setPopup({
                      message: `Failed to delete JWT config - ${error}`,
                      type: "error",
                    });
                  }
                }}
              >
                Delete JWT Config
              </Button>
            </div>
          )}
        </div>
      )}
      {/* No config yet: show the upload widget. */}
      {!localJwtConfigData?.client_id && (
        <BoxJsonUpload setPopup={setPopup} onSuccess={handleSuccess} />
      )}
    </div>
  );
};
/** Props for {@link BoxAuthSection}. */
interface BoxCredentialSectionProps {
  // Existing Box JWT credential, when one has already been created.
  boxJwtCredential?: Credential<BoxCredentialJson>;
  // Uploaded JWT config identifiers; required before a credential can be made.
  jwtConfigData?: { client_id: string; enterprise_id: string };
  // Popup dispatcher for success/error toasts.
  setPopup: (popupSpec: PopupSpec | null) => void;
  // Refreshes the credential list after create/revoke.
  refreshCredentials: () => void;
  // True when any connector is still linked to the credential (blocks revoke).
  connectorAssociated: boolean;
  user: User | null;
}
/**
 * Revoke (delete) the Box JWT credential, refusing while any connector is
 * still associated with it. Reports the outcome via `setPopup` and refreshes
 * the credential list on success.
 */
async function handleRevokeAccess(
  connectorAssociated: boolean,
  setPopup: (popupSpec: PopupSpec | null) => void,
  existingCredential: Credential<BoxCredentialJson>,
  refreshCredentials: () => void
) {
  // A credential that still backs a connector must not be deleted.
  if (connectorAssociated) {
    setPopup({
      message:
        "Cannot revoke the Box credential while any connector is still associated with the credential. " +
        "Please delete all associated connectors, then try again.",
      type: "error",
    });
    return;
  }

  const deleteResponse = await adminDeleteCredential(existingCredential.id);
  if (!deleteResponse.ok) {
    const errorMsg = await deleteResponse.text();
    setPopup({
      message: `Failed to revoke Box credential - ${errorMsg}`,
      type: "error",
    });
    return;
  }

  setPopup({
    message: "Successfully revoked the Box credential!",
    type: "success",
  });
  refreshCredentials();
}
/**
 * Step-2 section: credential management for the Box connector.
 *
 * Renders one of three states: (1) credential exists — success banner plus a
 * revoke button; (2) no JWT config yet — prompt to finish Step 1 first;
 * (3) config present but no credential — Formik form that creates the JWT
 * credential from a primary admin user ID.
 */
export const BoxAuthSection = ({
  boxJwtCredential,
  jwtConfigData,
  setPopup,
  refreshCredentials,
  connectorAssociated,
  user,
}: BoxCredentialSectionProps) => {
  // Mirror props locally so the UI can react immediately; kept in sync below.
  const [localJwtConfigData, setLocalJwtConfigData] = useState(jwtConfigData);
  const [localBoxJwtCredential, setLocalBoxJwtCredential] =
    useState(boxJwtCredential);
  // Update local state when props change
  useEffect(() => {
    setLocalJwtConfigData(jwtConfigData);
    setLocalBoxJwtCredential(boxJwtCredential);
  }, [jwtConfigData, boxJwtCredential]);
  // State 1: a credential already exists — offer revocation only.
  if (localBoxJwtCredential) {
    return (
      <div>
        <div className="mt-4">
          <div className="py-3 px-4 bg-blue-50/30 dark:bg-blue-900/5 rounded mb-4 flex items-start">
            <FiCheck className="text-blue-500 h-5 w-5 mr-2 mt-0.5 flex-shrink-0" />
            <div className="flex-1">
              <span className="font-medium block">Authentication Complete</span>
              <p className="text-sm mt-1 text-text-500 dark:text-text-400 break-words">
                Your Box JWT credentials have been successfully uploaded and
                authenticated.
              </p>
            </div>
          </div>
          <Button
            danger
            onClick={async () => {
              // NOTE(review): handleRevokeAccess is async but intentionally
              // not awaited here — result is reported via setPopup; confirm
              // fire-and-forget is acceptable.
              handleRevokeAccess(
                connectorAssociated,
                setPopup,
                localBoxJwtCredential,
                refreshCredentials
              );
            }}
          >
            Revoke Access
          </Button>
        </div>
      </div>
    );
  }
  // State 2: no JWT config uploaded yet — direct the admin back to Step 1.
  if (!localJwtConfigData?.client_id) {
    return (
      <div>
        <SectionHeader>Box Authentication</SectionHeader>
        <div className="mt-4">
          <div className="flex items-start py-3 px-4 bg-yellow-50/30 dark:bg-yellow-900/5 rounded">
            <FiAlertTriangle className="text-yellow-500 h-5 w-5 mr-2 mt-0.5 flex-shrink-0" />
            <p className="text-sm">
              Please complete Step 1 by uploading the Box JWT config file before
              proceeding with authentication.
            </p>
          </div>
        </div>
      </div>
    );
  }
  // State 3: config uploaded — show the form to create a credential with the
  // Box user ID the connector should impersonate.
  return (
    <div>
      <div className="mt-4">
        <Formik
          initialValues={{
            box_primary_admin_user_id: "",
          }}
          validationSchema={Yup.object().shape({
            box_primary_admin_user_id: Yup.string().required(
              "Primary admin user ID is required"
            ),
          })}
          onSubmit={async (values, formikHelpers) => {
            formikHelpers.setSubmitting(true);
            try {
              const response = await fetch(
                "/api/manage/admin/connector/box/jwt-credential",
                {
                  method: "PUT",
                  headers: {
                    "Content-Type": "application/json",
                  },
                  body: JSON.stringify({
                    box_primary_admin_user_id: values.box_primary_admin_user_id,
                  }),
                }
              );
              if (response.ok) {
                setPopup({
                  message: "Successfully created Box JWT credential",
                  type: "success",
                });
                refreshCredentials();
              } else {
                const errorMsg = await response.text();
                setPopup({
                  message: `Failed to create Box JWT credential - ${errorMsg}`,
                  type: "error",
                });
              }
            } catch (error) {
              setPopup({
                message: `Failed to create Box JWT credential - ${error}`,
                type: "error",
              });
            } finally {
              formikHelpers.setSubmitting(false);
            }
          }}
        >
          {({ isSubmitting }) => (
            <Form>
              <TextFormField
                name="box_primary_admin_user_id"
                label="Primary Admin User ID:"
                subtext="Enter the Box user ID of an admin/owner that has access to the Box content you want to index. You can find this in the Box Admin Console or by calling the Box API."
              />
              <div className="flex">
                <Button type="submit" disabled={isSubmitting}>
                  {isSubmitting ? "Creating..." : "Create Credential"}
                </Button>
              </div>
            </Form>
          )}
        </Formik>
      </div>
    </div>
  );
};

View File

@@ -10,6 +10,7 @@ import {
GmailServiceAccountCredentialJson,
GoogleDriveCredentialJson,
GoogleDriveServiceAccountCredentialJson,
BoxCredentialJson,
} from "@/lib/connectors/credentials";
export const useGmailCredentials = (connector: string) => {
@@ -73,3 +74,19 @@ export const useGoogleDriveCredentials = (connector: string) => {
liveGDriveCredential: liveGDriveCredential,
};
};
/**
 * Hook that returns the live Box JWT credential for the given connector
 * source, i.e. the first admin-public credential carrying a JWT config.
 */
export const useBoxCredentials = (connector: string) => {
  const { data: credentialsData } = usePublicCredentials();

  const liveBoxCredential: Credential<BoxCredentialJson> | undefined =
    credentialsData?.find((credential) => {
      const hasJwtConfig = Boolean(credential.credential_json?.box_jwt_config);
      return (
        hasJwtConfig &&
        credential.admin_public &&
        credential.source === connector
      );
    });

  return { liveBoxCredential };
};

View File

@@ -37,6 +37,7 @@ import discordIcon from "@public/discord.png";
import discourseIcon from "@public/Discourse.png";
import document360Icon from "@public/Document360.png";
import dropboxIcon from "@public/Dropbox.png";
import boxIcon from "@public/box.png";
import drupalwikiIcon from "@public/DrupalWiki.png";
import egnyteIcon from "@public/Egnyte.png";
import firefliesIcon from "@public/Fireflies.png";
@@ -835,6 +836,7 @@ export const DeepseekIcon = createLogoIcon(deepseekSVG);
export const DiscourseIcon = createLogoIcon(discourseIcon);
export const Document360Icon = createLogoIcon(document360Icon);
export const DropboxIcon = createLogoIcon(dropboxIcon);
export const BoxIcon = createLogoIcon(boxIcon);
export const DrupalWikiIcon = createLogoIcon(drupalwikiIcon);
export const EgnyteIcon = createLogoIcon(egnyteIcon);
export const FirefliesIcon = createLogoIcon(firefliesIcon);

View File

@@ -1236,6 +1236,31 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
values: [],
advanced_values: [],
},
box: {
description: "Configure Box connector",
values: [
{
type: "checkbox",
query: "Index all accessible files?",
label: "Include All Files",
name: "include_all_files",
description:
"If checked, the connector will index all files accessible to the authenticated user.",
optional: false,
default: false,
},
{
type: "list",
query: "Enter folder IDs or URLs (optional):",
label: "Folder IDs",
name: "folder_ids",
description:
"Comma-separated list of Box folder IDs or URLs to index. Leave empty if 'Include All Files' is checked.",
optional: true,
},
],
advanced_values: [],
},
s3: {
description: "Configure S3 connector",
values: [
@@ -1743,6 +1768,26 @@ export function createConnectorValidationSchema(
},
{} as Record<string, any>
),
// Box-specific validation: require either include_all_files or folder_ids
...(connector === "box"
? {
folder_ids: Yup.array()
.of(Yup.string())
.when("include_all_files", {
is: false,
then: (schema) =>
schema
.min(
1,
"At least one folder ID is required when 'Include All Files' is unchecked"
)
.required(
"Folder IDs are required when 'Include All Files' is unchecked"
),
otherwise: (schema) => schema,
}),
}
: {}),
// These are advanced settings
indexingStart: Yup.string().nullable(),
pruneFreq: Yup.number().min(

View File

@@ -177,6 +177,20 @@ export interface DropboxCredentialJson {
dropbox_access_token: string;
}
export interface BoxCredentialJson {
// JWT authentication (new)
box_jwt_config?: string; // JSON string of Box JWT config
box_primary_admin_user_id?: string; // User ID to impersonate
authentication_method?: string; // "uploaded" for JWT
// OAuth flow credentials (legacy, deprecated)
access_token?: string;
refresh_token?: string;
// Legacy credentials (for backward compatibility)
box_access_token?: string;
box_refresh_token?: string;
box_user_id?: string;
}
export interface R2CredentialJson {
account_id: string;
r2_access_key_id: string;
@@ -331,6 +345,10 @@ export const credentialTemplates: Record<ValidSources, any> = {
loopio_client_token: "",
} as LoopioCredentialJson,
dropbox: { dropbox_access_token: "" } as DropboxCredentialJson,
box: {
access_token: "",
refresh_token: "",
} as BoxCredentialJson,
salesforce: {
sf_username: "",
sf_password: "",

View File

@@ -7,6 +7,7 @@ import {
DiscourseIcon,
Document360Icon,
DropboxIcon,
BoxIcon,
GithubIcon,
GitlabIcon,
BitbucketIcon,
@@ -198,6 +199,12 @@ export const SOURCE_METADATA_MAP: SourceMap = {
category: SourceCategory.Storage,
docs: `${DOCS_ADMINS_PATH}/connectors/official/dropbox`,
},
box: {
icon: BoxIcon,
displayName: "Box",
category: SourceCategory.Storage,
docs: `${DOCS_ADMINS_PATH}/connectors/official/box`,
},
s3: {
icon: S3Icon,
displayName: "S3",

View File

@@ -504,6 +504,7 @@ export enum ValidSources {
DrupalWiki = "drupal_wiki",
Imap = "imap",
Bitbucket = "bitbucket",
Box = "box",
TestRail = "testrail",
// Federated Connectors