mirror of
https://github.com/onyx-dot-app/onyx.git
synced 2026-02-18 16:25:45 +00:00
Compare commits
1 Commits
migration_
...
updates
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
171deb495e |
@@ -39,12 +39,6 @@ env:
|
||||
AIRTABLE_TEST_TABLE_ID: ${{ secrets.AIRTABLE_TEST_TABLE_ID }}
|
||||
AIRTABLE_TEST_TABLE_NAME: ${{ secrets.AIRTABLE_TEST_TABLE_NAME }}
|
||||
AIRTABLE_ACCESS_TOKEN: ${{ secrets.AIRTABLE_ACCESS_TOKEN }}
|
||||
# Sharepoint
|
||||
SHAREPOINT_CLIENT_ID: ${{ secrets.SHAREPOINT_CLIENT_ID }}
|
||||
SHAREPOINT_CLIENT_SECRET: ${{ secrets.SHAREPOINT_CLIENT_SECRET }}
|
||||
SHAREPOINT_CLIENT_DIRECTORY_ID: ${{ secrets.SHAREPOINT_CLIENT_DIRECTORY_ID }}
|
||||
SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }}
|
||||
|
||||
jobs:
|
||||
connectors-check:
|
||||
# See https://runs-on.com/runners/linux/
|
||||
|
||||
@@ -1,80 +0,0 @@
|
||||
"""foreign key input prompts
|
||||
|
||||
Revision ID: 33ea50e88f24
|
||||
Revises: a6df6b88ef81
|
||||
Create Date: 2025-01-29 10:54:22.141765
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "33ea50e88f24"
|
||||
down_revision = "a6df6b88ef81"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Safely drop constraints if exists
|
||||
op.execute(
|
||||
"""
|
||||
ALTER TABLE inputprompt__user
|
||||
DROP CONSTRAINT IF EXISTS inputprompt__user_input_prompt_id_fkey
|
||||
"""
|
||||
)
|
||||
op.execute(
|
||||
"""
|
||||
ALTER TABLE inputprompt__user
|
||||
DROP CONSTRAINT IF EXISTS inputprompt__user_user_id_fkey
|
||||
"""
|
||||
)
|
||||
|
||||
# Recreate with ON DELETE CASCADE
|
||||
op.create_foreign_key(
|
||||
"inputprompt__user_input_prompt_id_fkey",
|
||||
"inputprompt__user",
|
||||
"inputprompt",
|
||||
["input_prompt_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
op.create_foreign_key(
|
||||
"inputprompt__user_user_id_fkey",
|
||||
"inputprompt__user",
|
||||
"user",
|
||||
["user_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Drop the new FKs with ondelete
|
||||
op.drop_constraint(
|
||||
"inputprompt__user_input_prompt_id_fkey",
|
||||
"inputprompt__user",
|
||||
type_="foreignkey",
|
||||
)
|
||||
op.drop_constraint(
|
||||
"inputprompt__user_user_id_fkey",
|
||||
"inputprompt__user",
|
||||
type_="foreignkey",
|
||||
)
|
||||
|
||||
# Recreate them without cascading
|
||||
op.create_foreign_key(
|
||||
"inputprompt__user_input_prompt_id_fkey",
|
||||
"inputprompt__user",
|
||||
"inputprompt",
|
||||
["input_prompt_id"],
|
||||
["id"],
|
||||
)
|
||||
op.create_foreign_key(
|
||||
"inputprompt__user_user_id_fkey",
|
||||
"inputprompt__user",
|
||||
"user",
|
||||
["user_id"],
|
||||
["id"],
|
||||
)
|
||||
@@ -1,37 +0,0 @@
|
||||
"""lowercase_user_emails
|
||||
|
||||
Revision ID: 4d58345da04a
|
||||
Revises: f1ca58b2f2ec
|
||||
Create Date: 2025-01-29 07:48:46.784041
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
from sqlalchemy.sql import text
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "4d58345da04a"
|
||||
down_revision = "f1ca58b2f2ec"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Get database connection
|
||||
connection = op.get_bind()
|
||||
|
||||
# Update all user emails to lowercase
|
||||
connection.execute(
|
||||
text(
|
||||
"""
|
||||
UPDATE "user"
|
||||
SET email = LOWER(email)
|
||||
WHERE email != LOWER(email)
|
||||
"""
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Cannot restore original case of emails
|
||||
pass
|
||||
@@ -1,29 +0,0 @@
|
||||
"""remove recent assistants
|
||||
|
||||
Revision ID: a6df6b88ef81
|
||||
Revises: 4d58345da04a
|
||||
Create Date: 2025-01-29 10:25:52.790407
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "a6df6b88ef81"
|
||||
down_revision = "4d58345da04a"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.drop_column("user", "recent_assistants")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.add_column(
|
||||
"user",
|
||||
sa.Column(
|
||||
"recent_assistants", postgresql.JSONB(), server_default="[]", nullable=False
|
||||
),
|
||||
)
|
||||
@@ -14,8 +14,6 @@ def _build_group_member_email_map(
|
||||
) -> dict[str, set[str]]:
|
||||
group_member_emails: dict[str, set[str]] = {}
|
||||
for user_result in confluence_client.paginated_cql_user_retrieval():
|
||||
logger.debug(f"Processing groups for user: {user_result}")
|
||||
|
||||
user = user_result.get("user", {})
|
||||
if not user:
|
||||
logger.warning(f"user result missing user field: {user_result}")
|
||||
@@ -35,17 +33,10 @@ def _build_group_member_email_map(
|
||||
logger.warning(f"user result missing email field: {user_result}")
|
||||
continue
|
||||
|
||||
all_users_groups: set[str] = set()
|
||||
for group in confluence_client.paginated_groups_by_user_retrieval(user):
|
||||
# group name uniqueness is enforced by Confluence, so we can use it as a group ID
|
||||
group_id = group["name"]
|
||||
group_member_emails.setdefault(group_id, set()).add(email)
|
||||
all_users_groups.add(group_id)
|
||||
|
||||
if not group_member_emails:
|
||||
logger.warning(f"No groups found for user with email: {email}")
|
||||
else:
|
||||
logger.debug(f"Found groups {all_users_groups} for user with email {email}")
|
||||
|
||||
return group_member_emails
|
||||
|
||||
|
||||
@@ -111,7 +111,6 @@ async def login_as_anonymous_user(
|
||||
token = generate_anonymous_user_jwt_token(tenant_id)
|
||||
|
||||
response = Response()
|
||||
response.delete_cookie("fastapiusersauth")
|
||||
response.set_cookie(
|
||||
key=ANONYMOUS_USER_COOKIE_NAME,
|
||||
value=token,
|
||||
|
||||
@@ -58,7 +58,6 @@ class UserGroup(BaseModel):
|
||||
credential=CredentialSnapshot.from_credential_db_model(
|
||||
cc_pair_relationship.cc_pair.credential
|
||||
),
|
||||
access_type=cc_pair_relationship.cc_pair.access_type,
|
||||
)
|
||||
for cc_pair_relationship in user_group_model.cc_pair_relationships
|
||||
if cc_pair_relationship.is_current
|
||||
|
||||
@@ -42,10 +42,6 @@ class UserCreate(schemas.BaseUserCreate):
|
||||
tenant_id: str | None = None
|
||||
|
||||
|
||||
class UserUpdateWithRole(schemas.BaseUserUpdate):
|
||||
role: UserRole
|
||||
|
||||
|
||||
class UserUpdate(schemas.BaseUserUpdate):
|
||||
"""
|
||||
Role updates are not allowed through the user update endpoint for security reasons
|
||||
|
||||
@@ -57,7 +57,7 @@ from onyx.auth.invited_users import get_invited_users
|
||||
from onyx.auth.schemas import AuthBackend
|
||||
from onyx.auth.schemas import UserCreate
|
||||
from onyx.auth.schemas import UserRole
|
||||
from onyx.auth.schemas import UserUpdateWithRole
|
||||
from onyx.auth.schemas import UserUpdate
|
||||
from onyx.configs.app_configs import AUTH_BACKEND
|
||||
from onyx.configs.app_configs import AUTH_COOKIE_EXPIRE_TIME_SECONDS
|
||||
from onyx.configs.app_configs import AUTH_TYPE
|
||||
@@ -216,6 +216,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
|
||||
reset_password_token_secret = USER_AUTH_SECRET
|
||||
verification_token_secret = USER_AUTH_SECRET
|
||||
verification_token_lifetime_seconds = AUTH_COOKIE_EXPIRE_TIME_SECONDS
|
||||
|
||||
user_db: SQLAlchemyUserDatabase[User, uuid.UUID]
|
||||
|
||||
async def create(
|
||||
@@ -245,8 +246,10 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
|
||||
referral_source=referral_source,
|
||||
request=request,
|
||||
)
|
||||
|
||||
async with get_async_session_with_tenant(tenant_id) as db_session:
|
||||
token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)
|
||||
|
||||
verify_email_is_invited(user_create.email)
|
||||
verify_email_domain(user_create.email)
|
||||
if MULTI_TENANT:
|
||||
@@ -265,16 +268,16 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
|
||||
user_create.role = UserRole.ADMIN
|
||||
else:
|
||||
user_create.role = UserRole.BASIC
|
||||
|
||||
try:
|
||||
user = await super().create(user_create, safe=safe, request=request) # type: ignore
|
||||
except exceptions.UserAlreadyExists:
|
||||
user = await self.get_by_email(user_create.email)
|
||||
# Handle case where user has used product outside of web and is now creating an account through web
|
||||
if not user.role.is_web_login() and user_create.role.is_web_login():
|
||||
user_update = UserUpdateWithRole(
|
||||
user_update = UserUpdate(
|
||||
password=user_create.password,
|
||||
is_verified=user_create.is_verified,
|
||||
role=user_create.role,
|
||||
)
|
||||
user = await self.update(user_update, user)
|
||||
else:
|
||||
@@ -282,6 +285,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
|
||||
|
||||
finally:
|
||||
CURRENT_TENANT_ID_CONTEXTVAR.reset(token)
|
||||
|
||||
return user
|
||||
|
||||
async def validate_password(self, password: str, _: schemas.UC | models.UP) -> None:
|
||||
|
||||
@@ -24,7 +24,6 @@ from onyx.configs.constants import ONYX_CLOUD_CELERY_TASK_PREFIX
|
||||
from onyx.configs.constants import OnyxRedisLocks
|
||||
from onyx.db.engine import get_sqlalchemy_engine
|
||||
from onyx.document_index.vespa.shared_utils.utils import wait_for_vespa_with_timeout
|
||||
from onyx.httpx.httpx_pool import HttpxPool
|
||||
from onyx.redis.redis_connector import RedisConnector
|
||||
from onyx.redis.redis_connector_credential_pair import RedisConnectorCredentialPair
|
||||
from onyx.redis.redis_connector_delete import RedisConnectorDelete
|
||||
@@ -317,8 +316,6 @@ def on_worker_ready(sender: Any, **kwargs: Any) -> None:
|
||||
|
||||
|
||||
def on_worker_shutdown(sender: Any, **kwargs: Any) -> None:
|
||||
HttpxPool.close_all()
|
||||
|
||||
if not celery_is_worker_primary(sender):
|
||||
return
|
||||
|
||||
|
||||
@@ -10,10 +10,6 @@ from celery.signals import worker_ready
|
||||
from celery.signals import worker_shutdown
|
||||
|
||||
import onyx.background.celery.apps.app_base as app_base
|
||||
from onyx.background.celery.celery_utils import httpx_init_vespa_pool
|
||||
from onyx.configs.app_configs import MANAGED_VESPA
|
||||
from onyx.configs.app_configs import VESPA_CLOUD_CERT_PATH
|
||||
from onyx.configs.app_configs import VESPA_CLOUD_KEY_PATH
|
||||
from onyx.configs.constants import POSTGRES_CELERY_WORKER_LIGHT_APP_NAME
|
||||
from onyx.db.engine import SqlEngine
|
||||
from onyx.utils.logger import setup_logger
|
||||
@@ -58,23 +54,12 @@ def on_celeryd_init(sender: str, conf: Any = None, **kwargs: Any) -> None:
|
||||
|
||||
@worker_init.connect
|
||||
def on_worker_init(sender: Worker, **kwargs: Any) -> None:
|
||||
EXTRA_CONCURRENCY = 8 # small extra fudge factor for connection limits
|
||||
|
||||
logger.info("worker_init signal received.")
|
||||
|
||||
logger.info(f"Concurrency: {sender.concurrency}") # type: ignore
|
||||
|
||||
SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_LIGHT_APP_NAME)
|
||||
SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=EXTRA_CONCURRENCY) # type: ignore
|
||||
|
||||
if MANAGED_VESPA:
|
||||
httpx_init_vespa_pool(
|
||||
sender.concurrency + EXTRA_CONCURRENCY, # type: ignore
|
||||
ssl_cert=VESPA_CLOUD_CERT_PATH,
|
||||
ssl_key=VESPA_CLOUD_KEY_PATH,
|
||||
)
|
||||
else:
|
||||
httpx_init_vespa_pool(sender.concurrency + EXTRA_CONCURRENCY) # type: ignore
|
||||
SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=8) # type: ignore
|
||||
|
||||
app_base.wait_for_redis(sender, **kwargs)
|
||||
app_base.wait_for_db(sender, **kwargs)
|
||||
|
||||
@@ -1,13 +1,10 @@
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
|
||||
import httpx
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.configs.app_configs import MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE
|
||||
from onyx.configs.app_configs import VESPA_REQUEST_TIMEOUT
|
||||
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import (
|
||||
rate_limit_builder,
|
||||
)
|
||||
@@ -20,7 +17,6 @@ from onyx.db.connector_credential_pair import get_connector_credential_pair
|
||||
from onyx.db.enums import ConnectorCredentialPairStatus
|
||||
from onyx.db.enums import TaskStatus
|
||||
from onyx.db.models import TaskQueueState
|
||||
from onyx.httpx.httpx_pool import HttpxPool
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.redis.redis_connector import RedisConnector
|
||||
from onyx.server.documents.models import DeletionAttemptSnapshot
|
||||
@@ -158,25 +154,3 @@ def celery_is_worker_primary(worker: Any) -> bool:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def httpx_init_vespa_pool(
|
||||
max_keepalive_connections: int,
|
||||
timeout: int = VESPA_REQUEST_TIMEOUT,
|
||||
ssl_cert: str | None = None,
|
||||
ssl_key: str | None = None,
|
||||
) -> None:
|
||||
httpx_cert = None
|
||||
httpx_verify = False
|
||||
if ssl_cert and ssl_key:
|
||||
httpx_cert = cast(tuple[str, str], (ssl_cert, ssl_key))
|
||||
httpx_verify = True
|
||||
|
||||
HttpxPool.init_client(
|
||||
name="vespa",
|
||||
cert=httpx_cert,
|
||||
verify=httpx_verify,
|
||||
timeout=timeout,
|
||||
http2=False,
|
||||
limits=httpx.Limits(max_keepalive_connections=max_keepalive_connections),
|
||||
)
|
||||
|
||||
@@ -11,7 +11,6 @@ from celery import Task
|
||||
from celery.exceptions import SoftTimeLimitExceeded
|
||||
from redis import Redis
|
||||
from redis.lock import Lock as RedisLock
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from ee.onyx.db.connector_credential_pair import get_all_auto_sync_cc_pairs
|
||||
from ee.onyx.db.document import upsert_document_external_perms
|
||||
@@ -32,17 +31,12 @@ from onyx.configs.constants import OnyxCeleryPriority
|
||||
from onyx.configs.constants import OnyxCeleryQueues
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.configs.constants import OnyxRedisLocks
|
||||
from onyx.db.connector import mark_cc_pair_as_permissions_synced
|
||||
from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
|
||||
from onyx.db.document import upsert_document_by_connector_credential_pair
|
||||
from onyx.db.engine import get_session_with_tenant
|
||||
from onyx.db.enums import AccessType
|
||||
from onyx.db.enums import ConnectorCredentialPairStatus
|
||||
from onyx.db.enums import SyncStatus
|
||||
from onyx.db.enums import SyncType
|
||||
from onyx.db.models import ConnectorCredentialPair
|
||||
from onyx.db.sync_record import insert_sync_record
|
||||
from onyx.db.sync_record import update_sync_record_status
|
||||
from onyx.db.users import batch_add_ext_perm_user_if_not_exists
|
||||
from onyx.redis.redis_connector import RedisConnector
|
||||
from onyx.redis.redis_connector_doc_perm_sync import (
|
||||
@@ -63,9 +57,6 @@ LIGHT_SOFT_TIME_LIMIT = 105
|
||||
LIGHT_TIME_LIMIT = LIGHT_SOFT_TIME_LIMIT + 15
|
||||
|
||||
|
||||
"""Jobs / utils for kicking off doc permissions sync tasks."""
|
||||
|
||||
|
||||
def _is_external_doc_permissions_sync_due(cc_pair: ConnectorCredentialPair) -> bool:
|
||||
"""Returns boolean indicating if external doc permissions sync is due."""
|
||||
|
||||
@@ -183,19 +174,6 @@ def try_creating_permissions_sync_task(
|
||||
|
||||
custom_task_id = f"{redis_connector.permissions.generator_task_key}_{uuid4()}"
|
||||
|
||||
# create before setting fence to avoid race condition where the monitoring
|
||||
# task updates the sync record before it is created
|
||||
with get_session_with_tenant(tenant_id) as db_session:
|
||||
insert_sync_record(
|
||||
db_session=db_session,
|
||||
entity_id=cc_pair_id,
|
||||
sync_type=SyncType.EXTERNAL_PERMISSIONS,
|
||||
)
|
||||
|
||||
# set a basic fence to start
|
||||
payload = RedisConnectorPermissionSyncPayload(started=None, celery_task_id=None)
|
||||
redis_connector.permissions.set_fence(payload)
|
||||
|
||||
result = app.send_task(
|
||||
OnyxCeleryTask.CONNECTOR_PERMISSION_SYNC_GENERATOR_TASK,
|
||||
kwargs=dict(
|
||||
@@ -207,8 +185,11 @@ def try_creating_permissions_sync_task(
|
||||
priority=OnyxCeleryPriority.HIGH,
|
||||
)
|
||||
|
||||
# fill in the celery task id
|
||||
payload.celery_task_id = result.id
|
||||
# set a basic fence to start
|
||||
payload = RedisConnectorPermissionSyncPayload(
|
||||
started=None, celery_task_id=result.id
|
||||
)
|
||||
|
||||
redis_connector.permissions.set_fence(payload)
|
||||
except Exception:
|
||||
task_logger.exception(f"Unexpected exception: cc_pair={cc_pair_id}")
|
||||
@@ -418,53 +399,3 @@ def update_external_document_permissions_task(
|
||||
f"Error Syncing Document Permissions: connector_id={connector_id} doc_id={doc_id}"
|
||||
)
|
||||
return False
|
||||
|
||||
|
||||
"""Monitoring CCPair permissions utils, called in monitor_vespa_sync"""
|
||||
|
||||
|
||||
def monitor_ccpair_permissions_taskset(
|
||||
tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session
|
||||
) -> None:
|
||||
fence_key = key_bytes.decode("utf-8")
|
||||
cc_pair_id_str = RedisConnector.get_id_from_fence_key(fence_key)
|
||||
if cc_pair_id_str is None:
|
||||
task_logger.warning(
|
||||
f"monitor_ccpair_permissions_taskset: could not parse cc_pair_id from {fence_key}"
|
||||
)
|
||||
return
|
||||
|
||||
cc_pair_id = int(cc_pair_id_str)
|
||||
|
||||
redis_connector = RedisConnector(tenant_id, cc_pair_id)
|
||||
if not redis_connector.permissions.fenced:
|
||||
return
|
||||
|
||||
initial = redis_connector.permissions.generator_complete
|
||||
if initial is None:
|
||||
return
|
||||
|
||||
remaining = redis_connector.permissions.get_remaining()
|
||||
task_logger.info(
|
||||
f"Permissions sync progress: cc_pair={cc_pair_id} remaining={remaining} initial={initial}"
|
||||
)
|
||||
if remaining > 0:
|
||||
return
|
||||
|
||||
payload: RedisConnectorPermissionSyncPayload | None = (
|
||||
redis_connector.permissions.payload
|
||||
)
|
||||
start_time: datetime | None = payload.started if payload else None
|
||||
|
||||
mark_cc_pair_as_permissions_synced(db_session, int(cc_pair_id), start_time)
|
||||
task_logger.info(f"Successfully synced permissions for cc_pair={cc_pair_id}")
|
||||
|
||||
update_sync_record_status(
|
||||
db_session=db_session,
|
||||
entity_id=cc_pair_id,
|
||||
sync_type=SyncType.EXTERNAL_PERMISSIONS,
|
||||
sync_status=SyncStatus.SUCCESS,
|
||||
num_docs_synced=initial,
|
||||
)
|
||||
|
||||
redis_connector.permissions.reset()
|
||||
|
||||
@@ -33,11 +33,7 @@ from onyx.db.connector_credential_pair import get_connector_credential_pair_from
|
||||
from onyx.db.engine import get_session_with_tenant
|
||||
from onyx.db.enums import AccessType
|
||||
from onyx.db.enums import ConnectorCredentialPairStatus
|
||||
from onyx.db.enums import SyncStatus
|
||||
from onyx.db.enums import SyncType
|
||||
from onyx.db.models import ConnectorCredentialPair
|
||||
from onyx.db.sync_record import insert_sync_record
|
||||
from onyx.db.sync_record import update_sync_record_status
|
||||
from onyx.redis.redis_connector import RedisConnector
|
||||
from onyx.redis.redis_connector_ext_group_sync import (
|
||||
RedisConnectorExternalGroupSyncPayload,
|
||||
@@ -204,15 +200,6 @@ def try_creating_external_group_sync_task(
|
||||
celery_task_id=result.id,
|
||||
)
|
||||
|
||||
# create before setting fence to avoid race condition where the monitoring
|
||||
# task updates the sync record before it is created
|
||||
with get_session_with_tenant(tenant_id) as db_session:
|
||||
insert_sync_record(
|
||||
db_session=db_session,
|
||||
entity_id=cc_pair_id,
|
||||
sync_type=SyncType.EXTERNAL_GROUP,
|
||||
)
|
||||
|
||||
redis_connector.external_group_sync.set_fence(payload)
|
||||
|
||||
except Exception:
|
||||
@@ -302,26 +289,11 @@ def connector_external_group_sync_generator_task(
|
||||
)
|
||||
|
||||
mark_cc_pair_as_external_group_synced(db_session, cc_pair.id)
|
||||
|
||||
update_sync_record_status(
|
||||
db_session=db_session,
|
||||
entity_id=cc_pair_id,
|
||||
sync_type=SyncType.EXTERNAL_GROUP,
|
||||
sync_status=SyncStatus.SUCCESS,
|
||||
)
|
||||
except Exception as e:
|
||||
task_logger.exception(
|
||||
f"Failed to run external group sync: cc_pair={cc_pair_id}"
|
||||
)
|
||||
|
||||
with get_session_with_tenant(tenant_id) as db_session:
|
||||
update_sync_record_status(
|
||||
db_session=db_session,
|
||||
entity_id=cc_pair_id,
|
||||
sync_type=SyncType.EXTERNAL_GROUP,
|
||||
sync_status=SyncStatus.FAILED,
|
||||
)
|
||||
|
||||
redis_connector.external_group_sync.generator_clear()
|
||||
redis_connector.external_group_sync.taskset_clear()
|
||||
raise e
|
||||
|
||||
@@ -15,7 +15,6 @@ from redis import Redis
|
||||
from redis.lock import Lock as RedisLock
|
||||
|
||||
from onyx.background.celery.apps.app_base import task_logger
|
||||
from onyx.background.celery.celery_utils import httpx_init_vespa_pool
|
||||
from onyx.background.celery.tasks.indexing.utils import _should_index
|
||||
from onyx.background.celery.tasks.indexing.utils import get_unfenced_index_attempt_ids
|
||||
from onyx.background.celery.tasks.indexing.utils import IndexingCallback
|
||||
@@ -23,9 +22,6 @@ from onyx.background.celery.tasks.indexing.utils import try_creating_indexing_ta
|
||||
from onyx.background.celery.tasks.indexing.utils import validate_indexing_fences
|
||||
from onyx.background.indexing.job_client import SimpleJobClient
|
||||
from onyx.background.indexing.run_indexing import run_indexing_entrypoint
|
||||
from onyx.configs.app_configs import MANAGED_VESPA
|
||||
from onyx.configs.app_configs import VESPA_CLOUD_CERT_PATH
|
||||
from onyx.configs.app_configs import VESPA_CLOUD_KEY_PATH
|
||||
from onyx.configs.constants import CELERY_GENERIC_BEAT_LOCK_TIMEOUT
|
||||
from onyx.configs.constants import CELERY_INDEXING_LOCK_TIMEOUT
|
||||
from onyx.configs.constants import CELERY_TASK_WAIT_FOR_FENCE_TIMEOUT
|
||||
@@ -41,7 +37,8 @@ from onyx.db.index_attempt import get_index_attempt
|
||||
from onyx.db.index_attempt import get_last_attempt_for_cc_pair
|
||||
from onyx.db.index_attempt import mark_attempt_canceled
|
||||
from onyx.db.index_attempt import mark_attempt_failed
|
||||
from onyx.db.search_settings import get_active_search_settings_list
|
||||
from onyx.db.models import SearchSettings
|
||||
from onyx.db.search_settings import get_active_search_settings
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.db.swap_index import check_index_swap
|
||||
from onyx.natural_language_processing.search_nlp_models import EmbeddingModel
|
||||
@@ -124,7 +121,9 @@ def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
|
||||
|
||||
redis_connector = RedisConnector(tenant_id, cc_pair_id)
|
||||
with get_session_with_tenant(tenant_id) as db_session:
|
||||
search_settings_list = get_active_search_settings_list(db_session)
|
||||
search_settings_list: list[SearchSettings] = get_active_search_settings(
|
||||
db_session
|
||||
)
|
||||
for search_settings_instance in search_settings_list:
|
||||
redis_connector_index = redis_connector.new_index(
|
||||
search_settings_instance.id
|
||||
@@ -304,14 +303,6 @@ def connector_indexing_task(
|
||||
attempt_found = False
|
||||
n_final_progress: int | None = None
|
||||
|
||||
# 20 is the documented default for httpx max_keepalive_connections
|
||||
if MANAGED_VESPA:
|
||||
httpx_init_vespa_pool(
|
||||
20, ssl_cert=VESPA_CLOUD_CERT_PATH, ssl_key=VESPA_CLOUD_KEY_PATH
|
||||
)
|
||||
else:
|
||||
httpx_init_vespa_pool(20)
|
||||
|
||||
redis_connector = RedisConnector(tenant_id, cc_pair_id)
|
||||
redis_connector_index = redis_connector.new_index(search_settings_id)
|
||||
|
||||
|
||||
@@ -34,7 +34,7 @@ from onyx.db.models import DocumentSet
|
||||
from onyx.db.models import IndexAttempt
|
||||
from onyx.db.models import SyncRecord
|
||||
from onyx.db.models import UserGroup
|
||||
from onyx.db.search_settings import get_active_search_settings_list
|
||||
from onyx.db.search_settings import get_active_search_settings
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.redis.redis_pool import redis_lock_dump
|
||||
from onyx.utils.telemetry import optional_telemetry
|
||||
@@ -58,11 +58,6 @@ _SYNC_START_LATENCY_KEY_FMT = (
|
||||
"sync_start_latency:{sync_type}:{entity_id}:{sync_record_id}"
|
||||
)
|
||||
|
||||
_CONNECTOR_START_TIME_KEY_FMT = "connector_start_time:{cc_pair_id}:{index_attempt_id}"
|
||||
_CONNECTOR_END_TIME_KEY_FMT = "connector_end_time:{cc_pair_id}:{index_attempt_id}"
|
||||
_SYNC_START_TIME_KEY_FMT = "sync_start_time:{sync_type}:{entity_id}:{sync_record_id}"
|
||||
_SYNC_END_TIME_KEY_FMT = "sync_end_time:{sync_type}:{entity_id}:{sync_record_id}"
|
||||
|
||||
|
||||
def _mark_metric_as_emitted(redis_std: Redis, key: str) -> None:
|
||||
"""Mark a metric as having been emitted by setting a Redis key with expiration"""
|
||||
@@ -308,6 +303,8 @@ def _build_connector_final_metrics(
|
||||
)
|
||||
)
|
||||
|
||||
_mark_metric_as_emitted(redis_std, metric_key)
|
||||
|
||||
return metrics
|
||||
|
||||
|
||||
@@ -318,13 +315,13 @@ def _collect_connector_metrics(db_session: Session, redis_std: Redis) -> list[Me
|
||||
# Get all connector credential pairs
|
||||
cc_pairs = db_session.scalars(select(ConnectorCredentialPair)).all()
|
||||
# Might be more than one search setting, or just one
|
||||
active_search_settings_list = get_active_search_settings_list(db_session)
|
||||
active_search_settings = get_active_search_settings(db_session)
|
||||
|
||||
metrics = []
|
||||
|
||||
# If you want to process each cc_pair against each search setting:
|
||||
for cc_pair in cc_pairs:
|
||||
for search_settings in active_search_settings_list:
|
||||
for search_settings in active_search_settings:
|
||||
recent_attempts = (
|
||||
db_session.query(IndexAttempt)
|
||||
.filter(
|
||||
@@ -347,52 +344,6 @@ def _collect_connector_metrics(db_session: Session, redis_std: Redis) -> list[Me
|
||||
if one_hour_ago > most_recent_attempt.time_created:
|
||||
continue
|
||||
|
||||
# Build a job_id for correlation
|
||||
job_id = build_job_id(
|
||||
"connector", str(cc_pair.id), str(most_recent_attempt.id)
|
||||
)
|
||||
|
||||
# Add raw start time metric if available
|
||||
if most_recent_attempt.time_started:
|
||||
start_time_key = _CONNECTOR_START_TIME_KEY_FMT.format(
|
||||
cc_pair_id=cc_pair.id,
|
||||
index_attempt_id=most_recent_attempt.id,
|
||||
)
|
||||
metrics.append(
|
||||
Metric(
|
||||
key=start_time_key,
|
||||
name="connector_start_time",
|
||||
value=most_recent_attempt.time_started.timestamp(),
|
||||
tags={
|
||||
"job_id": job_id,
|
||||
"connector_id": str(cc_pair.connector.id),
|
||||
"source": str(cc_pair.connector.source),
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
# Add raw end time metric if available and in terminal state
|
||||
if (
|
||||
most_recent_attempt.status.is_terminal()
|
||||
and most_recent_attempt.time_updated
|
||||
):
|
||||
end_time_key = _CONNECTOR_END_TIME_KEY_FMT.format(
|
||||
cc_pair_id=cc_pair.id,
|
||||
index_attempt_id=most_recent_attempt.id,
|
||||
)
|
||||
metrics.append(
|
||||
Metric(
|
||||
key=end_time_key,
|
||||
name="connector_end_time",
|
||||
value=most_recent_attempt.time_updated.timestamp(),
|
||||
tags={
|
||||
"job_id": job_id,
|
||||
"connector_id": str(cc_pair.connector.id),
|
||||
"source": str(cc_pair.connector.source),
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
# Connector start latency
|
||||
start_latency_metric = _build_connector_start_latency_metric(
|
||||
cc_pair, most_recent_attempt, second_most_recent_attempt, redis_std
|
||||
@@ -414,10 +365,9 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
|
||||
"""
|
||||
Collect metrics for document set and group syncing:
|
||||
- Success/failure status
|
||||
- Start latency (for doc sets / user groups)
|
||||
- Start latency (always)
|
||||
- Duration & doc count (only if success)
|
||||
- Throughput (docs/min) (only if success)
|
||||
- Raw start/end times for each sync
|
||||
"""
|
||||
one_hour_ago = get_db_current_time(db_session) - timedelta(hours=1)
|
||||
|
||||
@@ -439,43 +389,6 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
|
||||
# Build a job_id for correlation
|
||||
job_id = build_job_id("sync_record", str(sync_record.id))
|
||||
|
||||
# Add raw start time metric
|
||||
start_time_key = _SYNC_START_TIME_KEY_FMT.format(
|
||||
sync_type=sync_record.sync_type,
|
||||
entity_id=sync_record.entity_id,
|
||||
sync_record_id=sync_record.id,
|
||||
)
|
||||
metrics.append(
|
||||
Metric(
|
||||
key=start_time_key,
|
||||
name="sync_start_time",
|
||||
value=sync_record.sync_start_time.timestamp(),
|
||||
tags={
|
||||
"job_id": job_id,
|
||||
"sync_type": str(sync_record.sync_type),
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
# Add raw end time metric if available
|
||||
if sync_record.sync_end_time:
|
||||
end_time_key = _SYNC_END_TIME_KEY_FMT.format(
|
||||
sync_type=sync_record.sync_type,
|
||||
entity_id=sync_record.entity_id,
|
||||
sync_record_id=sync_record.id,
|
||||
)
|
||||
metrics.append(
|
||||
Metric(
|
||||
key=end_time_key,
|
||||
name="sync_end_time",
|
||||
value=sync_record.sync_end_time.timestamp(),
|
||||
tags={
|
||||
"job_id": job_id,
|
||||
"sync_type": str(sync_record.sync_type),
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
# Emit a SUCCESS/FAIL boolean metric
|
||||
# Use a single Redis key to avoid re-emitting final metrics
|
||||
final_metric_key = _FINAL_METRIC_KEY_FMT.format(
|
||||
@@ -526,7 +439,7 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
|
||||
if duration_seconds is not None:
|
||||
metrics.append(
|
||||
Metric(
|
||||
key=final_metric_key,
|
||||
key=None,
|
||||
name="sync_duration_seconds",
|
||||
value=duration_seconds,
|
||||
tags={
|
||||
@@ -542,7 +455,7 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
|
||||
|
||||
metrics.append(
|
||||
Metric(
|
||||
key=final_metric_key,
|
||||
key=None,
|
||||
name="sync_doc_count",
|
||||
value=doc_count,
|
||||
tags={
|
||||
@@ -555,7 +468,7 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
|
||||
if sync_speed is not None:
|
||||
metrics.append(
|
||||
Metric(
|
||||
key=final_metric_key,
|
||||
key=None,
|
||||
name="sync_speed_docs_per_min",
|
||||
value=sync_speed,
|
||||
tags={
|
||||
@@ -569,6 +482,9 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
|
||||
f"Invalid sync record {sync_record.id} with no duration"
|
||||
)
|
||||
|
||||
# Mark final metrics as emitted so we don't re-emit
|
||||
_mark_metric_as_emitted(redis_std, final_metric_key)
|
||||
|
||||
# Emit start latency
|
||||
start_latency_key = _SYNC_START_LATENCY_KEY_FMT.format(
|
||||
sync_type=sync_record.sync_type,
|
||||
@@ -586,20 +502,22 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
|
||||
entity = db_session.scalar(
|
||||
select(UserGroup).where(UserGroup.id == sync_record.entity_id)
|
||||
)
|
||||
else:
|
||||
task_logger.info(
|
||||
f"Skipping sync record {sync_record.id} of type {sync_record.sync_type}."
|
||||
)
|
||||
continue
|
||||
|
||||
if entity is None:
|
||||
task_logger.error(
|
||||
f"Sync record of type {sync_record.sync_type} doesn't have an entity "
|
||||
f"associated with it (id={sync_record.entity_id}). Skipping start latency metric."
|
||||
f"Could not find entity for sync record {sync_record.id} "
|
||||
f"(type={sync_record.sync_type}, id={sync_record.entity_id})."
|
||||
)
|
||||
continue
|
||||
|
||||
# Calculate start latency in seconds:
|
||||
# (actual sync start) - (last modified time)
|
||||
if (
|
||||
entity is not None
|
||||
and entity.time_last_modified_by_user
|
||||
and sync_record.sync_start_time
|
||||
):
|
||||
if entity.time_last_modified_by_user and sync_record.sync_start_time:
|
||||
start_latency = (
|
||||
sync_record.sync_start_time - entity.time_last_modified_by_user
|
||||
).total_seconds()
|
||||
@@ -623,6 +541,8 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
|
||||
)
|
||||
)
|
||||
|
||||
_mark_metric_as_emitted(redis_std, start_latency_key)
|
||||
|
||||
return metrics
|
||||
|
||||
|
||||
@@ -687,12 +607,9 @@ def monitor_background_processes(self: Task, *, tenant_id: str | None) -> None:
|
||||
for metric_fn in metric_functions:
|
||||
metrics = metric_fn()
|
||||
for metric in metrics:
|
||||
# double check to make sure we aren't double-emitting metrics
|
||||
if metric.key is not None and not _has_metric_been_emitted(
|
||||
redis_std, metric.key
|
||||
):
|
||||
metric.log()
|
||||
metric.emit(tenant_id)
|
||||
metric.log()
|
||||
metric.emit(tenant_id)
|
||||
if metric.key:
|
||||
_mark_metric_as_emitted(redis_std, metric.key)
|
||||
|
||||
task_logger.info("Successfully collected background metrics")
|
||||
|
||||
@@ -25,18 +25,13 @@ from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.configs.constants import OnyxRedisLocks
|
||||
from onyx.connectors.factory import instantiate_connector
|
||||
from onyx.connectors.models import InputType
|
||||
from onyx.db.connector import mark_ccpair_as_pruned
|
||||
from onyx.db.connector_credential_pair import get_connector_credential_pair
|
||||
from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
|
||||
from onyx.db.connector_credential_pair import get_connector_credential_pairs
|
||||
from onyx.db.document import get_documents_for_connector_credential_pair
|
||||
from onyx.db.engine import get_session_with_tenant
|
||||
from onyx.db.enums import ConnectorCredentialPairStatus
|
||||
from onyx.db.enums import SyncStatus
|
||||
from onyx.db.enums import SyncType
|
||||
from onyx.db.models import ConnectorCredentialPair
|
||||
from onyx.db.sync_record import insert_sync_record
|
||||
from onyx.db.sync_record import update_sync_record_status
|
||||
from onyx.redis.redis_connector import RedisConnector
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.utils.logger import pruning_ctx
|
||||
@@ -45,9 +40,6 @@ from onyx.utils.logger import setup_logger
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
"""Jobs / utils for kicking off pruning tasks."""
|
||||
|
||||
|
||||
def _is_pruning_due(cc_pair: ConnectorCredentialPair) -> bool:
|
||||
"""Returns boolean indicating if pruning is due.
|
||||
|
||||
@@ -212,14 +204,6 @@ def try_creating_prune_generator_task(
|
||||
priority=OnyxCeleryPriority.LOW,
|
||||
)
|
||||
|
||||
# create before setting fence to avoid race condition where the monitoring
|
||||
# task updates the sync record before it is created
|
||||
insert_sync_record(
|
||||
db_session=db_session,
|
||||
entity_id=cc_pair.id,
|
||||
sync_type=SyncType.PRUNING,
|
||||
)
|
||||
|
||||
# set this only after all tasks have been added
|
||||
redis_connector.prune.set_fence(True)
|
||||
except Exception:
|
||||
@@ -364,52 +348,3 @@ def connector_pruning_generator_task(
|
||||
lock.release()
|
||||
|
||||
task_logger.info(f"Pruning generator finished: cc_pair={cc_pair_id}")
|
||||
|
||||
|
||||
"""Monitoring pruning utils, called in monitor_vespa_sync"""
|
||||
|
||||
|
||||
def monitor_ccpair_pruning_taskset(
|
||||
tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session
|
||||
) -> None:
|
||||
fence_key = key_bytes.decode("utf-8")
|
||||
cc_pair_id_str = RedisConnector.get_id_from_fence_key(fence_key)
|
||||
if cc_pair_id_str is None:
|
||||
task_logger.warning(
|
||||
f"monitor_ccpair_pruning_taskset: could not parse cc_pair_id from {fence_key}"
|
||||
)
|
||||
return
|
||||
|
||||
cc_pair_id = int(cc_pair_id_str)
|
||||
|
||||
redis_connector = RedisConnector(tenant_id, cc_pair_id)
|
||||
if not redis_connector.prune.fenced:
|
||||
return
|
||||
|
||||
initial = redis_connector.prune.generator_complete
|
||||
if initial is None:
|
||||
return
|
||||
|
||||
remaining = redis_connector.prune.get_remaining()
|
||||
task_logger.info(
|
||||
f"Connector pruning progress: cc_pair={cc_pair_id} remaining={remaining} initial={initial}"
|
||||
)
|
||||
if remaining > 0:
|
||||
return
|
||||
|
||||
mark_ccpair_as_pruned(int(cc_pair_id), db_session)
|
||||
task_logger.info(
|
||||
f"Successfully pruned connector credential pair. cc_pair={cc_pair_id}"
|
||||
)
|
||||
|
||||
update_sync_record_status(
|
||||
db_session=db_session,
|
||||
entity_id=cc_pair_id,
|
||||
sync_type=SyncType.PRUNING,
|
||||
sync_status=SyncStatus.SUCCESS,
|
||||
num_docs_synced=initial,
|
||||
)
|
||||
|
||||
redis_connector.prune.taskset_clear()
|
||||
redis_connector.prune.generator_clear()
|
||||
redis_connector.prune.set_fence(False)
|
||||
|
||||
@@ -27,10 +27,9 @@ from onyx.db.document import mark_document_as_synced
|
||||
from onyx.db.document_set import fetch_document_sets_for_document
|
||||
from onyx.db.engine import get_all_tenant_ids
|
||||
from onyx.db.engine import get_session_with_tenant
|
||||
from onyx.db.search_settings import get_active_search_settings
|
||||
from onyx.document_index.document_index_utils import get_both_index_names
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.document_index.interfaces import VespaDocumentFields
|
||||
from onyx.httpx.httpx_pool import HttpxPool
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.redis.redis_pool import redis_lock_dump
|
||||
from onyx.server.documents.models import ConnectorCredentialPairIdentifier
|
||||
@@ -80,11 +79,9 @@ def document_by_cc_pair_cleanup_task(
|
||||
action = "skip"
|
||||
chunks_affected = 0
|
||||
|
||||
active_search_settings = get_active_search_settings(db_session)
|
||||
curr_ind_name, sec_ind_name = get_both_index_names(db_session)
|
||||
doc_index = get_default_document_index(
|
||||
active_search_settings.primary,
|
||||
active_search_settings.secondary,
|
||||
httpx_client=HttpxPool.get("vespa"),
|
||||
primary_index_name=curr_ind_name, secondary_index_name=sec_ind_name
|
||||
)
|
||||
|
||||
retry_index = RetryDocumentIndex(doc_index)
|
||||
|
||||
@@ -24,10 +24,6 @@ from onyx.access.access import get_access_for_document
|
||||
from onyx.background.celery.apps.app_base import task_logger
|
||||
from onyx.background.celery.celery_redis import celery_get_queue_length
|
||||
from onyx.background.celery.celery_redis import celery_get_unacked_task_ids
|
||||
from onyx.background.celery.tasks.doc_permission_syncing.tasks import (
|
||||
monitor_ccpair_permissions_taskset,
|
||||
)
|
||||
from onyx.background.celery.tasks.pruning.tasks import monitor_ccpair_pruning_taskset
|
||||
from onyx.background.celery.tasks.shared.RetryDocumentIndex import RetryDocumentIndex
|
||||
from onyx.background.celery.tasks.shared.tasks import LIGHT_SOFT_TIME_LIMIT
|
||||
from onyx.background.celery.tasks.shared.tasks import LIGHT_TIME_LIMIT
|
||||
@@ -38,6 +34,8 @@ from onyx.configs.constants import OnyxCeleryQueues
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.configs.constants import OnyxRedisLocks
|
||||
from onyx.db.connector import fetch_connector_by_id
|
||||
from onyx.db.connector import mark_cc_pair_as_permissions_synced
|
||||
from onyx.db.connector import mark_ccpair_as_pruned
|
||||
from onyx.db.connector_credential_pair import add_deletion_failure_message
|
||||
from onyx.db.connector_credential_pair import (
|
||||
delete_connector_credential_pair__no_commit,
|
||||
@@ -63,17 +61,19 @@ from onyx.db.index_attempt import get_index_attempt
|
||||
from onyx.db.index_attempt import mark_attempt_failed
|
||||
from onyx.db.models import DocumentSet
|
||||
from onyx.db.models import UserGroup
|
||||
from onyx.db.search_settings import get_active_search_settings
|
||||
from onyx.db.sync_record import cleanup_sync_records
|
||||
from onyx.db.sync_record import insert_sync_record
|
||||
from onyx.db.sync_record import update_sync_record_status
|
||||
from onyx.document_index.document_index_utils import get_both_index_names
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.document_index.interfaces import VespaDocumentFields
|
||||
from onyx.httpx.httpx_pool import HttpxPool
|
||||
from onyx.redis.redis_connector import RedisConnector
|
||||
from onyx.redis.redis_connector_credential_pair import RedisConnectorCredentialPair
|
||||
from onyx.redis.redis_connector_delete import RedisConnectorDelete
|
||||
from onyx.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync
|
||||
from onyx.redis.redis_connector_doc_perm_sync import (
|
||||
RedisConnectorPermissionSyncPayload,
|
||||
)
|
||||
from onyx.redis.redis_connector_index import RedisConnectorIndex
|
||||
from onyx.redis.redis_connector_prune import RedisConnectorPrune
|
||||
from onyx.redis.redis_document_set import RedisDocumentSet
|
||||
@@ -652,6 +652,83 @@ def monitor_connector_deletion_taskset(
|
||||
redis_connector.delete.reset()
|
||||
|
||||
|
||||
def monitor_ccpair_pruning_taskset(
|
||||
tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session
|
||||
) -> None:
|
||||
fence_key = key_bytes.decode("utf-8")
|
||||
cc_pair_id_str = RedisConnector.get_id_from_fence_key(fence_key)
|
||||
if cc_pair_id_str is None:
|
||||
task_logger.warning(
|
||||
f"monitor_ccpair_pruning_taskset: could not parse cc_pair_id from {fence_key}"
|
||||
)
|
||||
return
|
||||
|
||||
cc_pair_id = int(cc_pair_id_str)
|
||||
|
||||
redis_connector = RedisConnector(tenant_id, cc_pair_id)
|
||||
if not redis_connector.prune.fenced:
|
||||
return
|
||||
|
||||
initial = redis_connector.prune.generator_complete
|
||||
if initial is None:
|
||||
return
|
||||
|
||||
remaining = redis_connector.prune.get_remaining()
|
||||
task_logger.info(
|
||||
f"Connector pruning progress: cc_pair={cc_pair_id} remaining={remaining} initial={initial}"
|
||||
)
|
||||
if remaining > 0:
|
||||
return
|
||||
|
||||
mark_ccpair_as_pruned(int(cc_pair_id), db_session)
|
||||
task_logger.info(
|
||||
f"Successfully pruned connector credential pair. cc_pair={cc_pair_id}"
|
||||
)
|
||||
|
||||
redis_connector.prune.taskset_clear()
|
||||
redis_connector.prune.generator_clear()
|
||||
redis_connector.prune.set_fence(False)
|
||||
|
||||
|
||||
def monitor_ccpair_permissions_taskset(
|
||||
tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session
|
||||
) -> None:
|
||||
fence_key = key_bytes.decode("utf-8")
|
||||
cc_pair_id_str = RedisConnector.get_id_from_fence_key(fence_key)
|
||||
if cc_pair_id_str is None:
|
||||
task_logger.warning(
|
||||
f"monitor_ccpair_permissions_taskset: could not parse cc_pair_id from {fence_key}"
|
||||
)
|
||||
return
|
||||
|
||||
cc_pair_id = int(cc_pair_id_str)
|
||||
|
||||
redis_connector = RedisConnector(tenant_id, cc_pair_id)
|
||||
if not redis_connector.permissions.fenced:
|
||||
return
|
||||
|
||||
initial = redis_connector.permissions.generator_complete
|
||||
if initial is None:
|
||||
return
|
||||
|
||||
remaining = redis_connector.permissions.get_remaining()
|
||||
task_logger.info(
|
||||
f"Permissions sync progress: cc_pair={cc_pair_id} remaining={remaining} initial={initial}"
|
||||
)
|
||||
if remaining > 0:
|
||||
return
|
||||
|
||||
payload: RedisConnectorPermissionSyncPayload | None = (
|
||||
redis_connector.permissions.payload
|
||||
)
|
||||
start_time: datetime | None = payload.started if payload else None
|
||||
|
||||
mark_cc_pair_as_permissions_synced(db_session, int(cc_pair_id), start_time)
|
||||
task_logger.info(f"Successfully synced permissions for cc_pair={cc_pair_id}")
|
||||
|
||||
redis_connector.permissions.reset()
|
||||
|
||||
|
||||
def monitor_ccpair_indexing_taskset(
|
||||
tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session
|
||||
) -> None:
|
||||
@@ -1019,11 +1096,9 @@ def vespa_metadata_sync_task(
|
||||
|
||||
try:
|
||||
with get_session_with_tenant(tenant_id) as db_session:
|
||||
active_search_settings = get_active_search_settings(db_session)
|
||||
curr_ind_name, sec_ind_name = get_both_index_names(db_session)
|
||||
doc_index = get_default_document_index(
|
||||
search_settings=active_search_settings.primary,
|
||||
secondary_search_settings=active_search_settings.secondary,
|
||||
httpx_client=HttpxPool.get("vespa"),
|
||||
primary_index_name=curr_ind_name, secondary_index_name=sec_ind_name
|
||||
)
|
||||
|
||||
retry_index = RetryDocumentIndex(doc_index)
|
||||
|
||||
@@ -35,7 +35,6 @@ from onyx.db.models import IndexAttempt
|
||||
from onyx.db.models import IndexingStatus
|
||||
from onyx.db.models import IndexModelStatus
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.httpx.httpx_pool import HttpxPool
|
||||
from onyx.indexing.embedder import DefaultIndexingEmbedder
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.indexing.indexing_pipeline import build_indexing_pipeline
|
||||
@@ -220,10 +219,9 @@ def _run_indexing(
|
||||
callback=callback,
|
||||
)
|
||||
|
||||
# Indexing is only done into one index at a time
|
||||
document_index = get_default_document_index(
|
||||
index_attempt_start.search_settings,
|
||||
None,
|
||||
httpx_client=HttpxPool.get("vespa"),
|
||||
primary_index_name=ctx.index_name, secondary_index_name=None
|
||||
)
|
||||
|
||||
indexing_pipeline = build_indexing_pipeline(
|
||||
|
||||
@@ -254,7 +254,6 @@ def _get_force_search_settings(
|
||||
and new_msg_req.retrieval_options.run_search
|
||||
== OptionalSearchSetting.ALWAYS,
|
||||
new_msg_req.search_doc_ids,
|
||||
new_msg_req.query_override is not None,
|
||||
DISABLE_LLM_CHOOSE_SEARCH,
|
||||
]
|
||||
)
|
||||
@@ -426,7 +425,9 @@ def stream_chat_message_objects(
|
||||
)
|
||||
|
||||
search_settings = get_current_search_settings(db_session)
|
||||
document_index = get_default_document_index(search_settings, None)
|
||||
document_index = get_default_document_index(
|
||||
primary_index_name=search_settings.index_name, secondary_index_name=None
|
||||
)
|
||||
|
||||
# Every chat Session begins with an empty root message
|
||||
root_message = get_or_create_root_message(
|
||||
@@ -498,6 +499,14 @@ def stream_chat_message_objects(
|
||||
f"existing assistant message id: {existing_assistant_message_id}"
|
||||
)
|
||||
|
||||
# Disable Query Rephrasing for the first message
|
||||
# This leads to a better first response since the LLM rephrasing the question
|
||||
# leads to worst search quality
|
||||
if not history_msgs:
|
||||
new_msg_req.query_override = (
|
||||
new_msg_req.query_override or new_msg_req.message
|
||||
)
|
||||
|
||||
# load all files needed for this chat chain in memory
|
||||
files = load_all_chat_files(
|
||||
history_msgs, new_msg_req.file_descriptors, db_session
|
||||
|
||||
@@ -478,12 +478,6 @@ INDEXING_SIZE_WARNING_THRESHOLD = int(
|
||||
# 0 disables this behavior and is the default.
|
||||
INDEXING_TRACER_INTERVAL = int(os.environ.get("INDEXING_TRACER_INTERVAL") or 0)
|
||||
|
||||
# Enable multi-threaded embedding model calls for parallel processing
|
||||
# Note: only applies for API-based embedding models
|
||||
INDEXING_EMBEDDING_MODEL_NUM_THREADS = int(
|
||||
os.environ.get("INDEXING_EMBEDDING_MODEL_NUM_THREADS") or 1
|
||||
)
|
||||
|
||||
# During an indexing attempt, specifies the number of batches which are allowed to
|
||||
# exception without aborting the attempt.
|
||||
INDEXING_EXCEPTION_LIMIT = int(os.environ.get("INDEXING_EXCEPTION_LIMIT") or 0)
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
from concurrent.futures import as_completed
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from io import BytesIO
|
||||
from typing import Any
|
||||
|
||||
@@ -22,9 +20,9 @@ from onyx.utils.logger import setup_logger
|
||||
logger = setup_logger()
|
||||
|
||||
# NOTE: all are made lowercase to avoid case sensitivity issues
|
||||
# These field types are considered metadata by default when
|
||||
# treat_all_non_attachment_fields_as_metadata is False
|
||||
DEFAULT_METADATA_FIELD_TYPES = {
|
||||
# these are the field types that are considered metadata rather
|
||||
# than sections
|
||||
_METADATA_FIELD_TYPES = {
|
||||
"singlecollaborator",
|
||||
"collaborator",
|
||||
"createdby",
|
||||
@@ -62,16 +60,12 @@ class AirtableConnector(LoadConnector):
|
||||
self,
|
||||
base_id: str,
|
||||
table_name_or_id: str,
|
||||
treat_all_non_attachment_fields_as_metadata: bool = False,
|
||||
batch_size: int = INDEX_BATCH_SIZE,
|
||||
) -> None:
|
||||
self.base_id = base_id
|
||||
self.table_name_or_id = table_name_or_id
|
||||
self.batch_size = batch_size
|
||||
self.airtable_client: AirtableApi | None = None
|
||||
self.treat_all_non_attachment_fields_as_metadata = (
|
||||
treat_all_non_attachment_fields_as_metadata
|
||||
)
|
||||
|
||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
||||
self.airtable_client = AirtableApi(credentials["airtable_access_token"])
|
||||
@@ -172,14 +166,8 @@ class AirtableConnector(LoadConnector):
|
||||
return [(str(field_info), default_link)]
|
||||
|
||||
def _should_be_metadata(self, field_type: str) -> bool:
|
||||
"""Determine if a field type should be treated as metadata.
|
||||
|
||||
When treat_all_non_attachment_fields_as_metadata is True, all fields except
|
||||
attachments are treated as metadata. Otherwise, only fields with types listed
|
||||
in DEFAULT_METADATA_FIELD_TYPES are treated as metadata."""
|
||||
if self.treat_all_non_attachment_fields_as_metadata:
|
||||
return field_type.lower() != "multipleattachments"
|
||||
return field_type.lower() in DEFAULT_METADATA_FIELD_TYPES
|
||||
"""Determine if a field type should be treated as metadata."""
|
||||
return field_type.lower() in _METADATA_FIELD_TYPES
|
||||
|
||||
def _process_field(
|
||||
self,
|
||||
@@ -245,7 +233,7 @@ class AirtableConnector(LoadConnector):
|
||||
record: RecordDict,
|
||||
table_schema: TableSchema,
|
||||
primary_field_name: str | None,
|
||||
) -> Document | None:
|
||||
) -> Document:
|
||||
"""Process a single Airtable record into a Document.
|
||||
|
||||
Args:
|
||||
@@ -276,11 +264,6 @@ class AirtableConnector(LoadConnector):
|
||||
field_val = fields.get(field_name)
|
||||
field_type = field_schema.type
|
||||
|
||||
logger.debug(
|
||||
f"Processing field '{field_name}' of type '{field_type}' "
|
||||
f"for record '{record_id}'."
|
||||
)
|
||||
|
||||
field_sections, field_metadata = self._process_field(
|
||||
field_id=field_schema.id,
|
||||
field_name=field_name,
|
||||
@@ -294,10 +277,6 @@ class AirtableConnector(LoadConnector):
|
||||
sections.extend(field_sections)
|
||||
metadata.update(field_metadata)
|
||||
|
||||
if not sections:
|
||||
logger.warning(f"No sections found for record {record_id}")
|
||||
return None
|
||||
|
||||
semantic_id = (
|
||||
f"{table_name}: {primary_field_value}"
|
||||
if primary_field_value
|
||||
@@ -334,45 +313,18 @@ class AirtableConnector(LoadConnector):
|
||||
primary_field_name = field.name
|
||||
break
|
||||
|
||||
logger.info(f"Starting to process Airtable records for {table.name}.")
|
||||
record_documents: list[Document] = []
|
||||
for record in records:
|
||||
document = self._process_record(
|
||||
record=record,
|
||||
table_schema=table_schema,
|
||||
primary_field_name=primary_field_name,
|
||||
)
|
||||
record_documents.append(document)
|
||||
|
||||
# Process records in parallel batches using ThreadPoolExecutor
|
||||
PARALLEL_BATCH_SIZE = 16
|
||||
max_workers = min(PARALLEL_BATCH_SIZE, len(records))
|
||||
|
||||
# Process records in batches
|
||||
for i in range(0, len(records), PARALLEL_BATCH_SIZE):
|
||||
batch_records = records[i : i + PARALLEL_BATCH_SIZE]
|
||||
record_documents: list[Document] = []
|
||||
|
||||
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
# Submit batch tasks
|
||||
future_to_record = {
|
||||
executor.submit(
|
||||
self._process_record,
|
||||
record=record,
|
||||
table_schema=table_schema,
|
||||
primary_field_name=primary_field_name,
|
||||
): record
|
||||
for record in batch_records
|
||||
}
|
||||
|
||||
# Wait for all tasks in this batch to complete
|
||||
for future in as_completed(future_to_record):
|
||||
record = future_to_record[future]
|
||||
try:
|
||||
document = future.result()
|
||||
if document:
|
||||
record_documents.append(document)
|
||||
except Exception as e:
|
||||
logger.exception(f"Failed to process record {record['id']}")
|
||||
raise e
|
||||
|
||||
# After batch is complete, yield if we've hit the batch size
|
||||
if len(record_documents) >= self.batch_size:
|
||||
yield record_documents
|
||||
record_documents = []
|
||||
|
||||
# Yield any remaining records
|
||||
if record_documents:
|
||||
yield record_documents
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
from onyx.connectors.interfaces import BaseConnector
|
||||
@@ -46,17 +45,7 @@ class ConnectorRunner:
|
||||
def run(self) -> GenerateDocumentsOutput:
|
||||
"""Adds additional exception logging to the connector."""
|
||||
try:
|
||||
start = time.monotonic()
|
||||
for batch in self.doc_batch_generator:
|
||||
# to know how long connector is taking
|
||||
logger.debug(
|
||||
f"Connector took {time.monotonic() - start} seconds to build a batch."
|
||||
)
|
||||
|
||||
yield batch
|
||||
|
||||
start = time.monotonic()
|
||||
|
||||
yield from self.doc_batch_generator
|
||||
except Exception:
|
||||
exc_type, _, exc_traceback = sys.exc_info()
|
||||
|
||||
|
||||
@@ -50,9 +50,6 @@ def _create_doc_from_transcript(transcript: dict) -> Document | None:
|
||||
current_link = ""
|
||||
current_text = ""
|
||||
|
||||
if transcript["sentences"] is None:
|
||||
return None
|
||||
|
||||
for sentence in transcript["sentences"]:
|
||||
if sentence["speaker_name"] != current_speaker_name:
|
||||
if current_speaker_name is not None:
|
||||
|
||||
@@ -150,16 +150,6 @@ class Document(DocumentBase):
|
||||
id: str # This must be unique or during indexing/reindexing, chunks will be overwritten
|
||||
source: DocumentSource
|
||||
|
||||
def get_total_char_length(self) -> int:
|
||||
"""Calculate the total character length of the document including sections, metadata, and identifiers."""
|
||||
section_length = sum(len(section.text) for section in self.sections)
|
||||
identifier_length = len(self.semantic_identifier) + len(self.title or "")
|
||||
metadata_length = sum(
|
||||
len(k) + len(v) if isinstance(v, str) else len(k) + sum(len(x) for x in v)
|
||||
for k, v in self.metadata.items()
|
||||
)
|
||||
return section_length + identifier_length + metadata_length
|
||||
|
||||
def to_short_descriptor(self) -> str:
|
||||
"""Used when logging the identity of a document"""
|
||||
return f"ID: '{self.id}'; Semantic ID: '{self.semantic_identifier}'"
|
||||
|
||||
@@ -1,14 +1,17 @@
|
||||
import io
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import field
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from typing import Any
|
||||
from typing import Optional
|
||||
from urllib.parse import unquote
|
||||
|
||||
import msal # type: ignore
|
||||
from office365.graph_client import GraphClient # type: ignore
|
||||
from office365.onedrive.driveitems.driveItem import DriveItem # type: ignore
|
||||
from pydantic import BaseModel
|
||||
from office365.onedrive.sites.site import Site # type: ignore
|
||||
|
||||
from onyx.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from onyx.configs.constants import DocumentSource
|
||||
@@ -27,25 +30,16 @@ from onyx.utils.logger import setup_logger
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
class SiteDescriptor(BaseModel):
|
||||
"""Data class for storing SharePoint site information.
|
||||
|
||||
Args:
|
||||
url: The base site URL (e.g. https://danswerai.sharepoint.com/sites/sharepoint-tests)
|
||||
drive_name: The name of the drive to access (e.g. "Shared Documents", "Other Library")
|
||||
If None, all drives will be accessed.
|
||||
folder_path: The folder path within the drive to access (e.g. "test/nested with spaces")
|
||||
If None, all folders will be accessed.
|
||||
"""
|
||||
|
||||
url: str
|
||||
drive_name: str | None
|
||||
folder_path: str | None
|
||||
@dataclass
|
||||
class SiteData:
|
||||
url: str | None
|
||||
folder: Optional[str]
|
||||
sites: list = field(default_factory=list)
|
||||
driveitems: list = field(default_factory=list)
|
||||
|
||||
|
||||
def _convert_driveitem_to_document(
|
||||
driveitem: DriveItem,
|
||||
drive_name: str,
|
||||
) -> Document:
|
||||
file_text = extract_file_text(
|
||||
file=io.BytesIO(driveitem.get_content().execute_query().value),
|
||||
@@ -65,7 +59,7 @@ def _convert_driveitem_to_document(
|
||||
email=driveitem.last_modified_by.user.email,
|
||||
)
|
||||
],
|
||||
metadata={"drive": drive_name},
|
||||
metadata={},
|
||||
)
|
||||
return doc
|
||||
|
||||
@@ -77,179 +71,106 @@ class SharepointConnector(LoadConnector, PollConnector):
|
||||
sites: list[str] = [],
|
||||
) -> None:
|
||||
self.batch_size = batch_size
|
||||
self._graph_client: GraphClient | None = None
|
||||
self.site_descriptors: list[SiteDescriptor] = self._extract_site_and_drive_info(
|
||||
sites
|
||||
)
|
||||
self.msal_app: msal.ConfidentialClientApplication | None = None
|
||||
|
||||
@property
|
||||
def graph_client(self) -> GraphClient:
|
||||
if self._graph_client is None:
|
||||
raise ConnectorMissingCredentialError("Sharepoint")
|
||||
|
||||
return self._graph_client
|
||||
self.graph_client: GraphClient | None = None
|
||||
self.site_data: list[SiteData] = self._extract_site_and_folder(sites)
|
||||
|
||||
@staticmethod
|
||||
def _extract_site_and_drive_info(site_urls: list[str]) -> list[SiteDescriptor]:
|
||||
def _extract_site_and_folder(site_urls: list[str]) -> list[SiteData]:
|
||||
site_data_list = []
|
||||
for url in site_urls:
|
||||
parts = url.strip().split("/")
|
||||
if "sites" in parts:
|
||||
sites_index = parts.index("sites")
|
||||
site_url = "/".join(parts[: sites_index + 2])
|
||||
remaining_parts = parts[sites_index + 2 :]
|
||||
|
||||
# Extract drive name and folder path
|
||||
if remaining_parts:
|
||||
drive_name = unquote(remaining_parts[0])
|
||||
folder_path = (
|
||||
"/".join(unquote(part) for part in remaining_parts[1:])
|
||||
if len(remaining_parts) > 1
|
||||
else None
|
||||
)
|
||||
else:
|
||||
drive_name = None
|
||||
folder_path = None
|
||||
|
||||
folder = (
|
||||
"/".join(unquote(part) for part in parts[sites_index + 2 :])
|
||||
if len(parts) > sites_index + 2
|
||||
else None
|
||||
)
|
||||
# Handling for new URL structure
|
||||
if folder and folder.startswith("Shared Documents/"):
|
||||
folder = folder[len("Shared Documents/") :]
|
||||
site_data_list.append(
|
||||
SiteDescriptor(
|
||||
url=site_url,
|
||||
drive_name=drive_name,
|
||||
folder_path=folder_path,
|
||||
)
|
||||
SiteData(url=site_url, folder=folder, sites=[], driveitems=[])
|
||||
)
|
||||
return site_data_list
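As a sanity check on the URL handling above, here is a hedged sketch of how a site URL could be split into the pieces the new `SiteDescriptor` stores; the example URL comes from the docstring in this diff, but the helper below is a simplified re-statement, not the connector's exact code:

```python
from urllib.parse import unquote


def parse_sharepoint_url(url: str) -> tuple[str, str | None, str | None]:
    """Return (site_url, drive_name, folder_path) for a SharePoint site URL."""
    parts = url.strip().split("/")
    sites_index = parts.index("sites")
    site_url = "/".join(parts[: sites_index + 2])
    remaining = parts[sites_index + 2 :]
    drive_name = unquote(remaining[0]) if remaining else None
    folder_path = "/".join(unquote(p) for p in remaining[1:]) if len(remaining) > 1 else None
    return site_url, drive_name, folder_path


print(
    parse_sharepoint_url(
        "https://danswerai.sharepoint.com/sites/sharepoint-tests/Shared%20Documents/test/nested%20with%20spaces"
    )
)
# ('https://danswerai.sharepoint.com/sites/sharepoint-tests', 'Shared Documents', 'test/nested with spaces')
```
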
def _fetch_driveitems(
|
||||
def _populate_sitedata_driveitems(
|
||||
self,
|
||||
site_descriptor: SiteDescriptor,
|
||||
start: datetime | None = None,
|
||||
end: datetime | None = None,
|
||||
) -> list[tuple[DriveItem, str]]:
|
||||
final_driveitems: list[tuple[DriveItem, str]] = []
|
||||
try:
|
||||
site = self.graph_client.sites.get_by_url(site_descriptor.url)
|
||||
) -> None:
|
||||
filter_str = ""
|
||||
if start is not None and end is not None:
|
||||
filter_str = f"last_modified_datetime ge {start.isoformat()} and last_modified_datetime le {end.isoformat()}"
|
||||
|
||||
# Get all drives in the site
|
||||
drives = site.drives.get().execute_query()
|
||||
logger.debug(f"Found drives: {[drive.name for drive in drives]}")
|
||||
for element in self.site_data:
|
||||
sites: list[Site] = []
|
||||
for site in element.sites:
|
||||
site_sublist = site.lists.get().execute_query()
|
||||
sites.extend(site_sublist)
|
||||
|
||||
# Filter drives based on the requested drive name
|
||||
if site_descriptor.drive_name:
|
||||
drives = [
|
||||
drive
|
||||
for drive in drives
|
||||
if drive.name == site_descriptor.drive_name
|
||||
or (
|
||||
drive.name == "Documents"
|
||||
and site_descriptor.drive_name == "Shared Documents"
|
||||
)
|
||||
]
|
||||
if not drives:
|
||||
logger.warning(f"Drive '{site_descriptor.drive_name}' not found")
|
||||
return []
|
||||
|
||||
# Process each matching drive
|
||||
for drive in drives:
|
||||
for site in sites:
|
||||
try:
|
||||
root_folder = drive.root
|
||||
if site_descriptor.folder_path:
|
||||
# If a specific folder is requested, navigate to it
|
||||
for folder_part in site_descriptor.folder_path.split("/"):
|
||||
root_folder = root_folder.get_by_path(folder_part)
|
||||
|
||||
# Get all items recursively
|
||||
query = root_folder.get_files(
|
||||
recursive=True,
|
||||
page_size=1000,
|
||||
)
|
||||
query = site.drive.root.get_files(True, 1000)
|
||||
if filter_str:
|
||||
query = query.filter(filter_str)
|
||||
driveitems = query.execute_query()
|
||||
logger.debug(
|
||||
f"Found {len(driveitems)} items in drive '{drive.name}'"
|
||||
)
|
||||
|
||||
# Use "Shared Documents" as the library name for the default "Documents" drive
|
||||
drive_name = (
|
||||
"Shared Documents" if drive.name == "Documents" else drive.name
|
||||
)
|
||||
|
||||
# Filter items based on folder path if specified
|
||||
if site_descriptor.folder_path:
|
||||
# Filter items to ensure they're in the specified folder or its subfolders
|
||||
# The path will be in format: /drives/{drive_id}/root:/folder/path
|
||||
driveitems = [
|
||||
if element.folder:
|
||||
expected_path = f"/root:/{element.folder}"
|
||||
filtered_driveitems = [
|
||||
item
|
||||
for item in driveitems
|
||||
if any(
|
||||
path_part == site_descriptor.folder_path
|
||||
or path_part.startswith(
|
||||
site_descriptor.folder_path + "/"
|
||||
)
|
||||
for path_part in item.parent_reference.path.split(
|
||||
"root:/"
|
||||
)[1].split("/")
|
||||
)
|
||||
if item.parent_reference.path.endswith(expected_path)
|
||||
]
|
||||
if len(driveitems) == 0:
|
||||
if len(filtered_driveitems) == 0:
|
||||
all_paths = [
|
||||
item.parent_reference.path for item in driveitems
|
||||
]
|
||||
logger.warning(
|
||||
f"Nothing found for folder '{site_descriptor.folder_path}' "
|
||||
f"in; any of valid paths: {all_paths}"
|
||||
f"Nothing found for folder '{expected_path}' in any of valid paths: {all_paths}"
|
||||
)
|
||||
element.driveitems.extend(filtered_driveitems)
|
||||
else:
|
||||
element.driveitems.extend(driveitems)
|
||||
|
||||
# Filter items based on time window if specified
|
||||
if start is not None and end is not None:
|
||||
driveitems = [
|
||||
item
|
||||
for item in driveitems
|
||||
if start
|
||||
<= item.last_modified_datetime.replace(tzinfo=timezone.utc)
|
||||
<= end
|
||||
]
|
||||
logger.debug(
|
||||
f"Found {len(driveitems)} items within time window in drive '{drive.name}'"
|
||||
)
|
||||
except Exception:
|
||||
# Sites include things that do not contain .drive.root so this fails
# but this is fine, as there are no actual documents in those
|
||||
pass
|
||||
|
||||
for item in driveitems:
|
||||
final_driveitems.append((item, drive_name))
|
||||
def _populate_sitedata_sites(self) -> None:
|
||||
if self.graph_client is None:
|
||||
raise ConnectorMissingCredentialError("Sharepoint")
|
||||
|
||||
except Exception as e:
|
||||
# Some drives might not be accessible
|
||||
logger.warning(f"Failed to process drive: {str(e)}")
|
||||
|
||||
except Exception as e:
|
||||
# Sites include things that do not contain drives so this fails
|
||||
# but this is fine, as there are no actual documents in those
|
||||
logger.warning(f"Failed to process site: {str(e)}")
|
||||
|
||||
return final_driveitems
|
||||
|
||||
def _fetch_sites(self) -> list[SiteDescriptor]:
|
||||
sites = self.graph_client.sites.get_all().execute_query()
|
||||
site_descriptors = [
|
||||
SiteDescriptor(
|
||||
url=sites.resource_url,
|
||||
drive_name=None,
|
||||
folder_path=None,
|
||||
)
|
||||
]
|
||||
return site_descriptors
|
||||
if self.site_data:
|
||||
for element in self.site_data:
|
||||
element.sites = [
|
||||
self.graph_client.sites.get_by_url(element.url)
|
||||
.get()
|
||||
.execute_query()
|
||||
]
|
||||
else:
|
||||
sites = self.graph_client.sites.get_all().execute_query()
|
||||
self.site_data = [
|
||||
SiteData(url=None, folder=None, sites=sites, driveitems=[])
|
||||
]
|
||||
|
||||
def _fetch_from_sharepoint(
|
||||
self, start: datetime | None = None, end: datetime | None = None
|
||||
) -> GenerateDocumentsOutput:
|
||||
site_descriptors = self.site_descriptors or self._fetch_sites()
|
||||
if self.graph_client is None:
|
||||
raise ConnectorMissingCredentialError("Sharepoint")
|
||||
|
||||
self._populate_sitedata_sites()
|
||||
self._populate_sitedata_driveitems(start=start, end=end)
|
||||
|
||||
# goes over all urls, converts them into Document objects and then yields them in batches
|
||||
doc_batch: list[Document] = []
|
||||
for site_descriptor in site_descriptors:
|
||||
driveitems = self._fetch_driveitems(site_descriptor, start=start, end=end)
|
||||
for driveitem, drive_name in driveitems:
|
||||
for element in self.site_data:
|
||||
for driveitem in element.driveitems:
|
||||
logger.debug(f"Processing: {driveitem.web_url}")
|
||||
doc_batch.append(_convert_driveitem_to_document(driveitem, drive_name))
|
||||
doc_batch.append(_convert_driveitem_to_document(driveitem))
|
||||
|
||||
if len(doc_batch) >= self.batch_size:
|
||||
yield doc_batch
|
||||
@@ -261,26 +182,22 @@ class SharepointConnector(LoadConnector, PollConnector):
|
||||
sp_client_secret = credentials["sp_client_secret"]
|
||||
sp_directory_id = credentials["sp_directory_id"]
|
||||
|
||||
authority_url = f"https://login.microsoftonline.com/{sp_directory_id}"
|
||||
self.msal_app = msal.ConfidentialClientApplication(
|
||||
authority=authority_url,
|
||||
client_id=sp_client_id,
|
||||
client_credential=sp_client_secret,
|
||||
)
|
||||
|
||||
def _acquire_token_func() -> dict[str, Any]:
|
||||
"""
|
||||
Acquire token via MSAL
|
||||
"""
|
||||
if self.msal_app is None:
|
||||
raise RuntimeError("MSAL app is not initialized")
|
||||
|
||||
token = self.msal_app.acquire_token_for_client(
|
||||
authority_url = f"https://login.microsoftonline.com/{sp_directory_id}"
|
||||
app = msal.ConfidentialClientApplication(
|
||||
authority=authority_url,
|
||||
client_id=sp_client_id,
|
||||
client_credential=sp_client_secret,
|
||||
)
|
||||
token = app.acquire_token_for_client(
|
||||
scopes=["https://graph.microsoft.com/.default"]
|
||||
)
|
||||
return token
|
||||
|
||||
self._graph_client = GraphClient(_acquire_token_func)
|
||||
self.graph_client = GraphClient(_acquire_token_func)
|
||||
return None
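For reference, a standalone sketch of the MSAL-plus-GraphClient wiring this hunk reworks; the authority URL and Graph scope are taken from the diff, while the function wrapper and omitted error handling are assumptions for illustration:

```python
from typing import Any

import msal  # type: ignore
from office365.graph_client import GraphClient  # type: ignore


def build_graph_client(client_id: str, client_secret: str, directory_id: str) -> GraphClient:
    authority_url = f"https://login.microsoftonline.com/{directory_id}"
    app = msal.ConfidentialClientApplication(
        authority=authority_url,
        client_id=client_id,
        client_credential=client_secret,
    )

    def _acquire_token() -> dict[str, Any]:
        # Client-credentials flow against the default Graph scope.
        return app.acquire_token_for_client(
            scopes=["https://graph.microsoft.com/.default"]
        )

    return GraphClient(_acquire_token)
```
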
def load_from_state(self) -> GenerateDocumentsOutput:
|
||||
@@ -289,19 +206,19 @@ class SharepointConnector(LoadConnector, PollConnector):
def poll_source(
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
start_datetime = datetime.fromtimestamp(start, timezone.utc)
end_datetime = datetime.fromtimestamp(end, timezone.utc)
start_datetime = datetime.utcfromtimestamp(start)
end_datetime = datetime.utcfromtimestamp(end)
return self._fetch_from_sharepoint(start=start_datetime, end=end_datetime)
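The timestamp conversion is the behavioral nuance in this hunk: `datetime.utcfromtimestamp` returns a naive datetime (and is deprecated as of Python 3.12), while `datetime.fromtimestamp(ts, timezone.utc)` returns a timezone-aware one. A small illustration:

```python
from datetime import datetime, timezone

ts = 1_700_000_000
aware = datetime.fromtimestamp(ts, timezone.utc)  # tzinfo is UTC
naive = datetime.utcfromtimestamp(ts)             # tzinfo is None, deprecated in 3.12+

print(aware.tzinfo)  # UTC
print(naive.tzinfo)  # None
```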


if __name__ == "__main__":
connector = SharepointConnector(sites=os.environ["SHAREPOINT_SITES"].split(","))
connector = SharepointConnector(sites=os.environ["SITES"].split(","))

connector.load_credentials(
{
"sp_client_id": os.environ["SHAREPOINT_CLIENT_ID"],
"sp_client_secret": os.environ["SHAREPOINT_CLIENT_SECRET"],
"sp_directory_id": os.environ["SHAREPOINT_CLIENT_DIRECTORY_ID"],
"sp_client_id": os.environ["SP_CLIENT_ID"],
"sp_client_secret": os.environ["SP_CLIENT_SECRET"],
"sp_directory_id": os.environ["SP_CLIENT_DIRECTORY_ID"],
}
)
document_batches = connector.load_from_state()

@@ -104,11 +104,8 @@ def make_slack_api_rate_limited(
|
||||
f"Slack call rate limited, retrying after {retry_after} seconds. Exception: {e}"
|
||||
)
|
||||
time.sleep(retry_after)
|
||||
elif error in ["already_reacted", "no_reaction", "internal_error"]:
|
||||
# Log internal_error and return the response instead of failing
|
||||
logger.warning(
|
||||
f"Slack call encountered '{error}', skipping and continuing..."
|
||||
)
|
||||
elif error in ["already_reacted", "no_reaction"]:
|
||||
# The response isn't used for reactions, this is basically just a pass
|
||||
return e.response
|
||||
else:
|
||||
# Raise the error for non-transient errors
|
||||
|
||||
@@ -180,28 +180,23 @@ class TeamsConnector(LoadConnector, PollConnector):
|
||||
self.batch_size = batch_size
|
||||
self.graph_client: GraphClient | None = None
|
||||
self.requested_team_list: list[str] = teams
|
||||
self.msal_app: msal.ConfidentialClientApplication | None = None
|
||||
|
||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
||||
teams_client_id = credentials["teams_client_id"]
|
||||
teams_client_secret = credentials["teams_client_secret"]
|
||||
teams_directory_id = credentials["teams_directory_id"]
|
||||
|
||||
authority_url = f"https://login.microsoftonline.com/{teams_directory_id}"
|
||||
self.msal_app = msal.ConfidentialClientApplication(
|
||||
authority=authority_url,
|
||||
client_id=teams_client_id,
|
||||
client_credential=teams_client_secret,
|
||||
)
|
||||
|
||||
def _acquire_token_func() -> dict[str, Any]:
|
||||
"""
|
||||
Acquire token via MSAL
|
||||
"""
|
||||
if self.msal_app is None:
|
||||
raise RuntimeError("MSAL app is not initialized")
|
||||
|
||||
token = self.msal_app.acquire_token_for_client(
|
||||
authority_url = f"https://login.microsoftonline.com/{teams_directory_id}"
|
||||
app = msal.ConfidentialClientApplication(
|
||||
authority=authority_url,
|
||||
client_id=teams_client_id,
|
||||
client_credential=teams_client_secret,
|
||||
)
|
||||
token = app.acquire_token_for_client(
|
||||
scopes=["https://graph.microsoft.com/.default"]
|
||||
)
|
||||
return token
|
||||
|
||||
@@ -67,7 +67,10 @@ class SearchPipeline:
self.rerank_metrics_callback = rerank_metrics_callback

self.search_settings = get_current_search_settings(db_session)
self.document_index = get_default_document_index(self.search_settings, None)
self.document_index = get_default_document_index(
primary_index_name=self.search_settings.index_name,
secondary_index_name=None,
)
self.prompt_config: PromptConfig | None = prompt_config

# Preprocessing steps generate this

@@ -28,9 +28,6 @@ class SyncType(str, PyEnum):
|
||||
DOCUMENT_SET = "document_set"
|
||||
USER_GROUP = "user_group"
|
||||
CONNECTOR_DELETION = "connector_deletion"
|
||||
PRUNING = "pruning" # not really a sync, but close enough
|
||||
EXTERNAL_PERMISSIONS = "external_permissions"
|
||||
EXTERNAL_GROUP = "external_group"
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.value
|
||||
|
||||
@@ -3,8 +3,6 @@ from sqlalchemy import or_
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.configs.app_configs import AUTH_TYPE
|
||||
from onyx.configs.constants import AuthType
|
||||
from onyx.db.models import CloudEmbeddingProvider as CloudEmbeddingProviderModel
|
||||
from onyx.db.models import DocumentSet
|
||||
from onyx.db.models import LLMProvider as LLMProviderModel
|
||||
@@ -126,29 +124,10 @@ def fetch_existing_tools(db_session: Session, tool_ids: list[int]) -> list[ToolM
|
||||
|
||||
def fetch_existing_llm_providers(
|
||||
db_session: Session,
|
||||
) -> list[LLMProviderModel]:
|
||||
stmt = select(LLMProviderModel)
|
||||
return list(db_session.scalars(stmt).all())
|
||||
|
||||
|
||||
def fetch_existing_llm_providers_for_user(
|
||||
db_session: Session,
|
||||
user: User | None = None,
|
||||
) -> list[LLMProviderModel]:
|
||||
if not user:
|
||||
if AUTH_TYPE != AuthType.DISABLED:
|
||||
# User is anonymous
|
||||
return list(
|
||||
db_session.scalars(
|
||||
select(LLMProviderModel).where(
|
||||
LLMProviderModel.is_public == True # noqa: E712
|
||||
)
|
||||
).all()
|
||||
)
|
||||
else:
|
||||
# If auth is disabled, user has access to all providers
|
||||
return fetch_existing_llm_providers(db_session)
|
||||
|
||||
return list(db_session.scalars(select(LLMProviderModel)).all())
|
||||
stmt = select(LLMProviderModel).distinct()
|
||||
user_groups_select = select(User__UserGroup.user_group_id).where(
|
||||
User__UserGroup.user_id == user.id
|
||||
|
||||
@@ -161,7 +161,9 @@ class User(SQLAlchemyBaseUserTableUUID, Base):
|
||||
hidden_assistants: Mapped[list[int]] = mapped_column(
|
||||
postgresql.JSONB(), nullable=False, default=[]
|
||||
)
|
||||
|
||||
recent_assistants: Mapped[list[dict]] = mapped_column(
|
||||
postgresql.JSONB(), nullable=False, default=list, server_default="[]"
|
||||
)
|
||||
pinned_assistants: Mapped[list[int] | None] = mapped_column(
|
||||
postgresql.JSONB(), nullable=True, default=None
|
||||
)
|
||||
@@ -745,34 +747,6 @@ class SearchSettings(Base):
def api_key(self) -> str | None:
return self.cloud_provider.api_key if self.cloud_provider is not None else None

@property
def large_chunks_enabled(self) -> bool:
"""
Given multipass usage and an embedder, decides whether large chunks are allowed
based on model/provider constraints.
"""
# Only local models that support a larger context are from Nomic
# Cohere does not support larger contexts (they recommend not going above ~512 tokens)
return SearchSettings.can_use_large_chunks(
self.multipass_indexing, self.model_name, self.provider_type
)

@staticmethod
def can_use_large_chunks(
multipass: bool, model_name: str, provider_type: EmbeddingProvider | None
) -> bool:
"""
Given multipass usage and an embedder, decides whether large chunks are allowed
based on model/provider constraints.
"""
# Only local models that support a larger context are from Nomic
# Cohere does not support larger contexts (they recommend not going above ~512 tokens)
return (
multipass
and model_name.startswith("nomic-ai")
and provider_type != EmbeddingProvider.COHERE
)

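A quick illustration of the large-chunk rule above: multipass must be enabled, the model must be a Nomic local model, and the provider must not be Cohere. The helper below restates the predicate with a plain boolean in place of the `EmbeddingProvider` enum check so it runs standalone; the model names are examples, not values from this diff:

```python
def can_use_large_chunks(multipass: bool, model_name: str, provider_is_cohere: bool) -> bool:
    # Same predicate as the staticmethod above, with the enum comparison simplified.
    return multipass and model_name.startswith("nomic-ai") and not provider_is_cohere


assert can_use_large_chunks(True, "nomic-ai/nomic-embed-text-v1", provider_is_cohere=False)
assert not can_use_large_chunks(False, "nomic-ai/nomic-embed-text-v1", provider_is_cohere=False)
assert not can_use_large_chunks(True, "text-embedding-3-small", provider_is_cohere=False)
assert not can_use_large_chunks(True, "nomic-ai/nomic-embed-text-v1", provider_is_cohere=True)
```
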
class IndexAttempt(Base):
|
||||
"""
|
||||
|
||||
@@ -11,7 +11,7 @@ from sqlalchemy import Select
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy import update
|
||||
from sqlalchemy.orm import aliased
|
||||
from sqlalchemy.orm import selectinload
|
||||
from sqlalchemy.orm import joinedload
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.auth.schemas import UserRole
|
||||
@@ -291,9 +291,8 @@ def get_personas_for_user(
|
||||
include_deleted: bool = False,
|
||||
joinedload_all: bool = False,
|
||||
) -> Sequence[Persona]:
|
||||
stmt = select(Persona)
|
||||
stmt = _add_user_filters(stmt, user, get_editable)
|
||||
|
||||
stmt = select(Persona).distinct()
|
||||
stmt = _add_user_filters(stmt=stmt, user=user, get_editable=get_editable)
|
||||
if not include_default:
|
||||
stmt = stmt.where(Persona.builtin_persona.is_(False))
|
||||
if not include_slack_bot_personas:
|
||||
@@ -303,16 +302,14 @@ def get_personas_for_user(
|
||||
|
||||
if joinedload_all:
|
||||
stmt = stmt.options(
|
||||
selectinload(Persona.prompts),
|
||||
selectinload(Persona.tools),
|
||||
selectinload(Persona.document_sets),
|
||||
selectinload(Persona.groups),
|
||||
selectinload(Persona.users),
|
||||
selectinload(Persona.labels),
|
||||
joinedload(Persona.prompts),
|
||||
joinedload(Persona.tools),
|
||||
joinedload(Persona.document_sets),
|
||||
joinedload(Persona.groups),
|
||||
joinedload(Persona.users),
|
||||
)
|
||||
|
||||
results = db_session.execute(stmt).scalars().all()
|
||||
return results
|
||||
return db_session.execute(stmt).unique().scalars().all()
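The `joinedload` versus `selectinload` switch in this hunk is worth a note: joined eager loading of collections duplicates parent rows until `.unique()` is applied to the result (hence the `.unique()` call in one version of this function), while `selectinload` issues a separate `SELECT ... IN` per relationship and returns distinct parents. A hedged, minimal sketch with toy models (not Onyx's):

```python
from sqlalchemy import ForeignKey, create_engine, select
from sqlalchemy.orm import (
    DeclarativeBase, Mapped, Session, mapped_column, relationship, selectinload
)


class Base(DeclarativeBase):
    pass


class Persona(Base):
    __tablename__ = "persona"
    id: Mapped[int] = mapped_column(primary_key=True)
    tools: Mapped[list["Tool"]] = relationship(back_populates="persona")


class Tool(Base):
    __tablename__ = "tool"
    id: Mapped[int] = mapped_column(primary_key=True)
    persona_id: Mapped[int] = mapped_column(ForeignKey("persona.id"))
    persona: Mapped[Persona] = relationship(back_populates="tools")


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
with Session(engine) as session:
    # selectinload issues one extra SELECT for tools and does not duplicate
    # Persona rows, so no .unique() is needed on the result.
    personas = session.scalars(
        select(Persona).options(selectinload(Persona.tools))
    ).all()
```
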
def get_personas(db_session: Session) -> Sequence[Persona]:
|
||||
|
||||
@@ -29,21 +29,9 @@ from onyx.utils.logger import setup_logger
|
||||
from shared_configs.configs import PRESERVED_SEARCH_FIELDS
|
||||
from shared_configs.enums import EmbeddingProvider
|
||||
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
class ActiveSearchSettings:
|
||||
primary: SearchSettings
|
||||
secondary: SearchSettings | None
|
||||
|
||||
def __init__(
|
||||
self, primary: SearchSettings, secondary: SearchSettings | None
|
||||
) -> None:
|
||||
self.primary = primary
|
||||
self.secondary = secondary
|
||||
|
||||
|
||||
def create_search_settings(
|
||||
search_settings: SavedSearchSettings,
|
||||
db_session: Session,
|
||||
@@ -155,27 +143,21 @@ def get_secondary_search_settings(db_session: Session) -> SearchSettings | None:
|
||||
return latest_settings
|
||||
|
||||
|
||||
def get_active_search_settings(db_session: Session) -> ActiveSearchSettings:
|
||||
"""Returns active search settings. Secondary search settings may be None."""
|
||||
|
||||
# Get the primary and secondary search settings
|
||||
primary_search_settings = get_current_search_settings(db_session)
|
||||
secondary_search_settings = get_secondary_search_settings(db_session)
|
||||
return ActiveSearchSettings(
|
||||
primary=primary_search_settings, secondary=secondary_search_settings
|
||||
)
|
||||
|
||||
|
||||
def get_active_search_settings_list(db_session: Session) -> list[SearchSettings]:
|
||||
"""Returns active search settings as a list. Primary settings are the first element,
|
||||
and if secondary search settings exist, they will be the second element."""
|
||||
|
||||
def get_active_search_settings(db_session: Session) -> list[SearchSettings]:
|
||||
"""Returns active search settings. The first entry will always be the current search
|
||||
settings. If there are new search settings that are being migrated to, those will be
|
||||
the second entry."""
|
||||
search_settings_list: list[SearchSettings] = []
|
||||
|
||||
active_search_settings = get_active_search_settings(db_session)
|
||||
search_settings_list.append(active_search_settings.primary)
|
||||
if active_search_settings.secondary:
|
||||
search_settings_list.append(active_search_settings.secondary)
|
||||
# Get the primary search settings
|
||||
primary_search_settings = get_current_search_settings(db_session)
|
||||
search_settings_list.append(primary_search_settings)
|
||||
|
||||
# Check for secondary search settings
|
||||
secondary_search_settings = get_secondary_search_settings(db_session)
|
||||
if secondary_search_settings is not None:
|
||||
# If secondary settings exist, add them to the list
|
||||
search_settings_list.append(secondary_search_settings)
|
||||
|
||||
return search_settings_list
|
||||
|
||||
|
||||
@@ -4,63 +4,24 @@ from uuid import UUID
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.configs.app_configs import ENABLE_MULTIPASS_INDEXING
|
||||
from onyx.db.models import SearchSettings
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.db.search_settings import get_secondary_search_settings
|
||||
from onyx.document_index.interfaces import EnrichedDocumentIndexingInfo
|
||||
from onyx.indexing.models import DocMetadataAwareIndexChunk
|
||||
from onyx.indexing.models import MultipassConfig
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
|
||||
DEFAULT_BATCH_SIZE = 30
|
||||
DEFAULT_INDEX_NAME = "danswer_chunk"
|
||||
|
||||
|
||||
def should_use_multipass(search_settings: SearchSettings | None) -> bool:
|
||||
"""
|
||||
Determines whether multipass should be used based on the search settings
|
||||
or the default config if settings are unavailable.
|
||||
"""
|
||||
if search_settings is not None:
|
||||
return search_settings.multipass_indexing
|
||||
return ENABLE_MULTIPASS_INDEXING
|
||||
|
||||
|
||||
def get_multipass_config(search_settings: SearchSettings) -> MultipassConfig:
|
||||
"""
|
||||
Determines whether to enable multipass and large chunks by examining
|
||||
the current search settings and the embedder configuration.
|
||||
"""
|
||||
if not search_settings:
|
||||
return MultipassConfig(multipass_indexing=False, enable_large_chunks=False)
|
||||
|
||||
multipass = should_use_multipass(search_settings)
|
||||
enable_large_chunks = SearchSettings.can_use_large_chunks(
|
||||
multipass, search_settings.model_name, search_settings.provider_type
|
||||
)
|
||||
return MultipassConfig(
|
||||
multipass_indexing=multipass, enable_large_chunks=enable_large_chunks
|
||||
)
|
||||
|
||||
|
||||
def get_both_index_properties(
|
||||
db_session: Session,
|
||||
) -> tuple[str, str | None, bool, bool | None]:
|
||||
def get_both_index_names(db_session: Session) -> tuple[str, str | None]:
|
||||
search_settings = get_current_search_settings(db_session)
|
||||
config_1 = get_multipass_config(search_settings)
|
||||
|
||||
search_settings_new = get_secondary_search_settings(db_session)
|
||||
if not search_settings_new:
|
||||
return search_settings.index_name, None, config_1.enable_large_chunks, None
|
||||
return search_settings.index_name, None
|
||||
|
||||
config_2 = get_multipass_config(search_settings)
|
||||
return (
|
||||
search_settings.index_name,
|
||||
search_settings_new.index_name,
|
||||
config_1.enable_large_chunks,
|
||||
config_2.enable_large_chunks,
|
||||
)
|
||||
return search_settings.index_name, search_settings_new.index_name
|
||||
|
||||
|
||||
def translate_boost_count_to_multiplier(boost: int) -> float:
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
import httpx
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.db.models import SearchSettings
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.document_index.interfaces import DocumentIndex
|
||||
from onyx.document_index.vespa.index import VespaIndex
|
||||
@@ -9,28 +7,17 @@ from shared_configs.configs import MULTI_TENANT
|
||||
|
||||
|
||||
def get_default_document_index(
|
||||
search_settings: SearchSettings,
|
||||
secondary_search_settings: SearchSettings | None,
|
||||
httpx_client: httpx.Client | None = None,
|
||||
primary_index_name: str,
|
||||
secondary_index_name: str | None,
|
||||
) -> DocumentIndex:
|
||||
"""Primary index is the index that is used for querying/updating etc.
|
||||
Secondary index is for when both the currently used index and the upcoming
|
||||
index both need to be updated, updates are applied to both indices"""
|
||||
|
||||
secondary_index_name: str | None = None
|
||||
secondary_large_chunks_enabled: bool | None = None
|
||||
if secondary_search_settings:
|
||||
secondary_index_name = secondary_search_settings.index_name
|
||||
secondary_large_chunks_enabled = secondary_search_settings.large_chunks_enabled
|
||||
|
||||
# Currently only supporting Vespa
|
||||
return VespaIndex(
|
||||
index_name=search_settings.index_name,
|
||||
index_name=primary_index_name,
|
||||
secondary_index_name=secondary_index_name,
|
||||
large_chunks_enabled=search_settings.large_chunks_enabled,
|
||||
secondary_large_chunks_enabled=secondary_large_chunks_enabled,
|
||||
multitenant=MULTI_TENANT,
|
||||
httpx_client=httpx_client,
|
||||
)
|
||||
|
||||
|
||||
@@ -40,6 +27,6 @@ def get_current_primary_default_document_index(db_session: Session) -> DocumentI
|
||||
"""
|
||||
search_settings = get_current_search_settings(db_session)
|
||||
return get_default_document_index(
|
||||
search_settings,
|
||||
None,
|
||||
primary_index_name=search_settings.index_name,
|
||||
secondary_index_name=None,
|
||||
)
|
||||
|
||||
@@ -231,22 +231,21 @@ def _get_chunks_via_visit_api(
|
||||
return document_chunks
|
||||
|
||||
|
||||
# TODO(rkuo): candidate for removal if not being used
|
||||
# @retry(tries=10, delay=1, backoff=2)
|
||||
# def get_all_vespa_ids_for_document_id(
|
||||
# document_id: str,
|
||||
# index_name: str,
|
||||
# filters: IndexFilters | None = None,
|
||||
# get_large_chunks: bool = False,
|
||||
# ) -> list[str]:
|
||||
# document_chunks = _get_chunks_via_visit_api(
|
||||
# chunk_request=VespaChunkRequest(document_id=document_id),
|
||||
# index_name=index_name,
|
||||
# filters=filters or IndexFilters(access_control_list=None),
|
||||
# field_names=[DOCUMENT_ID],
|
||||
# get_large_chunks=get_large_chunks,
|
||||
# )
|
||||
# return [chunk["id"].split("::", 1)[-1] for chunk in document_chunks]
|
||||
@retry(tries=10, delay=1, backoff=2)
|
||||
def get_all_vespa_ids_for_document_id(
|
||||
document_id: str,
|
||||
index_name: str,
|
||||
filters: IndexFilters | None = None,
|
||||
get_large_chunks: bool = False,
|
||||
) -> list[str]:
|
||||
document_chunks = _get_chunks_via_visit_api(
|
||||
chunk_request=VespaChunkRequest(document_id=document_id),
|
||||
index_name=index_name,
|
||||
filters=filters or IndexFilters(access_control_list=None),
|
||||
field_names=[DOCUMENT_ID],
|
||||
get_large_chunks=get_large_chunks,
|
||||
)
|
||||
return [chunk["id"].split("::", 1)[-1] for chunk in document_chunks]
|
||||
|
||||
|
||||
def parallel_visit_api_retrieval(
|
||||
|
||||
@@ -25,6 +25,7 @@ from onyx.configs.chat_configs import VESPA_SEARCHER_THREADS
|
||||
from onyx.configs.constants import KV_REINDEX_KEY
|
||||
from onyx.context.search.models import IndexFilters
|
||||
from onyx.context.search.models import InferenceChunkUncleaned
|
||||
from onyx.db.engine import get_session_with_tenant
|
||||
from onyx.document_index.document_index_utils import get_document_chunk_ids
|
||||
from onyx.document_index.interfaces import DocumentIndex
|
||||
from onyx.document_index.interfaces import DocumentInsertionRecord
|
||||
@@ -40,12 +41,12 @@ from onyx.document_index.vespa.chunk_retrieval import (
|
||||
)
|
||||
from onyx.document_index.vespa.chunk_retrieval import query_vespa
|
||||
from onyx.document_index.vespa.deletion import delete_vespa_chunks
|
||||
from onyx.document_index.vespa.indexing_utils import BaseHTTPXClientContext
|
||||
from onyx.document_index.vespa.indexing_utils import batch_index_vespa_chunks
|
||||
from onyx.document_index.vespa.indexing_utils import check_for_final_chunk_existence
|
||||
from onyx.document_index.vespa.indexing_utils import clean_chunk_id_copy
|
||||
from onyx.document_index.vespa.indexing_utils import GlobalHTTPXClientContext
|
||||
from onyx.document_index.vespa.indexing_utils import TemporaryHTTPXClientContext
|
||||
from onyx.document_index.vespa.indexing_utils import (
|
||||
get_multipass_config,
|
||||
)
|
||||
from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
|
||||
from onyx.document_index.vespa.shared_utils.utils import (
|
||||
replace_invalid_doc_id_characters,
|
||||
@@ -131,34 +132,12 @@ class VespaIndex(DocumentIndex):
|
||||
self,
|
||||
index_name: str,
|
||||
secondary_index_name: str | None,
|
||||
large_chunks_enabled: bool,
|
||||
secondary_large_chunks_enabled: bool | None,
|
||||
multitenant: bool = False,
|
||||
httpx_client: httpx.Client | None = None,
|
||||
) -> None:
|
||||
self.index_name = index_name
|
||||
self.secondary_index_name = secondary_index_name
|
||||
|
||||
self.large_chunks_enabled = large_chunks_enabled
|
||||
self.secondary_large_chunks_enabled = secondary_large_chunks_enabled
|
||||
|
||||
self.multitenant = multitenant
|
||||
|
||||
self.httpx_client_context: BaseHTTPXClientContext
|
||||
|
||||
if httpx_client:
|
||||
self.httpx_client_context = GlobalHTTPXClientContext(httpx_client)
|
||||
else:
|
||||
self.httpx_client_context = TemporaryHTTPXClientContext(
|
||||
get_vespa_http_client
|
||||
)
|
||||
|
||||
self.index_to_large_chunks_enabled: dict[str, bool] = {}
|
||||
self.index_to_large_chunks_enabled[index_name] = large_chunks_enabled
|
||||
if secondary_index_name and secondary_large_chunks_enabled:
|
||||
self.index_to_large_chunks_enabled[
|
||||
secondary_index_name
|
||||
] = secondary_large_chunks_enabled
|
||||
self.http_client = get_vespa_http_client()
|
||||
|
||||
def ensure_indices_exist(
|
||||
self,
|
||||
@@ -352,7 +331,7 @@ class VespaIndex(DocumentIndex):
|
||||
# indexing / updates / deletes since we have to make a large volume of requests.
|
||||
with (
|
||||
concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor,
|
||||
self.httpx_client_context as http_client,
|
||||
get_vespa_http_client() as http_client,
|
||||
):
|
||||
# We require the start and end index for each document in order to
|
||||
# know precisely which chunks to delete. This information exists for
|
||||
@@ -411,11 +390,9 @@ class VespaIndex(DocumentIndex):
|
||||
for doc_id in all_doc_ids
|
||||
}
|
||||
|
||||
@classmethod
|
||||
@staticmethod
|
||||
def _apply_updates_batched(
|
||||
cls,
|
||||
updates: list[_VespaUpdateRequest],
|
||||
httpx_client: httpx.Client,
|
||||
batch_size: int = BATCH_SIZE,
|
||||
) -> None:
|
||||
"""Runs a batch of updates in parallel via the ThreadPoolExecutor."""
|
||||
@@ -437,7 +414,7 @@ class VespaIndex(DocumentIndex):
|
||||
|
||||
with (
|
||||
concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor,
|
||||
httpx_client as http_client,
|
||||
get_vespa_http_client() as http_client,
|
||||
):
|
||||
for update_batch in batch_generator(updates, batch_size):
|
||||
future_to_document_id = {
|
||||
@@ -478,7 +455,7 @@ class VespaIndex(DocumentIndex):
|
||||
index_names.append(self.secondary_index_name)
|
||||
|
||||
chunk_id_start_time = time.monotonic()
|
||||
with self.httpx_client_context as http_client:
|
||||
with get_vespa_http_client() as http_client:
|
||||
for update_request in update_requests:
|
||||
for doc_info in update_request.minimal_document_indexing_info:
|
||||
for index_name in index_names:
|
||||
@@ -534,8 +511,7 @@ class VespaIndex(DocumentIndex):
|
||||
)
|
||||
)
|
||||
|
||||
with self.httpx_client_context as httpx_client:
|
||||
self._apply_updates_batched(processed_updates_requests, httpx_client)
|
||||
self._apply_updates_batched(processed_updates_requests)
|
||||
logger.debug(
|
||||
"Finished updating Vespa documents in %.2f seconds",
|
||||
time.monotonic() - update_start,
|
||||
@@ -547,7 +523,6 @@ class VespaIndex(DocumentIndex):
|
||||
index_name: str,
|
||||
fields: VespaDocumentFields,
|
||||
doc_id: str,
|
||||
http_client: httpx.Client,
|
||||
) -> None:
|
||||
"""
|
||||
Update a single "chunk" (document) in Vespa using its chunk ID.
|
||||
@@ -579,17 +554,18 @@ class VespaIndex(DocumentIndex):
|
||||
|
||||
vespa_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}?create=true"
|
||||
|
||||
try:
|
||||
resp = http_client.put(
|
||||
vespa_url,
|
||||
headers={"Content-Type": "application/json"},
|
||||
json=update_dict,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
except httpx.HTTPStatusError as e:
|
||||
error_message = f"Failed to update doc chunk {doc_chunk_id} (doc_id={doc_id}). Details: {e.response.text}"
|
||||
logger.error(error_message)
|
||||
raise
|
||||
with get_vespa_http_client(http2=False) as http_client:
|
||||
try:
|
||||
resp = http_client.put(
|
||||
vespa_url,
|
||||
headers={"Content-Type": "application/json"},
|
||||
json=update_dict,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
except httpx.HTTPStatusError as e:
|
||||
error_message = f"Failed to update doc chunk {doc_chunk_id} (doc_id={doc_id}). Details: {e.response.text}"
|
||||
logger.error(error_message)
|
||||
raise
|
||||
|
||||
def update_single(
|
||||
self,
|
||||
@@ -603,16 +579,24 @@ class VespaIndex(DocumentIndex):
|
||||
function will complete with no errors or exceptions.
|
||||
Handle other exceptions if you wish to implement retry behavior
|
||||
"""
|
||||
|
||||
doc_chunk_count = 0
|
||||
|
||||
with self.httpx_client_context as httpx_client:
|
||||
for (
|
||||
index_name,
|
||||
large_chunks_enabled,
|
||||
) in self.index_to_large_chunks_enabled.items():
|
||||
index_names = [self.index_name]
|
||||
if self.secondary_index_name:
|
||||
index_names.append(self.secondary_index_name)
|
||||
|
||||
with get_vespa_http_client(http2=False) as http_client:
|
||||
for index_name in index_names:
|
||||
with get_session_with_tenant(tenant_id=tenant_id) as db_session:
|
||||
multipass_config = get_multipass_config(
|
||||
db_session=db_session,
|
||||
primary_index=index_name == self.index_name,
|
||||
)
|
||||
large_chunks_enabled = multipass_config.enable_large_chunks
|
||||
enriched_doc_infos = VespaIndex.enrich_basic_chunk_info(
|
||||
index_name=index_name,
|
||||
http_client=httpx_client,
|
||||
http_client=http_client,
|
||||
document_id=doc_id,
|
||||
previous_chunk_count=chunk_count,
|
||||
new_chunk_count=0,
|
||||
@@ -628,7 +612,10 @@ class VespaIndex(DocumentIndex):
|
||||
|
||||
for doc_chunk_id in doc_chunk_ids:
|
||||
self.update_single_chunk(
|
||||
doc_chunk_id, index_name, fields, doc_id, httpx_client
|
||||
doc_chunk_id=doc_chunk_id,
|
||||
index_name=index_name,
|
||||
fields=fields,
|
||||
doc_id=doc_id,
|
||||
)
|
||||
|
||||
return doc_chunk_count
|
||||
@@ -650,13 +637,19 @@ class VespaIndex(DocumentIndex):
|
||||
if self.secondary_index_name:
|
||||
index_names.append(self.secondary_index_name)
|
||||
|
||||
with self.httpx_client_context as http_client, concurrent.futures.ThreadPoolExecutor(
|
||||
with get_vespa_http_client(
|
||||
http2=False
|
||||
) as http_client, concurrent.futures.ThreadPoolExecutor(
|
||||
max_workers=NUM_THREADS
|
||||
) as executor:
|
||||
for (
|
||||
index_name,
|
||||
large_chunks_enabled,
|
||||
) in self.index_to_large_chunks_enabled.items():
|
||||
for index_name in index_names:
|
||||
with get_session_with_tenant(tenant_id=tenant_id) as db_session:
|
||||
multipass_config = get_multipass_config(
|
||||
db_session=db_session,
|
||||
primary_index=index_name == self.index_name,
|
||||
)
|
||||
large_chunks_enabled = multipass_config.enable_large_chunks
|
||||
|
||||
enriched_doc_infos = VespaIndex.enrich_basic_chunk_info(
|
||||
index_name=index_name,
|
||||
http_client=http_client,
|
||||
@@ -825,9 +818,6 @@ class VespaIndex(DocumentIndex):
|
||||
"""
|
||||
Deletes all entries in the specified index with the given tenant_id.
|
||||
|
||||
Currently unused, but we anticipate this being useful. The entire flow does not
|
||||
use the httpx connection pool of an instance.
|
||||
|
||||
Parameters:
|
||||
tenant_id (str): The tenant ID whose documents are to be deleted.
|
||||
index_name (str): The name of the index from which to delete documents.
|
||||
@@ -860,8 +850,6 @@ class VespaIndex(DocumentIndex):
|
||||
"""
|
||||
Retrieves all document IDs with the specified tenant_id, handling pagination.
|
||||
|
||||
Internal helper function for delete_entries_by_tenant_id.
|
||||
|
||||
Parameters:
|
||||
tenant_id (str): The tenant ID to search for.
|
||||
index_name (str): The name of the index to search in.
|
||||
@@ -894,8 +882,8 @@ class VespaIndex(DocumentIndex):
|
||||
f"Querying for document IDs with tenant_id: {tenant_id}, offset: {offset}"
|
||||
)
|
||||
|
||||
with get_vespa_http_client() as http_client:
|
||||
response = http_client.get(url, params=query_params, timeout=None)
|
||||
with get_vespa_http_client(no_timeout=True) as http_client:
|
||||
response = http_client.get(url, params=query_params)
|
||||
response.raise_for_status()
|
||||
|
||||
search_result = response.json()
|
||||
@@ -925,11 +913,6 @@ class VespaIndex(DocumentIndex):
|
||||
"""
|
||||
Deletes documents in batches using multiple threads.
|
||||
|
||||
Internal helper function for delete_entries_by_tenant_id.
|
||||
|
||||
This is a class method and does not use the httpx pool of the instance.
|
||||
This is OK because we don't use this method often.
|
||||
|
||||
Parameters:
|
||||
delete_requests (List[_VespaDeleteRequest]): The list of delete requests.
|
||||
batch_size (int): The number of documents to delete in each batch.
|
||||
@@ -942,14 +925,13 @@ class VespaIndex(DocumentIndex):
|
||||
response = http_client.delete(
|
||||
delete_request.url,
|
||||
headers={"Content-Type": "application/json"},
|
||||
timeout=None,
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
logger.debug(f"Starting batch deletion for {len(delete_requests)} documents")
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
|
||||
with get_vespa_http_client() as http_client:
|
||||
with get_vespa_http_client(no_timeout=True) as http_client:
|
||||
for batch_start in range(0, len(delete_requests), batch_size):
|
||||
batch = delete_requests[batch_start : batch_start + batch_size]
|
||||
|
||||
|
||||
@@ -1,19 +1,21 @@
|
||||
import concurrent.futures
|
||||
import json
|
||||
import uuid
|
||||
from abc import ABC
|
||||
from abc import abstractmethod
|
||||
from collections.abc import Callable
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from http import HTTPStatus
|
||||
|
||||
import httpx
|
||||
from retry import retry
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.configs.app_configs import ENABLE_MULTIPASS_INDEXING
|
||||
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
|
||||
get_experts_stores_representations,
|
||||
)
|
||||
from onyx.db.models import SearchSettings
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.db.search_settings import get_secondary_search_settings
|
||||
from onyx.document_index.document_index_utils import get_uuid_from_chunk
|
||||
from onyx.document_index.document_index_utils import get_uuid_from_chunk_info_old
|
||||
from onyx.document_index.interfaces import MinimalDocumentIndexingInfo
|
||||
@@ -48,9 +50,10 @@ from onyx.document_index.vespa_constants import TENANT_ID
|
||||
from onyx.document_index.vespa_constants import TITLE
|
||||
from onyx.document_index.vespa_constants import TITLE_EMBEDDING
|
||||
from onyx.indexing.models import DocMetadataAwareIndexChunk
|
||||
from onyx.indexing.models import EmbeddingProvider
|
||||
from onyx.indexing.models import MultipassConfig
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
@@ -272,42 +275,46 @@ def check_for_final_chunk_existence(
|
||||
index += 1
|
||||
|
||||
|
||||
class BaseHTTPXClientContext(ABC):
|
||||
"""Abstract base class for an HTTPX client context manager."""
|
||||
|
||||
@abstractmethod
|
||||
def __enter__(self) -> httpx.Client:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def __exit__(self, exc_type, exc_value, traceback): # type: ignore
|
||||
pass
|
||||
def should_use_multipass(search_settings: SearchSettings | None) -> bool:
|
||||
"""
|
||||
Determines whether multipass should be used based on the search settings
|
||||
or the default config if settings are unavailable.
|
||||
"""
|
||||
if search_settings is not None:
|
||||
return search_settings.multipass_indexing
|
||||
return ENABLE_MULTIPASS_INDEXING
|
||||
|
||||
|
||||
class GlobalHTTPXClientContext(BaseHTTPXClientContext):
|
||||
"""Context manager for a global HTTPX client that does not close it."""
|
||||
|
||||
def __init__(self, client: httpx.Client):
|
||||
self._client = client
|
||||
|
||||
def __enter__(self) -> httpx.Client:
|
||||
return self._client # Reuse the global client
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback): # type: ignore
|
||||
pass # Do nothing; don't close the global client
|
||||
def can_use_large_chunks(multipass: bool, search_settings: SearchSettings) -> bool:
|
||||
"""
|
||||
Given multipass usage and an embedder, decides whether large chunks are allowed
|
||||
based on model/provider constraints.
|
||||
"""
|
||||
# Only local models that support a larger context are from Nomic
|
||||
# Cohere does not support larger contexts (they recommend not going above ~512 tokens)
|
||||
return (
|
||||
multipass
|
||||
and search_settings.model_name.startswith("nomic-ai")
|
||||
and search_settings.provider_type != EmbeddingProvider.COHERE
|
||||
)
|
||||
|
||||
|
||||
class TemporaryHTTPXClientContext(BaseHTTPXClientContext):
|
||||
"""Context manager for a temporary HTTPX client that closes it after use."""
|
||||
|
||||
def __init__(self, client_factory: Callable[[], httpx.Client]):
|
||||
self._client_factory = client_factory
|
||||
self._client: httpx.Client | None = None # Client will be created in __enter__
|
||||
|
||||
def __enter__(self) -> httpx.Client:
|
||||
self._client = self._client_factory() # Create a new client
|
||||
return self._client
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback): # type: ignore
|
||||
if self._client:
|
||||
self._client.close()
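A compact usage sketch of the two context-manager flavours defined above: the global variant reuses one `httpx.Client` without closing it, the temporary variant builds a fresh client per `with` block and closes it on exit. This assumes the classes above are in scope; the URL is a placeholder:

```python
import httpx

# Reuse a process-wide client without closing it when the block exits:
global_client = httpx.Client(http2=False)
with GlobalHTTPXClientContext(global_client) as client:
    client.get("http://localhost:8080/health")  # placeholder URL

# Build a short-lived client that is closed when the block exits:
with TemporaryHTTPXClientContext(lambda: httpx.Client(http2=False)) as client:
    client.get("http://localhost:8080/health")  # placeholder URL
```
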
def get_multipass_config(
|
||||
db_session: Session, primary_index: bool = True
|
||||
) -> MultipassConfig:
|
||||
"""
|
||||
Determines whether to enable multipass and large chunks by examining
|
||||
the current search settings and the embedder configuration.
|
||||
"""
|
||||
search_settings = (
|
||||
get_current_search_settings(db_session)
|
||||
if primary_index
|
||||
else get_secondary_search_settings(db_session)
|
||||
)
|
||||
multipass = should_use_multipass(search_settings)
|
||||
if not search_settings:
|
||||
return MultipassConfig(multipass_indexing=False, enable_large_chunks=False)
|
||||
enable_large_chunks = can_use_large_chunks(multipass, search_settings)
|
||||
return MultipassConfig(
|
||||
multipass_indexing=multipass, enable_large_chunks=enable_large_chunks
|
||||
)
|
||||
|
||||
@@ -55,7 +55,7 @@ def remove_invalid_unicode_chars(text: str) -> str:
|
||||
"""Vespa does not take in unicode chars that aren't valid for XML.
|
||||
This removes them."""
|
||||
_illegal_xml_chars_RE: re.Pattern = re.compile(
|
||||
"[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF]"
|
||||
"[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]"
|
||||
)
|
||||
return _illegal_xml_chars_RE.sub("", text)
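A tiny check of the stripping behaviour; which of the two character classes in this hunk is current depends on the diff direction, so this only demonstrates the mechanics with the narrower pattern:

```python
import re

_illegal_xml_chars_RE = re.compile(
    "[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]"
)

print(_illegal_xml_chars_RE.sub("", "ok\x00text\x0b"))  # -> "oktext"
```
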
@@ -358,13 +358,7 @@ def extract_file_text(
|
||||
|
||||
try:
|
||||
if get_unstructured_api_key():
|
||||
try:
|
||||
return unstructured_to_text(file, file_name)
|
||||
except Exception as unstructured_error:
|
||||
logger.error(
|
||||
f"Failed to process with Unstructured: {str(unstructured_error)}. Falling back to normal processing."
|
||||
)
|
||||
# Fall through to normal processing
|
||||
return unstructured_to_text(file, file_name)
|
||||
|
||||
if file_name or extension:
|
||||
if extension is not None:
|
||||
|
||||
@@ -52,7 +52,7 @@ def _sdk_partition_request(
|
||||
|
||||
def unstructured_to_text(file: IO[Any], file_name: str) -> str:
|
||||
logger.debug(f"Starting to read file: {file_name}")
|
||||
req = _sdk_partition_request(file, file_name, strategy="fast")
|
||||
req = _sdk_partition_request(file, file_name, strategy="auto")
|
||||
|
||||
unstructured_client = UnstructuredClient(api_key_auth=get_unstructured_api_key())
|
||||
|
||||
|
||||
@@ -1,57 +0,0 @@
|
||||
import threading
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
class HttpxPool:
|
||||
"""Class to manage a global httpx Client instance"""
|
||||
|
||||
_clients: dict[str, httpx.Client] = {}
|
||||
_lock: threading.Lock = threading.Lock()
|
||||
|
||||
# Default parameters for creation
|
||||
DEFAULT_KWARGS = {
|
||||
"http2": True,
|
||||
"limits": lambda: httpx.Limits(),
|
||||
}
|
||||
|
||||
def __init__(self) -> None:
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def _init_client(cls, **kwargs: Any) -> httpx.Client:
|
||||
"""Private helper method to create and return an httpx.Client."""
|
||||
merged_kwargs = {**cls.DEFAULT_KWARGS, **kwargs}
|
||||
return httpx.Client(**merged_kwargs)
|
||||
|
||||
@classmethod
|
||||
def init_client(cls, name: str, **kwargs: Any) -> None:
|
||||
"""Allow the caller to init the client with extra params."""
|
||||
with cls._lock:
|
||||
if name not in cls._clients:
|
||||
cls._clients[name] = cls._init_client(**kwargs)
|
||||
|
||||
@classmethod
|
||||
def close_client(cls, name: str) -> None:
|
||||
"""Allow the caller to close the client."""
|
||||
with cls._lock:
|
||||
client = cls._clients.pop(name, None)
|
||||
if client:
|
||||
client.close()
|
||||
|
||||
@classmethod
|
||||
def close_all(cls) -> None:
|
||||
"""Close all registered clients."""
|
||||
with cls._lock:
|
||||
for client in cls._clients.values():
|
||||
client.close()
|
||||
cls._clients.clear()
|
||||
|
||||
@classmethod
|
||||
def get(cls, name: str) -> httpx.Client:
|
||||
"""Gets the httpx.Client. Will init to default settings if not init'd."""
|
||||
with cls._lock:
|
||||
if name not in cls._clients:
|
||||
cls._clients[name] = cls._init_client()
|
||||
return cls._clients[name]
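A short usage sketch of the pooled-client API defined above; the pool name and URL are placeholders, and it assumes the pool's default kwargs construct a valid `httpx.Client`:

```python
import httpx

# Register a named client once, with any overrides needed:
HttpxPool.init_client("vespa", http2=True)

# Anywhere else in the process, fetch the same client by name:
client: httpx.Client = HttpxPool.get("vespa")
response = client.get("http://localhost:8081/health")  # placeholder URL

# On shutdown, close everything that was registered:
HttpxPool.close_all()
```
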
@@ -31,15 +31,14 @@ from onyx.db.document import upsert_documents
|
||||
from onyx.db.document_set import fetch_document_sets_for_documents
|
||||
from onyx.db.index_attempt import create_index_attempt_error
|
||||
from onyx.db.models import Document as DBDocument
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.db.tag import create_or_add_document_tag
|
||||
from onyx.db.tag import create_or_add_document_tag_list
|
||||
from onyx.document_index.document_index_utils import (
|
||||
get_multipass_config,
|
||||
)
|
||||
from onyx.document_index.interfaces import DocumentIndex
|
||||
from onyx.document_index.interfaces import DocumentMetadata
|
||||
from onyx.document_index.interfaces import IndexBatchParams
|
||||
from onyx.document_index.vespa.indexing_utils import (
|
||||
get_multipass_config,
|
||||
)
|
||||
from onyx.indexing.chunker import Chunker
|
||||
from onyx.indexing.embedder import IndexingEmbedder
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
@@ -358,6 +357,7 @@ def index_doc_batch(
|
||||
is_public=False,
|
||||
)
|
||||
|
||||
logger.debug("Filtering Documents")
|
||||
filtered_documents = filter_fnc(document_batch)
|
||||
|
||||
ctx = index_doc_batch_prepare(
|
||||
@@ -380,15 +380,6 @@ def index_doc_batch(
|
||||
new_docs=0, total_docs=len(filtered_documents), total_chunks=0
|
||||
)
|
||||
|
||||
doc_descriptors = [
|
||||
{
|
||||
"doc_id": doc.id,
|
||||
"doc_length": doc.get_total_char_length(),
|
||||
}
|
||||
for doc in ctx.updatable_docs
|
||||
]
|
||||
logger.debug(f"Starting indexing process for documents: {doc_descriptors}")
|
||||
|
||||
logger.debug("Starting chunking")
|
||||
chunks: list[DocAwareChunk] = chunker.chunk(ctx.updatable_docs)
|
||||
|
||||
@@ -536,8 +527,7 @@ def build_indexing_pipeline(
|
||||
callback: IndexingHeartbeatInterface | None = None,
|
||||
) -> IndexingPipelineProtocol:
|
||||
"""Builds a pipeline which takes in a list (batch) of docs and indexes them."""
|
||||
search_settings = get_current_search_settings(db_session)
|
||||
multipass_config = get_multipass_config(search_settings)
|
||||
multipass_config = get_multipass_config(db_session, primary_index=True)
|
||||
|
||||
chunker = chunker or Chunker(
|
||||
tokenizer=embedder.embedding_model.tokenizer,
|
||||
|
||||
@@ -55,7 +55,9 @@ class DocAwareChunk(BaseChunk):
|
||||
|
||||
def to_short_descriptor(self) -> str:
|
||||
"""Used when logging the identity of a chunk"""
|
||||
return f"{self.source_document.to_short_descriptor()} Chunk ID: {self.chunk_id}"
|
||||
return (
|
||||
f"Chunk ID: '{self.chunk_id}'; {self.source_document.to_short_descriptor()}"
|
||||
)
|
||||
|
||||
|
||||
class IndexChunk(DocAwareChunk):
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
import threading
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
from concurrent.futures import as_completed
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from functools import wraps
|
||||
from typing import Any
|
||||
|
||||
@@ -13,7 +11,6 @@ from requests import RequestException
|
||||
from requests import Response
|
||||
from retry import retry
|
||||
|
||||
from onyx.configs.app_configs import INDEXING_EMBEDDING_MODEL_NUM_THREADS
|
||||
from onyx.configs.app_configs import LARGE_CHUNK_RATIO
|
||||
from onyx.configs.app_configs import SKIP_WARM_UP
|
||||
from onyx.configs.model_configs import BATCH_SIZE_ENCODE_CHUNKS
|
||||
@@ -158,7 +155,6 @@ class EmbeddingModel:
|
||||
text_type: EmbedTextType,
|
||||
batch_size: int,
|
||||
max_seq_length: int,
|
||||
num_threads: int = INDEXING_EMBEDDING_MODEL_NUM_THREADS,
|
||||
) -> list[Embedding]:
|
||||
text_batches = batch_list(texts, batch_size)
|
||||
|
||||
@@ -167,15 +163,12 @@ class EmbeddingModel:
|
||||
)
|
||||
|
||||
embeddings: list[Embedding] = []
|
||||
|
||||
def process_batch(
|
||||
batch_idx: int, text_batch: list[str]
|
||||
) -> tuple[int, list[Embedding]]:
|
||||
for idx, text_batch in enumerate(text_batches, start=1):
|
||||
if self.callback:
|
||||
if self.callback.should_stop():
|
||||
raise RuntimeError("_batch_encode_texts detected stop signal")
|
||||
|
||||
logger.debug(f"Encoding batch {batch_idx} of {len(text_batches)}")
|
||||
logger.debug(f"Encoding batch {idx} of {len(text_batches)}")
|
||||
embed_request = EmbedRequest(
|
||||
model_name=self.model_name,
|
||||
texts=text_batch,
|
||||
@@ -192,43 +185,10 @@ class EmbeddingModel:
|
||||
)
|
||||
|
||||
response = self._make_model_server_request(embed_request)
|
||||
return batch_idx, response.embeddings
|
||||
|
||||
# only multi thread if:
|
||||
# 1. num_threads is greater than 1
|
||||
# 2. we are using an API-based embedding model (provider_type is not None)
|
||||
# 3. there are more than 1 batch (no point in threading if only 1)
|
||||
if num_threads >= 1 and self.provider_type and len(text_batches) > 1:
|
||||
with ThreadPoolExecutor(max_workers=num_threads) as executor:
|
||||
future_to_batch = {
|
||||
executor.submit(process_batch, idx, batch): idx
|
||||
for idx, batch in enumerate(text_batches, start=1)
|
||||
}
|
||||
|
||||
# Collect results in order
|
||||
batch_results: list[tuple[int, list[Embedding]]] = []
|
||||
for future in as_completed(future_to_batch):
|
||||
try:
|
||||
result = future.result()
|
||||
batch_results.append(result)
|
||||
if self.callback:
|
||||
self.callback.progress("_batch_encode_texts", 1)
|
||||
except Exception as e:
|
||||
logger.exception("Embedding model failed to process batch")
|
||||
raise e
|
||||
|
||||
# Sort by batch index and extend embeddings
|
||||
batch_results.sort(key=lambda x: x[0])
|
||||
for _, batch_embeddings in batch_results:
|
||||
embeddings.extend(batch_embeddings)
|
||||
else:
|
||||
# Original sequential processing
|
||||
for idx, text_batch in enumerate(text_batches, start=1):
|
||||
_, batch_embeddings = process_batch(idx, text_batch)
|
||||
embeddings.extend(batch_embeddings)
|
||||
if self.callback:
|
||||
self.callback.progress("_batch_encode_texts", 1)
|
||||
embeddings.extend(response.embeddings)
|
||||
|
||||
if self.callback:
|
||||
self.callback.progress("_batch_encode_texts", 1)
|
||||
return embeddings
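The threading change above boils down to: submit each batch, collect results as they complete, then restore batch order before concatenating. A self-contained sketch of that pattern, with a fake `embed` call standing in for the model-server request:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed


def embed(batch: list[str]) -> list[list[float]]:
    # Stand-in for the real model-server request.
    return [[float(len(text))] for text in batch]


def encode_batches(text_batches: list[list[str]], num_threads: int = 4) -> list[list[float]]:
    embeddings: list[list[float]] = []
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        future_to_idx = {
            executor.submit(embed, batch): idx
            for idx, batch in enumerate(text_batches)
        }
        results: list[tuple[int, list[list[float]]]] = []
        for future in as_completed(future_to_idx):
            results.append((future_to_idx[future], future.result()))

    # Completion order is arbitrary, so sort by batch index before flattening.
    results.sort(key=lambda pair: pair[0])
    for _, batch_embeddings in results:
        embeddings.extend(batch_embeddings)
    return embeddings


print(encode_batches([["a", "bb"], ["ccc"], ["dddd", "e"]]))
# [[1.0], [2.0], [3.0], [4.0], [1.0]]
```
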
def encode(
|
||||
|
||||
@@ -537,36 +537,30 @@ def prefilter_requests(req: SocketModeRequest, client: TenantSocketModeClient) -
|
||||
# Let the tag flow handle this case, don't reply twice
|
||||
return False
|
||||
|
||||
# Check if this is a bot message (either via bot_profile or bot_message subtype)
|
||||
is_bot_message = bool(
|
||||
event.get("bot_profile") or event.get("subtype") == "bot_message"
|
||||
)
|
||||
if is_bot_message:
|
||||
if event.get("bot_profile"):
|
||||
channel_name, _ = get_channel_name_from_id(
|
||||
client=client.web_client, channel_id=channel
|
||||
)
|
||||
|
||||
with get_session_with_tenant(client.tenant_id) as db_session:
|
||||
slack_channel_config = get_slack_channel_config_for_bot_and_channel(
|
||||
db_session=db_session,
|
||||
slack_bot_id=client.slack_bot_id,
|
||||
channel_name=channel_name,
|
||||
)
|
||||
|
||||
# If OnyxBot is not specifically tagged and the channel is not set to respond to bots, ignore the message
|
||||
if (not bot_tag_id or bot_tag_id not in msg) and (
|
||||
not slack_channel_config
|
||||
or not slack_channel_config.channel_config.get("respond_to_bots")
|
||||
):
|
||||
channel_specific_logger.info(
|
||||
"Ignoring message from bot since respond_to_bots is disabled"
|
||||
)
|
||||
channel_specific_logger.info("Ignoring message from bot")
|
||||
return False
|
||||
|
||||
# Ignore things like channel_join, channel_leave, etc.
|
||||
# NOTE: "file_share" is just a message with a file attachment, so we
|
||||
# should not ignore it
|
||||
message_subtype = event.get("subtype")
|
||||
if message_subtype not in [None, "file_share", "bot_message"]:
|
||||
if message_subtype not in [None, "file_share"]:
|
||||
channel_specific_logger.info(
|
||||
f"Ignoring message with subtype '{message_subtype}' since it is a special message type"
|
||||
)
|
||||
|
||||
@@ -92,7 +92,7 @@ class RedisConnectorPrune:
|
||||
if fence_bytes is None:
|
||||
return None
|
||||
|
||||
fence_int = int(cast(bytes, fence_bytes))
|
||||
fence_int = cast(int, fence_bytes)
|
||||
return fence_int
|
||||
|
||||
@generator_complete.setter
|
||||
|
||||
@@ -16,7 +16,7 @@ from onyx.context.search.preprocessing.access_filters import (
from onyx.db.document_set import get_document_sets_by_ids
from onyx.db.models import StarterMessageModel as StarterMessage
from onyx.db.models import User
from onyx.db.search_settings import get_active_search_settings
from onyx.document_index.document_index_utils import get_both_index_names
from onyx.document_index.factory import get_default_document_index
from onyx.llm.factory import get_default_llms
from onyx.prompts.starter_messages import format_persona_starter_message_prompt
@@ -34,11 +34,8 @@ def get_random_chunks_from_doc_sets(
    """
    Retrieves random chunks from the specified document sets.
    """
    active_search_settings = get_active_search_settings(db_session)
    document_index = get_default_document_index(
        search_settings=active_search_settings.primary,
        secondary_search_settings=active_search_settings.secondary,
    )
    curr_ind_name, sec_ind_name = get_both_index_names(db_session)
    document_index = get_default_document_index(curr_ind_name, sec_ind_name)

    acl_filters = build_access_filters_for_user(user, db_session)
    filters = IndexFilters(document_set=doc_sets, access_control_list=acl_filters)
@@ -3,7 +3,6 @@ import json
import os
from typing import cast

from sqlalchemy import update
from sqlalchemy.orm import Session

from onyx.access.models import default_public_access
@@ -24,7 +23,6 @@ from onyx.db.document import check_docs_exist
from onyx.db.enums import AccessType
from onyx.db.enums import ConnectorCredentialPairStatus
from onyx.db.index_attempt import mock_successful_index_attempt
from onyx.db.models import Document as DbDocument
from onyx.db.search_settings import get_current_search_settings
from onyx.document_index.factory import get_default_document_index
from onyx.document_index.interfaces import IndexBatchParams
@@ -61,7 +59,6 @@ def _create_indexable_chunks(
            doc_updated_at=None,
            primary_owners=[],
            secondary_owners=[],
            chunk_count=1,
        )
        if preprocessed_doc["chunk_ind"] == 0:
            ids_to_documents[document.id] = document
@@ -158,7 +155,9 @@ def seed_initial_documents(
        logger.info("Embedding model has been updated, skipping")
        return

    document_index = get_default_document_index(search_settings, None)
    document_index = get_default_document_index(
        primary_index_name=search_settings.index_name, secondary_index_name=None
    )

    # Create a connector so the user can delete it if they want
    # or reindex it with a new search model if they want
@@ -241,12 +240,4 @@ def seed_initial_documents(
        db_session=db_session,
    )

    # Since we bypass the indexing flow, we need to manually update the chunk count
    for doc in docs:
        db_session.execute(
            update(DbDocument)
            .where(DbDocument.id == doc.id)
            .values(chunk_count=doc.chunk_count)
        )

    kv_store.store(KV_DOCUMENTS_SEEDED_KEY, True)
@@ -15,9 +15,6 @@ from onyx.background.celery.celery_utils import get_deletion_attempt_snapshot
from onyx.background.celery.tasks.doc_permission_syncing.tasks import (
    try_creating_permissions_sync_task,
)
from onyx.background.celery.tasks.external_group_syncing.tasks import (
    try_creating_external_group_sync_task,
)
from onyx.background.celery.tasks.pruning.tasks import (
    try_creating_prune_generator_task,
)
@@ -42,7 +39,7 @@ from onyx.db.index_attempt import get_latest_index_attempt_for_cc_pair_id
from onyx.db.index_attempt import get_paginated_index_attempts_for_cc_pair_id
from onyx.db.models import SearchSettings
from onyx.db.models import User
from onyx.db.search_settings import get_active_search_settings_list
from onyx.db.search_settings import get_active_search_settings
from onyx.db.search_settings import get_current_search_settings
from onyx.redis.redis_connector import RedisConnector
from onyx.redis.redis_pool import get_redis_client
@@ -192,7 +189,7 @@ def update_cc_pair_status(
    if status_update_request.status == ConnectorCredentialPairStatus.PAUSED:
        redis_connector.stop.set_fence(True)

    search_settings_list: list[SearchSettings] = get_active_search_settings_list(
    search_settings_list: list[SearchSettings] = get_active_search_settings(
        db_session
    )
@@ -446,78 +443,6 @@ def sync_cc_pair(
|
||||
)
|
||||
|
||||
|
||||
@router.get("/admin/cc-pair/{cc_pair_id}/sync-groups")
|
||||
def get_cc_pair_latest_group_sync(
|
||||
cc_pair_id: int,
|
||||
user: User = Depends(current_curator_or_admin_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> datetime | None:
|
||||
cc_pair = get_connector_credential_pair_from_id_for_user(
|
||||
cc_pair_id=cc_pair_id,
|
||||
db_session=db_session,
|
||||
user=user,
|
||||
get_editable=False,
|
||||
)
|
||||
if not cc_pair:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="cc_pair not found for current user's permissions",
|
||||
)
|
||||
|
||||
return cc_pair.last_time_external_group_sync
|
||||
|
||||
|
||||
@router.post("/admin/cc-pair/{cc_pair_id}/sync-groups")
|
||||
def sync_cc_pair_groups(
|
||||
cc_pair_id: int,
|
||||
user: User = Depends(current_curator_or_admin_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
tenant_id: str | None = Depends(get_current_tenant_id),
|
||||
) -> StatusResponse[list[int]]:
|
||||
"""Triggers group sync on a particular cc_pair immediately"""
|
||||
|
||||
cc_pair = get_connector_credential_pair_from_id_for_user(
|
||||
cc_pair_id=cc_pair_id,
|
||||
db_session=db_session,
|
||||
user=user,
|
||||
get_editable=False,
|
||||
)
|
||||
if not cc_pair:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Connection not found for current user's permissions",
|
||||
)
|
||||
|
||||
r = get_redis_client(tenant_id=tenant_id)
|
||||
|
||||
redis_connector = RedisConnector(tenant_id, cc_pair_id)
|
||||
if redis_connector.external_group_sync.fenced:
|
||||
raise HTTPException(
|
||||
status_code=HTTPStatus.CONFLICT,
|
||||
detail="External group sync task already in progress.",
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"External group sync cc_pair={cc_pair_id} "
|
||||
f"connector_id={cc_pair.connector_id} "
|
||||
f"credential_id={cc_pair.credential_id} "
|
||||
f"{cc_pair.connector.name} connector."
|
||||
)
|
||||
tasks_created = try_creating_external_group_sync_task(
|
||||
primary_app, cc_pair_id, r, CURRENT_TENANT_ID_CONTEXTVAR.get()
|
||||
)
|
||||
if not tasks_created:
|
||||
raise HTTPException(
|
||||
status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
|
||||
detail="External group sync task creation failed.",
|
||||
)
|
||||
|
||||
return StatusResponse(
|
||||
success=True,
|
||||
message="Successfully created the external group sync task.",
|
||||
)
|
||||
|
||||
|
||||
@router.get("/admin/cc-pair/{cc_pair_id}/get-docs-sync-status")
|
||||
def get_docs_sync_status(
|
||||
cc_pair_id: int,
|
||||
|
||||
@@ -32,7 +32,10 @@ def get_document_info(
    db_session: Session = Depends(get_session),
) -> DocumentInfo:
    search_settings = get_current_search_settings(db_session)
    document_index = get_default_document_index(search_settings, None)

    document_index = get_default_document_index(
        primary_index_name=search_settings.index_name, secondary_index_name=None
    )

    user_acl_filters = build_access_filters_for_user(user, db_session)
    inference_chunks = document_index.id_based_retrieval(
@@ -76,7 +79,10 @@ def get_chunk_info(
    db_session: Session = Depends(get_session),
) -> ChunkInfo:
    search_settings = get_current_search_settings(db_session)
    document_index = get_default_document_index(search_settings, None)

    document_index = get_default_document_index(
        primary_index_name=search_settings.index_name, secondary_index_name=None
    )

    user_acl_filters = build_access_filters_for_user(user, db_session)
    chunk_request = VespaChunkRequest(
@@ -357,7 +357,6 @@ class ConnectorCredentialPairDescriptor(BaseModel):
    name: str | None = None
    connector: ConnectorSnapshot
    credential: CredentialSnapshot
    access_type: AccessType


class RunConnectorRequest(BaseModel):

@@ -68,7 +68,6 @@ class DocumentSet(BaseModel):
                credential=CredentialSnapshot.from_credential_db_model(
                    cc_pair.credential
                ),
                access_type=cc_pair.access_type,
            )
            for cc_pair in document_set_model.connector_credential_pairs
        ],
@@ -10,7 +10,6 @@ from onyx.auth.users import current_admin_user
from onyx.auth.users import current_chat_accesssible_user
from onyx.db.engine import get_session
from onyx.db.llm import fetch_existing_llm_providers
from onyx.db.llm import fetch_existing_llm_providers_for_user
from onyx.db.llm import fetch_provider
from onyx.db.llm import remove_llm_provider
from onyx.db.llm import update_default_provider
@@ -196,7 +195,5 @@ def list_llm_provider_basics(
) -> list[LLMProviderDescriptor]:
    return [
        LLMProviderDescriptor.from_model(llm_provider_model)
        for llm_provider_model in fetch_existing_llm_providers_for_user(
            db_session, user
        )
        for llm_provider_model in fetch_existing_llm_providers(db_session, user)
    ]
@@ -44,6 +44,7 @@ class UserPreferences(BaseModel):
    chosen_assistants: list[int] | None = None
    hidden_assistants: list[int] = []
    visible_assistants: list[int] = []
    recent_assistants: list[int] | None = None
    default_model: str | None = None
    auto_scroll: bool | None = None
    pinned_assistants: list[int] | None = None
@@ -22,7 +22,6 @@ from onyx.db.search_settings import get_embedding_provider_from_provider_type
from onyx.db.search_settings import get_secondary_search_settings
from onyx.db.search_settings import update_current_search_settings
from onyx.db.search_settings import update_search_settings_status
from onyx.document_index.document_index_utils import get_multipass_config
from onyx.document_index.factory import get_default_document_index
from onyx.file_processing.unstructured import delete_unstructured_api_key
from onyx.file_processing.unstructured import get_unstructured_api_key
@@ -98,9 +97,10 @@ def set_new_search_settings(
    )

    # Ensure Vespa has the new index immediately
    get_multipass_config(search_settings)
    get_multipass_config(new_search_settings)
    document_index = get_default_document_index(search_settings, new_search_settings)
    document_index = get_default_document_index(
        primary_index_name=search_settings.index_name,
        secondary_index_name=new_search_settings.index_name,
    )

    document_index.ensure_indices_exist(
        index_embedding_dim=search_settings.model_dim,
@@ -572,6 +572,59 @@ class ChosenDefaultModelRequest(BaseModel):
    default_model: str | None = None


class RecentAssistantsRequest(BaseModel):
    current_assistant: int


def update_recent_assistants(
    recent_assistants: list[int] | None, current_assistant: int
) -> list[int]:
    if recent_assistants is None:
        recent_assistants = []
    else:
        recent_assistants = [x for x in recent_assistants if x != current_assistant]

    # Add current assistant to start of list
    recent_assistants.insert(0, current_assistant)

    # Keep only the 5 most recent assistants
    recent_assistants = recent_assistants[:5]
    return recent_assistants
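For illustration only (not part of the commit), the most-recently-used behavior of update_recent_assistants above: the current assistant moves to the front, duplicates are removed, and the list is capped at five entries.

assert update_recent_assistants([3, 1, 2], 1) == [1, 3, 2]
assert update_recent_assistants(None, 7) == [7]
assert update_recent_assistants([1, 2, 3, 4, 5], 6) == [6, 1, 2, 3, 4]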
@router.patch("/user/recent-assistants")
|
||||
def update_user_recent_assistants(
|
||||
request: RecentAssistantsRequest,
|
||||
user: User | None = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> None:
|
||||
if user is None:
|
||||
if AUTH_TYPE == AuthType.DISABLED:
|
||||
store = get_kv_store()
|
||||
no_auth_user = fetch_no_auth_user(store)
|
||||
preferences = no_auth_user.preferences
|
||||
recent_assistants = preferences.recent_assistants
|
||||
updated_preferences = update_recent_assistants(
|
||||
recent_assistants, request.current_assistant
|
||||
)
|
||||
preferences.recent_assistants = updated_preferences
|
||||
set_no_auth_user_preferences(store, preferences)
|
||||
return
|
||||
else:
|
||||
raise RuntimeError("This should never happen")
|
||||
|
||||
recent_assistants = UserInfo.from_model(user).preferences.recent_assistants
|
||||
updated_recent_assistants = update_recent_assistants(
|
||||
recent_assistants, request.current_assistant
|
||||
)
|
||||
db_session.execute(
|
||||
update(User)
|
||||
.where(User.id == user.id) # type: ignore
|
||||
.values(recent_assistants=updated_recent_assistants)
|
||||
)
|
||||
db_session.commit()
|
||||
|
||||
|
||||
@router.patch("/shortcut-enabled")
|
||||
def update_user_shortcut_enabled(
|
||||
shortcut_enabled: bool,
|
||||
@@ -678,6 +731,30 @@ class ChosenAssistantsRequest(BaseModel):
|
||||
chosen_assistants: list[int]
|
||||
|
||||
|
||||
@router.patch("/user/assistant-list")
|
||||
def update_user_assistant_list(
|
||||
request: ChosenAssistantsRequest,
|
||||
user: User | None = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> None:
|
||||
if user is None:
|
||||
if AUTH_TYPE == AuthType.DISABLED:
|
||||
store = get_kv_store()
|
||||
no_auth_user = fetch_no_auth_user(store)
|
||||
no_auth_user.preferences.chosen_assistants = request.chosen_assistants
|
||||
set_no_auth_user_preferences(store, no_auth_user.preferences)
|
||||
return
|
||||
else:
|
||||
raise RuntimeError("This should never happen")
|
||||
|
||||
db_session.execute(
|
||||
update(User)
|
||||
.where(User.id == user.id) # type: ignore
|
||||
.values(chosen_assistants=request.chosen_assistants)
|
||||
)
|
||||
db_session.commit()
|
||||
|
||||
|
||||
def update_assistant_visibility(
|
||||
preferences: UserPreferences, assistant_id: int, show: bool
|
||||
) -> UserPreferences:
|
||||
|
||||
@@ -14,9 +14,9 @@ from onyx.db.document import get_ingestion_documents
|
||||
from onyx.db.engine import get_current_tenant_id
|
||||
from onyx.db.engine import get_session
|
||||
from onyx.db.models import User
|
||||
from onyx.db.search_settings import get_active_search_settings
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.db.search_settings import get_secondary_search_settings
|
||||
from onyx.document_index.document_index_utils import get_both_index_names
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.indexing.embedder import DefaultIndexingEmbedder
|
||||
from onyx.indexing.indexing_pipeline import build_indexing_pipeline
|
||||
@@ -89,10 +89,9 @@ def upsert_ingestion_doc(
|
||||
)
|
||||
|
||||
# Need to index for both the primary and secondary index if possible
|
||||
active_search_settings = get_active_search_settings(db_session)
|
||||
curr_ind_name, sec_ind_name = get_both_index_names(db_session)
|
||||
curr_doc_index = get_default_document_index(
|
||||
active_search_settings.primary,
|
||||
None,
|
||||
primary_index_name=curr_ind_name, secondary_index_name=None
|
||||
)
|
||||
|
||||
search_settings = get_current_search_settings(db_session)
|
||||
@@ -118,7 +117,11 @@ def upsert_ingestion_doc(
|
||||
)
|
||||
|
||||
# If there's a secondary index being built, index the doc but don't use it for return here
|
||||
if active_search_settings.secondary:
|
||||
if sec_ind_name:
|
||||
sec_doc_index = get_default_document_index(
|
||||
primary_index_name=curr_ind_name, secondary_index_name=None
|
||||
)
|
||||
|
||||
sec_search_settings = get_secondary_search_settings(db_session)
|
||||
|
||||
if sec_search_settings is None:
|
||||
@@ -131,10 +134,6 @@ def upsert_ingestion_doc(
|
||||
search_settings=sec_search_settings
|
||||
)
|
||||
|
||||
sec_doc_index = get_default_document_index(
|
||||
active_search_settings.secondary, None
|
||||
)
|
||||
|
||||
sec_ind_pipeline = build_indexing_pipeline(
|
||||
embedder=new_index_embedding_model,
|
||||
document_index=sec_doc_index,
|
||||
|
||||
@@ -672,25 +672,23 @@ def upload_files_for_chat(
|
||||
else ChatFileType.PLAIN_TEXT
|
||||
)
|
||||
|
||||
file_content = file.file.read() # Read the file content
|
||||
|
||||
if file_type == ChatFileType.IMAGE:
|
||||
file_content_io = file.file
|
||||
file_content = file.file
|
||||
# NOTE: Image conversion to JPEG used to be enforced here.
|
||||
# This was removed to:
|
||||
# 1. Preserve original file content for downloads
|
||||
# 2. Maintain transparency in formats like PNG
|
||||
# 3. Ameliorate issue with file conversion
|
||||
else:
|
||||
file_content_io = io.BytesIO(file_content)
|
||||
file_content = io.BytesIO(file.file.read())
|
||||
|
||||
new_content_type = file.content_type
|
||||
|
||||
# Store the file normally
|
||||
# store the file (now JPEG for images)
|
||||
file_id = str(uuid.uuid4())
|
||||
file_store.save_file(
|
||||
file_name=file_id,
|
||||
content=file_content_io,
|
||||
content=file_content,
|
||||
display_name=file.filename,
|
||||
file_origin=FileOrigin.CHAT_UPLOAD,
|
||||
file_type=new_content_type or file_type.value,
|
||||
@@ -700,7 +698,7 @@ def upload_files_for_chat(
|
||||
# to re-extract it every time we send a message
|
||||
if file_type == ChatFileType.DOC:
|
||||
extracted_text = extract_file_text(
|
||||
file=io.BytesIO(file_content), # use the bytes we already read
|
||||
file=file.file,
|
||||
file_name=file.filename or "",
|
||||
)
|
||||
text_file_id = str(uuid.uuid4())
|
||||
|
||||
@@ -64,8 +64,9 @@ def admin_search(
|
||||
tenant_id=tenant_id,
|
||||
)
|
||||
search_settings = get_current_search_settings(db_session)
|
||||
document_index = get_default_document_index(search_settings, None)
|
||||
|
||||
document_index = get_default_document_index(
|
||||
primary_index_name=search_settings.index_name, secondary_index_name=None
|
||||
)
|
||||
if not isinstance(document_index, VespaIndex):
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
|
||||
@@ -25,7 +25,6 @@ from onyx.db.llm import fetch_default_provider
|
||||
from onyx.db.llm import update_default_provider
|
||||
from onyx.db.llm import upsert_llm_provider
|
||||
from onyx.db.persona import delete_old_default_personas
|
||||
from onyx.db.search_settings import get_active_search_settings
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.db.search_settings import get_secondary_search_settings
|
||||
from onyx.db.search_settings import update_current_search_settings
|
||||
@@ -71,19 +70,8 @@ def setup_onyx(
|
||||
The Tenant Service calls the tenants/create endpoint which runs this.
|
||||
"""
|
||||
check_index_swap(db_session=db_session)
|
||||
|
||||
active_search_settings = get_active_search_settings(db_session)
|
||||
search_settings = active_search_settings.primary
|
||||
secondary_search_settings = active_search_settings.secondary
|
||||
|
||||
# search_settings = get_current_search_settings(db_session)
|
||||
# multipass_config_1 = get_multipass_config(search_settings)
|
||||
|
||||
# secondary_large_chunks_enabled: bool | None = None
|
||||
# secondary_search_settings = get_secondary_search_settings(db_session)
|
||||
# if secondary_search_settings:
|
||||
# multipass_config_2 = get_multipass_config(secondary_search_settings)
|
||||
# secondary_large_chunks_enabled = multipass_config_2.enable_large_chunks
|
||||
search_settings = get_current_search_settings(db_session)
|
||||
secondary_search_settings = get_secondary_search_settings(db_session)
|
||||
|
||||
# Break bad state for thrashing indexes
|
||||
if secondary_search_settings and DISABLE_INDEX_UPDATE_ON_SWAP:
|
||||
@@ -134,8 +122,10 @@ def setup_onyx(
|
||||
# takes a bit of time to start up
|
||||
logger.notice("Verifying Document Index(s) is/are available.")
|
||||
document_index = get_default_document_index(
|
||||
search_settings,
|
||||
secondary_search_settings,
|
||||
primary_index_name=search_settings.index_name,
|
||||
secondary_index_name=secondary_search_settings.index_name
|
||||
if secondary_search_settings
|
||||
else None,
|
||||
)
|
||||
|
||||
success = setup_vespa(
|
||||
|
||||
@@ -220,13 +220,6 @@ class InternetSearchTool(Tool):
        )
        results = response.json()

        # If no hits, Bing does not include the webPages key
        search_results = (
            results["webPages"]["value"][: self.num_results]
            if "webPages" in results
            else []
        )

        return InternetSearchResponse(
            revised_query=query,
            internet_results=[
@@ -235,7 +228,7 @@ class InternetSearchTool(Tool):
                    link=result["url"],
                    snippet=result["snippet"],
                )
                for result in search_results
                for result in results["webPages"]["value"][: self.num_results]
            ],
        )
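A small standalone sketch (placeholder names, not from the commit) of the defensive-access pattern shown above for Bing responses that omit the webPages key when there are no hits:

def extract_hits(results: dict, limit: int) -> list[dict]:
    # Fall back to an empty list instead of indexing into a key that may be absent.
    return results["webPages"]["value"][:limit] if "webPages" in results else []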
@@ -81,7 +81,6 @@ hubspot-api-client==8.1.0
asana==5.0.8
dropbox==11.36.2
boto3-stubs[s3]==1.34.133
shapely==2.0.6
stripe==10.12.0
urllib3==2.2.3
mistune==0.8.4
@@ -7,7 +7,6 @@ from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.db.document import delete_documents_complete__no_commit
|
||||
from onyx.db.enums import ConnectorCredentialPairStatus
|
||||
from onyx.db.search_settings import get_active_search_settings
|
||||
|
||||
# Modify sys.path
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
@@ -39,6 +38,7 @@ from onyx.db.connector_credential_pair import (
|
||||
from onyx.db.engine import get_session_context_manager
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.file_store.file_store import get_default_file_store
|
||||
from onyx.document_index.document_index_utils import get_both_index_names
|
||||
|
||||
# pylint: enable=E402
|
||||
# flake8: noqa: E402
|
||||
@@ -191,10 +191,9 @@ def _delete_connector(cc_pair_id: int, db_session: Session) -> None:
|
||||
)
|
||||
try:
|
||||
logger.notice("Deleting information from Vespa and Postgres")
|
||||
active_search_settings = get_active_search_settings(db_session)
|
||||
curr_ind_name, sec_ind_name = get_both_index_names(db_session)
|
||||
document_index = get_default_document_index(
|
||||
active_search_settings.primary,
|
||||
active_search_settings.secondary,
|
||||
primary_index_name=curr_ind_name, secondary_index_name=sec_ind_name
|
||||
)
|
||||
|
||||
files_deleted_count = _unsafe_deletion(
|
||||
|
||||
@@ -5,8 +5,6 @@ import sys
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.document_index.document_index_utils import get_multipass_config
|
||||
|
||||
# makes it so `PYTHONPATH=.` is not required when running this script
|
||||
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.append(parent_dir)
|
||||
@@ -56,14 +54,8 @@ def main() -> None:
|
||||
|
||||
# Setup Vespa index
|
||||
search_settings = get_current_search_settings(db_session)
|
||||
multipass_config = get_multipass_config(search_settings)
|
||||
index_name = search_settings.index_name
|
||||
vespa_index = VespaIndex(
|
||||
index_name=index_name,
|
||||
secondary_index_name=None,
|
||||
large_chunks_enabled=multipass_config.enable_large_chunks,
|
||||
secondary_large_chunks_enabled=None,
|
||||
)
|
||||
vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None)
|
||||
|
||||
# Delete chunks from Vespa first
|
||||
print("Deleting orphaned document chunks from Vespa")
|
||||
|
||||
@@ -16,7 +16,6 @@ from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.db.engine import get_session_context_manager
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.document_index.document_index_utils import get_multipass_config
|
||||
from onyx.document_index.vespa.index import VespaIndex
|
||||
from onyx.indexing.indexing_pipeline import IndexBatchParams
|
||||
from onyx.indexing.models import ChunkEmbedding
|
||||
@@ -134,16 +133,10 @@ def seed_dummy_docs(
|
||||
) -> None:
|
||||
with get_session_context_manager() as db_session:
|
||||
search_settings = get_current_search_settings(db_session)
|
||||
multipass_config = get_multipass_config(search_settings)
|
||||
index_name = search_settings.index_name
|
||||
embedding_dim = search_settings.model_dim
|
||||
|
||||
vespa_index = VespaIndex(
|
||||
index_name=index_name,
|
||||
secondary_index_name=None,
|
||||
large_chunks_enabled=multipass_config.enable_large_chunks,
|
||||
secondary_large_chunks_enabled=None,
|
||||
)
|
||||
vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None)
|
||||
print(index_name)
|
||||
|
||||
all_chunks = []
|
||||
|
||||
@@ -9,7 +9,6 @@ from onyx.configs.model_configs import DOC_EMBEDDING_DIM
|
||||
from onyx.context.search.models import IndexFilters
|
||||
from onyx.db.engine import get_session_context_manager
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.document_index.document_index_utils import get_multipass_config
|
||||
from onyx.document_index.vespa.index import VespaIndex
|
||||
from scripts.query_time_check.seed_dummy_docs import TOTAL_ACL_ENTRIES_PER_CATEGORY
|
||||
from scripts.query_time_check.seed_dummy_docs import TOTAL_DOC_SETS
|
||||
@@ -63,15 +62,9 @@ def test_hybrid_retrieval_times(
|
||||
) -> None:
|
||||
with get_session_context_manager() as db_session:
|
||||
search_settings = get_current_search_settings(db_session)
|
||||
multipass_config = get_multipass_config(search_settings)
|
||||
index_name = search_settings.index_name
|
||||
|
||||
vespa_index = VespaIndex(
|
||||
index_name=index_name,
|
||||
secondary_index_name=None,
|
||||
large_chunks_enabled=multipass_config.enable_large_chunks,
|
||||
secondary_large_chunks_enabled=None,
|
||||
)
|
||||
vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None)
|
||||
|
||||
# Generate random queries
|
||||
queries = [f"Random Query {i}" for i in range(number_of_queries)]
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
import os
|
||||
from unittest.mock import MagicMock
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from pydantic import BaseModel
|
||||
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.airtable.airtable_connector import AirtableConnector
|
||||
@@ -10,25 +10,26 @@ from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
|
||||
|
||||
class AirtableConfig(BaseModel):
|
||||
base_id: str
|
||||
table_identifier: str
|
||||
access_token: str
|
||||
|
||||
|
||||
@pytest.fixture(params=[True, False])
|
||||
def airtable_config(request: pytest.FixtureRequest) -> AirtableConfig:
|
||||
table_identifier = (
|
||||
os.environ["AIRTABLE_TEST_TABLE_NAME"]
|
||||
if request.param
|
||||
else os.environ["AIRTABLE_TEST_TABLE_ID"]
|
||||
)
|
||||
return AirtableConfig(
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
("table_name", os.environ["AIRTABLE_TEST_TABLE_NAME"]),
|
||||
("table_id", os.environ["AIRTABLE_TEST_TABLE_ID"]),
|
||||
]
|
||||
)
|
||||
def airtable_connector(request: pytest.FixtureRequest) -> AirtableConnector:
|
||||
param_type, table_identifier = request.param
|
||||
connector = AirtableConnector(
|
||||
base_id=os.environ["AIRTABLE_TEST_BASE_ID"],
|
||||
table_identifier=table_identifier,
|
||||
access_token=os.environ["AIRTABLE_ACCESS_TOKEN"],
|
||||
table_name_or_id=table_identifier,
|
||||
)
|
||||
|
||||
connector.load_credentials(
|
||||
{
|
||||
"airtable_access_token": os.environ["AIRTABLE_ACCESS_TOKEN"],
|
||||
}
|
||||
)
|
||||
return connector
|
||||
|
||||
|
||||
def create_test_document(
|
||||
id: str,
|
||||
@@ -45,37 +46,18 @@ def create_test_document(
|
||||
assignee: str,
|
||||
days_since_status_change: int | None,
|
||||
attachments: list[tuple[str, str]] | None = None,
|
||||
all_fields_as_metadata: bool = False,
|
||||
) -> Document:
|
||||
base_id = os.environ.get("AIRTABLE_TEST_BASE_ID")
|
||||
table_id = os.environ.get("AIRTABLE_TEST_TABLE_ID")
|
||||
missing_vars = []
|
||||
if not base_id:
|
||||
missing_vars.append("AIRTABLE_TEST_BASE_ID")
|
||||
if not table_id:
|
||||
missing_vars.append("AIRTABLE_TEST_TABLE_ID")
|
||||
|
||||
if missing_vars:
|
||||
raise RuntimeError(
|
||||
f"Required environment variables not set: {', '.join(missing_vars)}. "
|
||||
"These variables are required to run Airtable connector tests."
|
||||
)
|
||||
link_base = f"https://airtable.com/{base_id}/{table_id}"
|
||||
sections = []
|
||||
|
||||
if not all_fields_as_metadata:
|
||||
sections.extend(
|
||||
[
|
||||
Section(
|
||||
text=f"Title:\n------------------------\n{title}\n------------------------",
|
||||
link=f"{link_base}/{id}",
|
||||
),
|
||||
Section(
|
||||
text=f"Description:\n------------------------\n{description}\n------------------------",
|
||||
link=f"{link_base}/{id}",
|
||||
),
|
||||
]
|
||||
)
|
||||
link_base = f"https://airtable.com/{os.environ['AIRTABLE_TEST_BASE_ID']}/{os.environ['AIRTABLE_TEST_TABLE_ID']}"
|
||||
sections = [
|
||||
Section(
|
||||
text=f"Title:\n------------------------\n{title}\n------------------------",
|
||||
link=f"{link_base}/{id}",
|
||||
),
|
||||
Section(
|
||||
text=f"Description:\n------------------------\n{description}\n------------------------",
|
||||
link=f"{link_base}/{id}",
|
||||
),
|
||||
]
|
||||
|
||||
if attachments:
|
||||
for attachment_text, attachment_link in attachments:
|
||||
@@ -86,36 +68,26 @@ def create_test_document(
|
||||
),
|
||||
)
|
||||
|
||||
metadata: dict[str, str | list[str]] = {
|
||||
# "Category": category,
|
||||
"Assignee": assignee,
|
||||
"Submitted by": submitted_by,
|
||||
"Priority": priority,
|
||||
"Status": status,
|
||||
"Created time": created_time,
|
||||
"ID": ticket_id,
|
||||
"Status last changed": status_last_changed,
|
||||
**(
|
||||
{"Days since status change": str(days_since_status_change)}
|
||||
if days_since_status_change is not None
|
||||
else {}
|
||||
),
|
||||
}
|
||||
|
||||
if all_fields_as_metadata:
|
||||
metadata.update(
|
||||
{
|
||||
"Title": title,
|
||||
"Description": description,
|
||||
}
|
||||
)
|
||||
|
||||
return Document(
|
||||
id=f"airtable__{id}",
|
||||
sections=sections,
|
||||
source=DocumentSource.AIRTABLE,
|
||||
semantic_identifier=f"{os.environ.get('AIRTABLE_TEST_TABLE_NAME', '')}: {title}",
|
||||
metadata=metadata,
|
||||
semantic_identifier=f"{os.environ['AIRTABLE_TEST_TABLE_NAME']}: {title}",
|
||||
metadata={
|
||||
# "Category": category,
|
||||
"Assignee": assignee,
|
||||
"Submitted by": submitted_by,
|
||||
"Priority": priority,
|
||||
"Status": status,
|
||||
"Created time": created_time,
|
||||
"ID": ticket_id,
|
||||
"Status last changed": status_last_changed,
|
||||
**(
|
||||
{"Days since status change": str(days_since_status_change)}
|
||||
if days_since_status_change is not None
|
||||
else {}
|
||||
),
|
||||
},
|
||||
doc_updated_at=None,
|
||||
primary_owners=None,
|
||||
secondary_owners=None,
|
||||
@@ -125,75 +97,15 @@ def create_test_document(
|
||||
)
|
||||
|
||||
|
||||
def compare_documents(
|
||||
actual_docs: list[Document], expected_docs: list[Document]
|
||||
) -> None:
|
||||
"""Utility function to compare actual and expected documents, ignoring order."""
|
||||
actual_docs_dict = {doc.id: doc for doc in actual_docs}
|
||||
expected_docs_dict = {doc.id: doc for doc in expected_docs}
|
||||
|
||||
assert actual_docs_dict.keys() == expected_docs_dict.keys(), "Document ID mismatch"
|
||||
|
||||
for doc_id in actual_docs_dict:
|
||||
actual = actual_docs_dict[doc_id]
|
||||
expected = expected_docs_dict[doc_id]
|
||||
|
||||
assert (
|
||||
actual.source == expected.source
|
||||
), f"Source mismatch for document {doc_id}"
|
||||
assert (
|
||||
actual.semantic_identifier == expected.semantic_identifier
|
||||
), f"Semantic identifier mismatch for document {doc_id}"
|
||||
assert (
|
||||
actual.metadata == expected.metadata
|
||||
), f"Metadata mismatch for document {doc_id}"
|
||||
assert (
|
||||
actual.doc_updated_at == expected.doc_updated_at
|
||||
), f"Updated at mismatch for document {doc_id}"
|
||||
assert (
|
||||
actual.primary_owners == expected.primary_owners
|
||||
), f"Primary owners mismatch for document {doc_id}"
|
||||
assert (
|
||||
actual.secondary_owners == expected.secondary_owners
|
||||
), f"Secondary owners mismatch for document {doc_id}"
|
||||
assert actual.title == expected.title, f"Title mismatch for document {doc_id}"
|
||||
assert (
|
||||
actual.from_ingestion_api == expected.from_ingestion_api
|
||||
), f"Ingestion API flag mismatch for document {doc_id}"
|
||||
assert (
|
||||
actual.additional_info == expected.additional_info
|
||||
), f"Additional info mismatch for document {doc_id}"
|
||||
|
||||
# Compare sections
|
||||
assert len(actual.sections) == len(
|
||||
expected.sections
|
||||
), f"Number of sections mismatch for document {doc_id}"
|
||||
for i, (actual_section, expected_section) in enumerate(
|
||||
zip(actual.sections, expected.sections)
|
||||
):
|
||||
assert (
|
||||
actual_section.text == expected_section.text
|
||||
), f"Section {i} text mismatch for document {doc_id}"
|
||||
assert (
|
||||
actual_section.link == expected_section.link
|
||||
), f"Section {i} link mismatch for document {doc_id}"
|
||||
|
||||
|
||||
@patch(
|
||||
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
|
||||
return_value=None,
|
||||
)
|
||||
def test_airtable_connector_basic(
|
||||
mock_get_unstructured_api_key: MagicMock, airtable_config: AirtableConfig
|
||||
mock_get_api_key: MagicMock, airtable_connector: AirtableConnector
|
||||
) -> None:
|
||||
"""Test behavior when all non-attachment fields are treated as metadata."""
|
||||
connector = AirtableConnector(
|
||||
base_id=airtable_config.base_id,
|
||||
table_name_or_id=airtable_config.table_identifier,
|
||||
treat_all_non_attachment_fields_as_metadata=False,
|
||||
)
|
||||
connector.load_credentials(
|
||||
{
|
||||
"airtable_access_token": airtable_config.access_token,
|
||||
}
|
||||
)
|
||||
doc_batch_generator = connector.load_from_state()
|
||||
doc_batch_generator = airtable_connector.load_from_state()
|
||||
|
||||
doc_batch = next(doc_batch_generator)
|
||||
with pytest.raises(StopIteration):
|
||||
next(doc_batch_generator)
|
||||
@@ -207,62 +119,15 @@ def test_airtable_connector_basic(
|
||||
description="The internet connection is very slow.",
|
||||
priority="Medium",
|
||||
status="In Progress",
|
||||
# Link to another record is skipped for now
|
||||
# category="Data Science",
|
||||
ticket_id="2",
|
||||
created_time="2024-12-24T21:02:49.000Z",
|
||||
status_last_changed="2024-12-24T21:02:49.000Z",
|
||||
days_since_status_change=0,
|
||||
assignee="Chris Weaver (chris@onyx.app)",
|
||||
submitted_by="Chris Weaver (chris@onyx.app)",
|
||||
all_fields_as_metadata=False,
|
||||
),
|
||||
create_test_document(
|
||||
id="reccSlIA4pZEFxPBg",
|
||||
title="Printer Issue",
|
||||
description="The office printer is not working.",
|
||||
priority="High",
|
||||
status="Open",
|
||||
ticket_id="1",
|
||||
created_time="2024-12-24T21:02:49.000Z",
|
||||
status_last_changed="2024-12-24T21:02:49.000Z",
|
||||
days_since_status_change=0,
|
||||
assignee="Chris Weaver (chris@onyx.app)",
|
||||
submitted_by="Chris Weaver (chris@onyx.app)",
|
||||
attachments=[
|
||||
(
|
||||
"Test.pdf:\ntesting!!!",
|
||||
"https://airtable.com/appCXJqDFS4gea8tn/tblRxFQsTlBBZdRY1/viwVUEJjWPd8XYjh8/reccSlIA4pZEFxPBg/fld1u21zkJACIvAEF/attlj2UBWNEDZngCc?blocks=hide",
|
||||
)
|
||||
],
|
||||
all_fields_as_metadata=False,
|
||||
),
|
||||
]
|
||||
|
||||
# Compare documents using the utility function
|
||||
compare_documents(doc_batch, expected_docs)
|
||||
|
||||
|
||||
def test_airtable_connector_all_metadata(
|
||||
mock_get_unstructured_api_key: MagicMock, airtable_config: AirtableConfig
|
||||
) -> None:
|
||||
connector = AirtableConnector(
|
||||
base_id=airtable_config.base_id,
|
||||
table_name_or_id=airtable_config.table_identifier,
|
||||
treat_all_non_attachment_fields_as_metadata=True,
|
||||
)
|
||||
connector.load_credentials(
|
||||
{
|
||||
"airtable_access_token": airtable_config.access_token,
|
||||
}
|
||||
)
|
||||
doc_batch_generator = connector.load_from_state()
|
||||
doc_batch = next(doc_batch_generator)
|
||||
with pytest.raises(StopIteration):
|
||||
next(doc_batch_generator)
|
||||
|
||||
# NOTE: one of the rows has no attachments -> no content -> no document
|
||||
assert len(doc_batch) == 1
|
||||
|
||||
expected_docs = [
|
||||
create_test_document(
|
||||
id="reccSlIA4pZEFxPBg",
|
||||
title="Printer Issue",
|
||||
@@ -284,9 +149,50 @@ def test_airtable_connector_all_metadata(
|
||||
"https://airtable.com/appCXJqDFS4gea8tn/tblRxFQsTlBBZdRY1/viwVUEJjWPd8XYjh8/reccSlIA4pZEFxPBg/fld1u21zkJACIvAEF/attlj2UBWNEDZngCc?blocks=hide",
|
||||
)
|
||||
],
|
||||
all_fields_as_metadata=True,
|
||||
),
|
||||
]
|
||||
|
||||
# Compare documents using the utility function
|
||||
compare_documents(doc_batch, expected_docs)
|
||||
# Compare each document field by field
|
||||
for actual, expected in zip(doc_batch, expected_docs):
|
||||
assert actual.id == expected.id, f"ID mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual.source == expected.source
|
||||
), f"Source mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual.semantic_identifier == expected.semantic_identifier
|
||||
), f"Semantic identifier mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual.metadata == expected.metadata
|
||||
), f"Metadata mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual.doc_updated_at == expected.doc_updated_at
|
||||
), f"Updated at mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual.primary_owners == expected.primary_owners
|
||||
), f"Primary owners mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual.secondary_owners == expected.secondary_owners
|
||||
), f"Secondary owners mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual.title == expected.title
|
||||
), f"Title mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual.from_ingestion_api == expected.from_ingestion_api
|
||||
), f"Ingestion API flag mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual.additional_info == expected.additional_info
|
||||
), f"Additional info mismatch for document {actual.id}"
|
||||
|
||||
# Compare sections
|
||||
assert len(actual.sections) == len(
|
||||
expected.sections
|
||||
), f"Number of sections mismatch for document {actual.id}"
|
||||
for i, (actual_section, expected_section) in enumerate(
|
||||
zip(actual.sections, expected.sections)
|
||||
):
|
||||
assert (
|
||||
actual_section.text == expected_section.text
|
||||
), f"Section {i} text mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual_section.link == expected_section.link
|
||||
), f"Section {i} link mismatch for document {actual.id}"
|
||||
|
||||
@@ -1,14 +0,0 @@
|
||||
from collections.abc import Generator
|
||||
from unittest.mock import MagicMock
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_get_unstructured_api_key() -> Generator[MagicMock, None, None]:
|
||||
with patch(
|
||||
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
|
||||
return_value=None,
|
||||
) as mock:
|
||||
yield mock
|
||||
@@ -1,210 +0,0 @@
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.sharepoint.connector import SharepointConnector
|
||||
|
||||
|
||||
@dataclass
|
||||
class ExpectedDocument:
|
||||
semantic_identifier: str
|
||||
content: str
|
||||
folder_path: str | None = None
|
||||
library: str = "Shared Documents" # Default to main library
|
||||
|
||||
|
||||
EXPECTED_DOCUMENTS = [
|
||||
ExpectedDocument(
|
||||
semantic_identifier="test1.docx",
|
||||
content="test1",
|
||||
folder_path="test",
|
||||
),
|
||||
ExpectedDocument(
|
||||
semantic_identifier="test2.docx",
|
||||
content="test2",
|
||||
folder_path="test/nested with spaces",
|
||||
),
|
||||
ExpectedDocument(
|
||||
semantic_identifier="should-not-index-on-specific-folder.docx",
|
||||
content="should-not-index-on-specific-folder",
|
||||
folder_path=None, # root folder
|
||||
),
|
||||
ExpectedDocument(
|
||||
semantic_identifier="other.docx",
|
||||
content="other",
|
||||
folder_path=None,
|
||||
library="Other Library",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def verify_document_metadata(doc: Document) -> None:
|
||||
"""Verify common metadata that should be present on all documents."""
|
||||
assert isinstance(doc.doc_updated_at, datetime)
|
||||
assert doc.doc_updated_at.tzinfo == timezone.utc
|
||||
assert doc.source == DocumentSource.SHAREPOINT
|
||||
assert doc.primary_owners is not None
|
||||
assert len(doc.primary_owners) == 1
|
||||
owner = doc.primary_owners[0]
|
||||
assert owner.display_name is not None
|
||||
assert owner.email is not None
|
||||
|
||||
|
||||
def verify_document_content(doc: Document, expected: ExpectedDocument) -> None:
|
||||
"""Verify a document matches its expected content."""
|
||||
assert doc.semantic_identifier == expected.semantic_identifier
|
||||
assert len(doc.sections) == 1
|
||||
assert expected.content in doc.sections[0].text
|
||||
verify_document_metadata(doc)
|
||||
|
||||
|
||||
def find_document(documents: list[Document], semantic_identifier: str) -> Document:
|
||||
"""Find a document by its semantic identifier."""
|
||||
matching_docs = [
|
||||
d for d in documents if d.semantic_identifier == semantic_identifier
|
||||
]
|
||||
assert (
|
||||
len(matching_docs) == 1
|
||||
), f"Expected exactly one document with identifier {semantic_identifier}"
|
||||
return matching_docs[0]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sharepoint_credentials() -> dict[str, str]:
|
||||
return {
|
||||
"sp_client_id": os.environ["SHAREPOINT_CLIENT_ID"],
|
||||
"sp_client_secret": os.environ["SHAREPOINT_CLIENT_SECRET"],
|
||||
"sp_directory_id": os.environ["SHAREPOINT_CLIENT_DIRECTORY_ID"],
|
||||
}
|
||||
|
||||
|
||||
def test_sharepoint_connector_specific_folder(
|
||||
mock_get_unstructured_api_key: MagicMock,
|
||||
sharepoint_credentials: dict[str, str],
|
||||
) -> None:
|
||||
# Initialize connector with the test site URL and specific folder
|
||||
connector = SharepointConnector(
|
||||
sites=[os.environ["SHAREPOINT_SITE"] + "/Shared Documents/test"]
|
||||
)
|
||||
|
||||
# Load credentials
|
||||
connector.load_credentials(sharepoint_credentials)
|
||||
|
||||
# Get all documents
|
||||
document_batches = list(connector.load_from_state())
|
||||
found_documents: list[Document] = [
|
||||
doc for batch in document_batches for doc in batch
|
||||
]
|
||||
|
||||
# Should only find documents in the test folder
|
||||
test_folder_docs = [
|
||||
doc
|
||||
for doc in EXPECTED_DOCUMENTS
|
||||
if doc.folder_path and doc.folder_path.startswith("test")
|
||||
]
|
||||
assert len(found_documents) == len(
|
||||
test_folder_docs
|
||||
), "Should only find documents in test folder"
|
||||
|
||||
# Verify each expected document
|
||||
for expected in test_folder_docs:
|
||||
doc = find_document(found_documents, expected.semantic_identifier)
|
||||
verify_document_content(doc, expected)
|
||||
|
||||
|
||||
def test_sharepoint_connector_root_folder(
|
||||
mock_get_unstructured_api_key: MagicMock,
|
||||
sharepoint_credentials: dict[str, str],
|
||||
) -> None:
|
||||
# Initialize connector with the base site URL
|
||||
connector = SharepointConnector(sites=[os.environ["SHAREPOINT_SITE"]])
|
||||
|
||||
# Load credentials
|
||||
connector.load_credentials(sharepoint_credentials)
|
||||
|
||||
# Get all documents
|
||||
document_batches = list(connector.load_from_state())
|
||||
found_documents: list[Document] = [
|
||||
doc for batch in document_batches for doc in batch
|
||||
]
|
||||
|
||||
assert len(found_documents) == len(
|
||||
EXPECTED_DOCUMENTS
|
||||
), "Should find all documents in main library"
|
||||
|
||||
# Verify each expected document
|
||||
for expected in EXPECTED_DOCUMENTS:
|
||||
doc = find_document(found_documents, expected.semantic_identifier)
|
||||
verify_document_content(doc, expected)
|
||||
|
||||
|
||||
def test_sharepoint_connector_other_library(
|
||||
mock_get_unstructured_api_key: MagicMock,
|
||||
sharepoint_credentials: dict[str, str],
|
||||
) -> None:
|
||||
# Initialize connector with the other library
|
||||
connector = SharepointConnector(
|
||||
sites=[
|
||||
os.environ["SHAREPOINT_SITE"] + "/Other Library",
|
||||
]
|
||||
)
|
||||
|
||||
# Load credentials
|
||||
connector.load_credentials(sharepoint_credentials)
|
||||
|
||||
# Get all documents
|
||||
document_batches = list(connector.load_from_state())
|
||||
found_documents: list[Document] = [
|
||||
doc for batch in document_batches for doc in batch
|
||||
]
|
||||
expected_documents: list[ExpectedDocument] = [
|
||||
doc for doc in EXPECTED_DOCUMENTS if doc.library == "Other Library"
|
||||
]
|
||||
|
||||
# Should find all documents in `Other Library`
|
||||
assert len(found_documents) == len(
|
||||
expected_documents
|
||||
), "Should find all documents in `Other Library`"
|
||||
|
||||
# Verify each expected document
|
||||
for expected in expected_documents:
|
||||
doc = find_document(found_documents, expected.semantic_identifier)
|
||||
verify_document_content(doc, expected)
|
||||
|
||||
|
||||
def test_sharepoint_connector_poll(
|
||||
mock_get_unstructured_api_key: MagicMock,
|
||||
sharepoint_credentials: dict[str, str],
|
||||
) -> None:
|
||||
# Initialize connector with the base site URL
|
||||
connector = SharepointConnector(
|
||||
sites=["https://danswerai.sharepoint.com/sites/sharepoint-tests"]
|
||||
)
|
||||
|
||||
# Load credentials
|
||||
connector.load_credentials(sharepoint_credentials)
|
||||
|
||||
# Set time window to only capture test1.docx (modified at 2025-01-28 20:51:42+00:00)
|
||||
start = datetime(2025, 1, 28, 20, 51, 30, tzinfo=timezone.utc) # 12 seconds before
|
||||
end = datetime(2025, 1, 28, 20, 51, 50, tzinfo=timezone.utc) # 8 seconds after
|
||||
|
||||
# Get documents within the time window
|
||||
document_batches = list(connector._fetch_from_sharepoint(start=start, end=end))
|
||||
found_documents: list[Document] = [
|
||||
doc for batch in document_batches for doc in batch
|
||||
]
|
||||
|
||||
# Should only find test1.docx
|
||||
assert len(found_documents) == 1, "Should only find one document in the time window"
|
||||
doc = found_documents[0]
|
||||
assert doc.semantic_identifier == "test1.docx"
|
||||
verify_document_metadata(doc)
|
||||
verify_document_content(
|
||||
doc, [d for d in EXPECTED_DOCUMENTS if d.semantic_identifier == "test1.docx"][0]
|
||||
)
|
||||
@@ -432,61 +432,30 @@ class CCPairManager:
|
||||
if user_performing_action
|
||||
else GENERAL_HEADERS,
|
||||
)
|
||||
#
|
||||
if result.status_code != 409:
|
||||
result.raise_for_status()
|
||||
|
||||
group_sync_result = requests.post(
|
||||
url=f"{API_SERVER_URL}/manage/admin/cc-pair/{cc_pair.id}/sync-groups",
|
||||
headers=user_performing_action.headers
|
||||
if user_performing_action
|
||||
else GENERAL_HEADERS,
|
||||
)
|
||||
if group_sync_result.status_code != 409:
|
||||
group_sync_result.raise_for_status()
|
||||
|
||||
@staticmethod
|
||||
def get_doc_sync_task(
|
||||
def get_sync_task(
|
||||
cc_pair: DATestCCPair,
|
||||
user_performing_action: DATestUser | None = None,
|
||||
) -> datetime | None:
|
||||
doc_sync_response = requests.get(
|
||||
response = requests.get(
|
||||
url=f"{API_SERVER_URL}/manage/admin/cc-pair/{cc_pair.id}/sync-permissions",
|
||||
headers=user_performing_action.headers
|
||||
if user_performing_action
|
||||
else GENERAL_HEADERS,
|
||||
)
|
||||
doc_sync_response.raise_for_status()
|
||||
doc_sync_response_str = doc_sync_response.json()
|
||||
response.raise_for_status()
|
||||
response_str = response.json()
|
||||
|
||||
# If the response itself is a datetime string, parse it
|
||||
if not isinstance(doc_sync_response_str, str):
|
||||
if not isinstance(response_str, str):
|
||||
return None
|
||||
|
||||
try:
|
||||
return datetime.fromisoformat(doc_sync_response_str)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def get_group_sync_task(
|
||||
cc_pair: DATestCCPair,
|
||||
user_performing_action: DATestUser | None = None,
|
||||
) -> datetime | None:
|
||||
group_sync_response = requests.get(
|
||||
url=f"{API_SERVER_URL}/manage/admin/cc-pair/{cc_pair.id}/sync-groups",
|
||||
headers=user_performing_action.headers
|
||||
if user_performing_action
|
||||
else GENERAL_HEADERS,
|
||||
)
|
||||
group_sync_response.raise_for_status()
|
||||
group_sync_response_str = group_sync_response.json()
|
||||
|
||||
# If the response itself is a datetime string, parse it
|
||||
if not isinstance(group_sync_response_str, str):
|
||||
return None
|
||||
|
||||
try:
|
||||
return datetime.fromisoformat(group_sync_response_str)
|
||||
return datetime.fromisoformat(response_str)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
@@ -529,37 +498,15 @@ class CCPairManager:
|
||||
timeout: float = MAX_DELAY,
|
||||
number_of_updated_docs: int = 0,
|
||||
user_performing_action: DATestUser | None = None,
|
||||
# Sometimes waiting for a group sync is not necessary
|
||||
should_wait_for_group_sync: bool = True,
|
||||
# Sometimes waiting for a vespa sync is not necessary
|
||||
should_wait_for_vespa_sync: bool = True,
|
||||
) -> None:
|
||||
"""after: The task register time must be after this time."""
|
||||
doc_synced = False
|
||||
group_synced = False
|
||||
start = time.monotonic()
|
||||
while True:
|
||||
# We are treating both syncs as part of one larger permission sync job
|
||||
doc_last_synced = CCPairManager.get_doc_sync_task(
|
||||
cc_pair, user_performing_action
|
||||
)
|
||||
group_last_synced = CCPairManager.get_group_sync_task(
|
||||
cc_pair, user_performing_action
|
||||
)
|
||||
|
||||
if not doc_synced and doc_last_synced and doc_last_synced > after:
|
||||
print(f"doc_last_synced: {doc_last_synced}")
|
||||
last_synced = CCPairManager.get_sync_task(cc_pair, user_performing_action)
|
||||
if last_synced and last_synced > after:
|
||||
print(f"last_synced: {last_synced}")
|
||||
print(f"sync command start time: {after}")
|
||||
print(f"permission sync complete: cc_pair={cc_pair.id}")
|
||||
doc_synced = True
|
||||
|
||||
if not group_synced and group_last_synced and group_last_synced > after:
|
||||
print(f"group_last_synced: {group_last_synced}")
|
||||
print(f"sync command start time: {after}")
|
||||
print(f"group sync complete: cc_pair={cc_pair.id}")
|
||||
group_synced = True
|
||||
|
||||
if doc_synced and (group_synced or not should_wait_for_group_sync):
|
||||
break
|
||||
|
||||
elapsed = time.monotonic() - start
|
||||
@@ -577,9 +524,6 @@ class CCPairManager:
|
||||
# this shouldnt be necessary but something is off with the timing for the sync jobs
|
||||
time.sleep(5)
|
||||
|
||||
if not should_wait_for_vespa_sync:
|
||||
return
|
||||
|
||||
print("waiting for vespa sync")
|
||||
# wait for the vespa sync to complete once the permission sync is complete
|
||||
start = time.monotonic()
|
||||
|
||||
@@ -18,7 +18,6 @@ from onyx.db.engine import get_session_with_tenant
|
||||
from onyx.db.engine import SYNC_DB_API
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.db.swap_index import check_index_swap
|
||||
from onyx.document_index.document_index_utils import get_multipass_config
|
||||
from onyx.document_index.vespa.index import DOCUMENT_ID_ENDPOINT
|
||||
from onyx.document_index.vespa.index import VespaIndex
|
||||
from onyx.indexing.models import IndexingSetting
|
||||
@@ -174,16 +173,10 @@ def reset_vespa() -> None:
|
||||
check_index_swap(db_session)
|
||||
|
||||
search_settings = get_current_search_settings(db_session)
|
||||
multipass_config = get_multipass_config(search_settings)
|
||||
index_name = search_settings.index_name
|
||||
|
||||
success = setup_vespa(
|
||||
document_index=VespaIndex(
|
||||
index_name=index_name,
|
||||
secondary_index_name=None,
|
||||
large_chunks_enabled=multipass_config.enable_large_chunks,
|
||||
secondary_large_chunks_enabled=None,
|
||||
),
|
||||
document_index=VespaIndex(index_name=index_name, secondary_index_name=None),
|
||||
index_setting=IndexingSetting.from_db_model(search_settings),
|
||||
secondary_index_setting=None,
|
||||
)
|
||||
@@ -257,16 +250,10 @@ def reset_vespa_multitenant() -> None:
|
||||
check_index_swap(db_session)
|
||||
|
||||
search_settings = get_current_search_settings(db_session)
|
||||
multipass_config = get_multipass_config(search_settings)
|
||||
index_name = search_settings.index_name
|
||||
|
||||
success = setup_vespa(
|
||||
document_index=VespaIndex(
|
||||
index_name=index_name,
|
||||
secondary_index_name=None,
|
||||
large_chunks_enabled=multipass_config.enable_large_chunks,
|
||||
secondary_large_chunks_enabled=None,
|
||||
),
|
||||
document_index=VespaIndex(index_name=index_name, secondary_index_name=None),
|
||||
index_setting=IndexingSetting.from_db_model(search_settings),
|
||||
secondary_index_setting=None,
|
||||
)
|
||||
|
||||
@@ -1,186 +0,0 @@
|
||||
from typing import Any
|
||||
from uuid import uuid4
|
||||
|
||||
from google.oauth2.service_account import Credentials
|
||||
|
||||
from onyx.connectors.google_utils.resources import get_drive_service
|
||||
from onyx.connectors.google_utils.resources import get_google_docs_service
|
||||
from onyx.connectors.google_utils.resources import GoogleDocsService
|
||||
from onyx.connectors.google_utils.resources import GoogleDriveService
|
||||
|
||||
|
||||
GOOGLE_SCOPES = {
|
||||
"google_drive": [
|
||||
"https://www.googleapis.com/auth/drive",
|
||||
"https://www.googleapis.com/auth/admin.directory.group",
|
||||
"https://www.googleapis.com/auth/admin.directory.user",
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
def _create_doc_service(drive_service: GoogleDriveService) -> GoogleDocsService:
|
||||
docs_service = get_google_docs_service(
|
||||
creds=drive_service._http.credentials,
|
||||
user_email=drive_service._http.credentials._subject,
|
||||
)
|
||||
return docs_service
|
||||
|
||||
|
||||
class GoogleDriveManager:
|
||||
@staticmethod
|
||||
def create_impersonated_drive_service(
|
||||
service_account_key: dict, impersonated_user_email: str
|
||||
) -> GoogleDriveService:
|
||||
"""Gets a drive service that impersonates a specific user"""
|
||||
credentials = Credentials.from_service_account_info(
|
||||
service_account_key,
|
||||
scopes=GOOGLE_SCOPES["google_drive"],
|
||||
subject=impersonated_user_email,
|
||||
)
|
||||
|
||||
service = get_drive_service(credentials, impersonated_user_email)
|
||||
|
||||
# Verify impersonation
|
||||
about = service.about().get(fields="user").execute()
|
||||
if about.get("user", {}).get("emailAddress") != impersonated_user_email:
|
||||
raise ValueError(
|
||||
f"Failed to impersonate {impersonated_user_email}. "
|
||||
f"Instead got {about.get('user', {}).get('emailAddress')}"
|
||||
)
|
||||
return service
|
||||
|
||||
@staticmethod
|
||||
def create_shared_drive(
|
||||
drive_service: GoogleDriveService, admin_email: str, test_id: str
|
||||
) -> str:
|
||||
"""
|
||||
Creates a shared drive and returns the drive's ID
|
||||
"""
|
||||
try:
|
||||
about = drive_service.about().get(fields="user").execute()
|
||||
creating_user = about["user"]["emailAddress"]
|
||||
|
||||
# Verify we're still impersonating the admin
|
||||
if creating_user != admin_email:
|
||||
raise ValueError(
|
||||
f"Expected to create drive as {admin_email}, but instead created drive as {creating_user}"
|
||||
)
|
||||
|
||||
drive_metadata = {"name": f"perm_sync_drive_{test_id}"}
|
||||
|
||||
request_id = str(uuid4())
|
||||
drive = (
|
||||
drive_service.drives()
|
||||
.create(
|
||||
body=drive_metadata,
|
||||
requestId=request_id,
|
||||
fields="id,name,capabilities",
|
||||
)
|
||||
.execute()
|
||||
)
|
||||
|
||||
return drive["id"]
|
||||
except Exception as e:
|
||||
print(f"Error creating shared drive: {str(e)}")
|
||||
raise
|
||||
|
||||
@staticmethod
|
||||
def create_empty_doc(
|
||||
drive_service: Any,
|
||||
drive_id: str,
|
||||
) -> str:
|
||||
"""
|
||||
Creates an empty document in the given drive and returns the document's ID
|
||||
"""
|
||||
file_metadata = {
|
||||
"name": f"perm_sync_doc_{drive_id}_{str(uuid4())}",
|
||||
"mimeType": "application/vnd.google-apps.document",
|
||||
"parents": [drive_id],
|
||||
}
|
||||
file = (
|
||||
drive_service.files()
|
||||
.create(body=file_metadata, supportsAllDrives=True)
|
||||
.execute()
|
||||
)
|
||||
|
||||
return file["id"]
|
||||
|
||||
@staticmethod
|
||||
def append_text_to_doc(
|
||||
drive_service: GoogleDriveService, doc_id: str, text: str
|
||||
) -> None:
|
||||
docs_service = _create_doc_service(drive_service)
|
||||
|
||||
docs_service.documents().batchUpdate(
|
||||
documentId=doc_id,
|
||||
body={
|
||||
"requests": [{"insertText": {"location": {"index": 1}, "text": text}}]
|
||||
},
|
||||
).execute()
|
||||
|
||||
@staticmethod
|
||||
def update_file_permissions(
|
||||
drive_service: Any, file_id: str, email: str, role: str = "reader"
|
||||
) -> None:
|
||||
permission = {"type": "user", "role": role, "emailAddress": email}
|
||||
drive_service.permissions().create(
|
||||
fileId=file_id,
|
||||
body=permission,
|
||||
supportsAllDrives=True,
|
||||
sendNotificationEmail=False,
|
||||
).execute()
|
||||
|
||||
@staticmethod
|
||||
def remove_file_permissions(drive_service: Any, file_id: str, email: str) -> None:
|
||||
permissions = (
|
||||
drive_service.permissions()
|
||||
.list(fileId=file_id, supportsAllDrives=True)
|
||||
.execute()
|
||||
)
|
||||
# TODO: This is a hacky way to remove permissions: it deletes the first permission with the reader role.
# We still need a way to map a user's email to a permission id. permissions.get returns a permission id,
# but its email field comes back as None, likely because the permission belongs to a group or to
# domain-wide delegation. (One possible workaround is sketched after this file.)
|
||||
for permission in permissions.get("permissions", []):
|
||||
if permission.get("role") == "reader":
|
||||
drive_service.permissions().delete(
|
||||
fileId=file_id,
|
||||
permissionId=permission["id"],
|
||||
supportsAllDrives=True,
|
||||
).execute()
|
||||
break
|
||||
|
||||
@staticmethod
|
||||
def make_file_public(drive_service: Any, file_id: str) -> None:
|
||||
permission = {"type": "anyone", "role": "reader"}
|
||||
drive_service.permissions().create(
|
||||
fileId=file_id, body=permission, supportsAllDrives=True
|
||||
).execute()
|
||||
|
||||
@staticmethod
|
||||
def cleanup_drive(drive_service: Any, drive_id: str) -> None:
|
||||
try:
|
||||
# Delete up to 2 files that match our pattern
|
||||
file_name_prefix = f"perm_sync_doc_{drive_id}"
|
||||
files = (
|
||||
drive_service.files()
|
||||
.list(
|
||||
q=f"name contains '{file_name_prefix}'",
|
||||
driveId=drive_id,
|
||||
includeItemsFromAllDrives=True,
|
||||
supportsAllDrives=True,
|
||||
corpora="drive",
|
||||
fields="files(id)",
|
||||
)
|
||||
.execute()
|
||||
)
|
||||
|
||||
for file in files.get("files", []):
|
||||
drive_service.files().delete(
|
||||
fileId=file["id"], supportsAllDrives=True
|
||||
).execute()
|
||||
|
||||
# Then delete the drive
|
||||
drive_service.drives().delete(driveId=drive_id).execute()
|
||||
except Exception as e:
|
||||
print(f"Error cleaning up drive {drive_id}: {e}")
|
||||
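
Note on the TODO above: one way to map an email address to a Drive permission id is to ask permissions.list for the emailAddress field explicitly, since it is omitted from the default response. A minimal sketch, assuming the same impersonated drive service as above and that the grant is a direct user permission rather than a group or domain rule:

from typing import Any


def find_permission_id_for_email(drive_service: Any, file_id: str, email: str) -> str | None:
    """Return the permission id for `email` on `file_id`, or None if not found."""
    response = (
        drive_service.permissions()
        .list(
            fileId=file_id,
            supportsAllDrives=True,
            # emailAddress is only returned when requested via `fields`
            fields="permissions(id,emailAddress,role)",
        )
        .execute()
    )
    # Ignores pagination for brevity; fine for the handful of permissions used in these tests.
    for permission in response.get("permissions", []):
        if permission.get("emailAddress", "").lower() == email.lower():
            return permission["id"]
    return None
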
@@ -1,332 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
from collections.abc import Generator
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from uuid import uuid4
|
||||
|
||||
import pytest
|
||||
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.google_utils.resources import GoogleDriveService
|
||||
from onyx.connectors.google_utils.shared_constants import (
|
||||
DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY,
|
||||
)
|
||||
from onyx.connectors.google_utils.shared_constants import (
|
||||
DB_CREDENTIALS_PRIMARY_ADMIN_KEY,
|
||||
)
|
||||
from onyx.connectors.models import InputType
|
||||
from onyx.db.enums import AccessType
|
||||
from tests.integration.common_utils.managers.cc_pair import CCPairManager
|
||||
from tests.integration.common_utils.managers.connector import ConnectorManager
|
||||
from tests.integration.common_utils.managers.credential import CredentialManager
|
||||
from tests.integration.common_utils.managers.document_search import (
|
||||
DocumentSearchManager,
|
||||
)
|
||||
from tests.integration.common_utils.managers.llm_provider import LLMProviderManager
|
||||
from tests.integration.common_utils.managers.user import UserManager
|
||||
from tests.integration.common_utils.test_models import DATestCCPair
|
||||
from tests.integration.common_utils.test_models import DATestConnector
|
||||
from tests.integration.common_utils.test_models import DATestCredential
|
||||
from tests.integration.common_utils.test_models import DATestUser
|
||||
from tests.integration.common_utils.vespa import vespa_fixture
|
||||
from tests.integration.connector_job_tests.google.google_drive_api_utils import (
|
||||
GoogleDriveManager,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def google_drive_test_env_setup() -> (
|
||||
Generator[
|
||||
tuple[
|
||||
GoogleDriveService, str, DATestCCPair, DATestUser, DATestUser, DATestUser
|
||||
],
|
||||
None,
|
||||
None,
|
||||
]
|
||||
):
|
||||
# Creating an admin user (first user created is automatically an admin)
|
||||
admin_user: DATestUser = UserManager.create(email="admin@onyx-test.com")
|
||||
# Creating a non-admin user
|
||||
test_user_1: DATestUser = UserManager.create(email="test_user_1@onyx-test.com")
|
||||
# Creating a non-admin user
|
||||
test_user_2: DATestUser = UserManager.create(email="test_user_2@onyx-test.com")
|
||||
|
||||
service_account_key = os.environ["FULL_CONTROL_DRIVE_SERVICE_ACCOUNT"]
|
||||
drive_id: str | None = None
|
||||
|
||||
try:
|
||||
credentials = {
|
||||
DB_CREDENTIALS_PRIMARY_ADMIN_KEY: admin_user.email,
|
||||
DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY: service_account_key,
|
||||
}
|
||||
|
||||
# Setup Google Drive
|
||||
drive_service = GoogleDriveManager.create_impersonated_drive_service(
|
||||
json.loads(service_account_key), admin_user.email
|
||||
)
|
||||
test_id = str(uuid4())
|
||||
drive_id = GoogleDriveManager.create_shared_drive(
|
||||
drive_service, admin_user.email, test_id
|
||||
)
|
||||
|
||||
# Setup Onyx infrastructure
|
||||
LLMProviderManager.create(user_performing_action=admin_user)
|
||||
|
||||
before = datetime.now(timezone.utc)
|
||||
credential: DATestCredential = CredentialManager.create(
|
||||
source=DocumentSource.GOOGLE_DRIVE,
|
||||
credential_json=credentials,
|
||||
user_performing_action=admin_user,
|
||||
)
|
||||
connector: DATestConnector = ConnectorManager.create(
|
||||
name="Google Drive Test",
|
||||
input_type=InputType.POLL,
|
||||
source=DocumentSource.GOOGLE_DRIVE,
|
||||
connector_specific_config={
|
||||
"shared_drive_urls": f"https://drive.google.com/drive/folders/{drive_id}"
|
||||
},
|
||||
access_type=AccessType.SYNC,
|
||||
user_performing_action=admin_user,
|
||||
)
|
||||
cc_pair: DATestCCPair = CCPairManager.create(
|
||||
credential_id=credential.id,
|
||||
connector_id=connector.id,
|
||||
access_type=AccessType.SYNC,
|
||||
user_performing_action=admin_user,
|
||||
)
|
||||
CCPairManager.wait_for_indexing_completion(
|
||||
cc_pair=cc_pair, after=before, user_performing_action=admin_user
|
||||
)
|
||||
|
||||
yield drive_service, drive_id, cc_pair, admin_user, test_user_1, test_user_2
|
||||
|
||||
except json.JSONDecodeError:
|
||||
pytest.skip("FULL_CONTROL_DRIVE_SERVICE_ACCOUNT is not valid JSON")
|
||||
finally:
|
||||
# Cleanup drive and file
|
||||
if drive_id is not None:
|
||||
GoogleDriveManager.cleanup_drive(drive_service, drive_id)
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="Needs to be tested for flakiness")
|
||||
def test_google_permission_sync(
|
||||
reset: None,
|
||||
vespa_client: vespa_fixture,
|
||||
google_drive_test_env_setup: tuple[
|
||||
GoogleDriveService, str, DATestCCPair, DATestUser, DATestUser, DATestUser
|
||||
],
|
||||
) -> None:
|
||||
(
|
||||
drive_service,
|
||||
drive_id,
|
||||
cc_pair,
|
||||
admin_user,
|
||||
test_user_1,
|
||||
test_user_2,
|
||||
) = google_drive_test_env_setup
|
||||
|
||||
# ----------------------BASELINE TEST----------------------
|
||||
before = datetime.now(timezone.utc)
|
||||
|
||||
# Create empty test doc in drive
|
||||
doc_id_1 = GoogleDriveManager.create_empty_doc(drive_service, drive_id)
|
||||
|
||||
# Append text to doc
|
||||
doc_text_1 = "The secret number is 12345"
|
||||
GoogleDriveManager.append_text_to_doc(drive_service, doc_id_1, doc_text_1)
|
||||
|
||||
# run indexing
|
||||
CCPairManager.run_once(cc_pair, admin_user)
|
||||
CCPairManager.wait_for_indexing_completion(
|
||||
cc_pair=cc_pair, after=before, user_performing_action=admin_user
|
||||
)
|
||||
|
||||
# run permission sync
|
||||
CCPairManager.sync(
|
||||
cc_pair=cc_pair,
|
||||
user_performing_action=admin_user,
|
||||
)
|
||||
CCPairManager.wait_for_sync(
|
||||
cc_pair=cc_pair,
|
||||
after=before,
|
||||
number_of_updated_docs=1,
|
||||
user_performing_action=admin_user,
|
||||
)
|
||||
|
||||
# Verify admin has access to document
|
||||
admin_results = DocumentSearchManager.search_documents(
|
||||
query="secret number", user_performing_action=admin_user
|
||||
)
|
||||
assert doc_text_1 in [result.strip("\ufeff") for result in admin_results]
|
||||
|
||||
# Verify test_user_1 cannot access document
|
||||
user1_results = DocumentSearchManager.search_documents(
|
||||
query="secret number", user_performing_action=test_user_1
|
||||
)
|
||||
assert doc_text_1 not in [result.strip("\ufeff") for result in user1_results]
|
||||
|
||||
# ----------------------GRANT USER 1 DOC PERMISSIONS TEST--------------------------
|
||||
before = datetime.now(timezone.utc)
|
||||
|
||||
# Grant user 1 access to document 1
|
||||
GoogleDriveManager.update_file_permissions(
|
||||
drive_service=drive_service,
|
||||
file_id=doc_id_1,
|
||||
email=test_user_1.email,
|
||||
role="reader",
|
||||
)
|
||||
|
||||
# Create a second doc in the drive which user 1 should not have access to
|
||||
doc_id_2 = GoogleDriveManager.create_empty_doc(drive_service, drive_id)
|
||||
doc_text_2 = "The secret number is 67890"
|
||||
GoogleDriveManager.append_text_to_doc(drive_service, doc_id_2, doc_text_2)
|
||||
|
||||
# Run indexing
|
||||
CCPairManager.run_once(cc_pair, admin_user)
|
||||
CCPairManager.wait_for_indexing_completion(
|
||||
cc_pair=cc_pair,
|
||||
after=before,
|
||||
user_performing_action=admin_user,
|
||||
)
|
||||
|
||||
# Run permission sync
|
||||
CCPairManager.sync(
|
||||
cc_pair=cc_pair,
|
||||
user_performing_action=admin_user,
|
||||
)
|
||||
CCPairManager.wait_for_sync(
|
||||
cc_pair=cc_pair,
|
||||
after=before,
|
||||
number_of_updated_docs=1,
|
||||
user_performing_action=admin_user,
|
||||
)
|
||||
|
||||
# Verify admin can access both documents
|
||||
admin_results = DocumentSearchManager.search_documents(
|
||||
query="secret number", user_performing_action=admin_user
|
||||
)
|
||||
assert {doc_text_1, doc_text_2} == {
|
||||
result.strip("\ufeff") for result in admin_results
|
||||
}
|
||||
|
||||
# Verify user 1 can access document 1
|
||||
user1_results = DocumentSearchManager.search_documents(
|
||||
query="secret number", user_performing_action=test_user_1
|
||||
)
|
||||
assert doc_text_1 in [result.strip("\ufeff") for result in user1_results]
|
||||
|
||||
# Verify user 1 cannot access document 2
|
||||
user1_results_2 = DocumentSearchManager.search_documents(
|
||||
query="secret number", user_performing_action=test_user_1
|
||||
)
|
||||
assert doc_text_2 not in [result.strip("\ufeff") for result in user1_results_2]
|
||||
|
||||
# ----------------------REMOVE USER 1 DOC PERMISSIONS TEST--------------------------
|
||||
before = datetime.now(timezone.utc)
|
||||
|
||||
# Remove user 1 access to document 1
|
||||
GoogleDriveManager.remove_file_permissions(
|
||||
drive_service=drive_service, file_id=doc_id_1, email=test_user_1.email
|
||||
)
|
||||
# Run permission sync
|
||||
CCPairManager.sync(
|
||||
cc_pair=cc_pair,
|
||||
user_performing_action=admin_user,
|
||||
)
|
||||
CCPairManager.wait_for_sync(
|
||||
cc_pair=cc_pair,
|
||||
after=before,
|
||||
number_of_updated_docs=1,
|
||||
user_performing_action=admin_user,
|
||||
)
|
||||
|
||||
# Verify admin can access both documents
|
||||
admin_results = DocumentSearchManager.search_documents(
|
||||
query="secret number", user_performing_action=admin_user
|
||||
)
|
||||
assert {doc_text_1, doc_text_2} == {
|
||||
result.strip("\ufeff") for result in admin_results
|
||||
}
|
||||
|
||||
# Verify user 1 cannot access either document
|
||||
user1_results = DocumentSearchManager.search_documents(
|
||||
query="secret numbers", user_performing_action=test_user_1
|
||||
)
|
||||
assert {result.strip("\ufeff") for result in user1_results} == set()
|
||||
|
||||
# ----------------------GRANT USER 1 DRIVE PERMISSIONS TEST--------------------------
|
||||
before = datetime.now(timezone.utc)
|
||||
|
||||
# Grant user 1 access to drive
|
||||
GoogleDriveManager.update_file_permissions(
|
||||
drive_service=drive_service,
|
||||
file_id=drive_id,
|
||||
email=test_user_1.email,
|
||||
role="reader",
|
||||
)
|
||||
|
||||
# Run permission sync
|
||||
CCPairManager.sync(
|
||||
cc_pair=cc_pair,
|
||||
user_performing_action=admin_user,
|
||||
)
|
||||
|
||||
CCPairManager.wait_for_sync(
|
||||
cc_pair=cc_pair,
|
||||
after=before,
|
||||
number_of_updated_docs=2,
|
||||
user_performing_action=admin_user,
|
||||
# If we are only updating the group definition for this test, we use this variable,
# since that doesn't result in a Vespa sync, so we don't want to wait for one.
|
||||
should_wait_for_vespa_sync=False,
|
||||
)
|
||||
|
||||
# Verify user 1 can access both documents
|
||||
user1_results = DocumentSearchManager.search_documents(
|
||||
query="secret numbers", user_performing_action=test_user_1
|
||||
)
|
||||
assert {doc_text_1, doc_text_2} == {
|
||||
result.strip("\ufeff") for result in user1_results
|
||||
}
|
||||
|
||||
# ----------------------MAKE DRIVE PUBLIC TEST--------------------------
|
||||
before = datetime.now(timezone.utc)
|
||||
|
||||
# Unable to make drive itself public as Google's security policies prevent this, so we make the documents public instead
|
||||
GoogleDriveManager.make_file_public(drive_service, doc_id_1)
|
||||
GoogleDriveManager.make_file_public(drive_service, doc_id_2)
|
||||
|
||||
# Run permission sync
|
||||
CCPairManager.sync(
|
||||
cc_pair=cc_pair,
|
||||
user_performing_action=admin_user,
|
||||
)
|
||||
CCPairManager.wait_for_sync(
|
||||
cc_pair=cc_pair,
|
||||
after=before,
|
||||
number_of_updated_docs=2,
|
||||
user_performing_action=admin_user,
|
||||
)
|
||||
|
||||
# Verify all users can access both documents
|
||||
admin_results = DocumentSearchManager.search_documents(
|
||||
query="secret number", user_performing_action=admin_user
|
||||
)
|
||||
assert {doc_text_1, doc_text_2} == {
|
||||
result.strip("\ufeff") for result in admin_results
|
||||
}
|
||||
|
||||
user1_results = DocumentSearchManager.search_documents(
|
||||
query="secret number", user_performing_action=test_user_1
|
||||
)
|
||||
assert {doc_text_1, doc_text_2} == {
|
||||
result.strip("\ufeff") for result in user1_results
|
||||
}
|
||||
|
||||
user2_results = DocumentSearchManager.search_documents(
|
||||
query="secret number", user_performing_action=test_user_2
|
||||
)
|
||||
assert {doc_text_1, doc_text_2} == {
|
||||
result.strip("\ufeff") for result in user2_results
|
||||
}
|
||||
@@ -1,20 +0,0 @@
from onyx.document_index.vespa.shared_utils.utils import remove_invalid_unicode_chars


def test_remove_invalid_unicode_chars() -> None:
    """Test that invalid Unicode characters are properly removed."""
    # Test removal of illegal XML character 0xFDDB
    text_with_illegal_char = "Valid text \uFDDB more text"
    sanitized = remove_invalid_unicode_chars(text_with_illegal_char)
    assert "\uFDDB" not in sanitized
    assert sanitized == "Valid text more text"

    # Test that valid characters are preserved
    valid_text = "Hello, world! 你好世界"
    assert remove_invalid_unicode_chars(valid_text) == valid_text

    # Test multiple invalid characters including 0xFDDB
    text_with_multiple_illegal = "\x00Hello\uFDDB World\uFFFE!"
    sanitized = remove_invalid_unicode_chars(text_with_multiple_illegal)
    assert all(c not in sanitized for c in ["\x00", "\uFDDB", "\uFFFE"])
    assert sanitized == "Hello World!"
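
For context, a minimal sanitizer that would satisfy the assertions in this (deleted) test — not necessarily the project's actual remove_invalid_unicode_chars implementation — could drop the XML-illegal characters and then collapse the double spaces that removal leaves behind:

import re

# C0 control characters (except tab, newline, carriage return) plus the
# noncharacters U+FDD0-U+FDEF, U+FFFE and U+FFFF are rejected by Vespa/XML.
_ILLEGAL_CHARS = re.compile("[\x00-\x08\x0b\x0c\x0e-\x1f\ufdd0-\ufdef\ufffe\uffff]")


def strip_illegal_unicode(text: str) -> str:
    cleaned = _ILLEGAL_CHARS.sub("", text)
    # Removing a character that sat between two spaces leaves a double space; collapse it.
    return re.sub(r" {2,}", " ", cleaned)
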
@@ -6,7 +6,7 @@ import pytest
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.db.engine import get_sqlalchemy_engine
|
||||
from onyx.document_index.document_index_utils import get_both_index_properties
|
||||
from onyx.document_index.document_index_utils import get_both_index_names
|
||||
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ def test_vespa_update() -> None:
|
||||
doc_id = "test-vespa-update"
|
||||
|
||||
with Session(get_sqlalchemy_engine()) as db_session:
|
||||
primary_index_name, _, _, _ = get_both_index_properties(db_session)
|
||||
primary_index_name, _ = get_both_index_names(db_session)
|
||||
endpoint = (
|
||||
f"{DOCUMENT_ID_ENDPOINT.format(index_name=primary_index_name)}/{doc_id}"
|
||||
)
|
||||
|
||||
75
deployment/cloud_kubernetes/hpa/workers_hpa.yaml
Normal file
@@ -0,0 +1,75 @@
|
||||
apiVersion: autoscaling/v2
|
||||
kind: HorizontalPodAutoscaler
|
||||
metadata:
|
||||
name: celery-worker-heavy-hpa
|
||||
spec:
|
||||
scaleTargetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: celery-worker-heavy
|
||||
minReplicas: 1
|
||||
maxReplicas: 5
|
||||
metrics:
|
||||
- type: Resource
|
||||
resource:
|
||||
name: cpu
|
||||
target:
|
||||
type: Utilization
|
||||
averageUtilization: 60
|
||||
---
|
||||
apiVersion: autoscaling/v2
|
||||
kind: HorizontalPodAutoscaler
|
||||
metadata:
|
||||
name: celery-worker-light-hpa
|
||||
spec:
|
||||
scaleTargetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: celery-worker-light
|
||||
minReplicas: 1
|
||||
maxReplicas: 10
|
||||
metrics:
|
||||
- type: Resource
|
||||
resource:
|
||||
name: cpu
|
||||
target:
|
||||
type: Utilization
|
||||
averageUtilization: 70
|
||||
---
|
||||
apiVersion: autoscaling/v2
|
||||
kind: HorizontalPodAutoscaler
|
||||
metadata:
|
||||
name: celery-worker-indexing-hpa
|
||||
spec:
|
||||
scaleTargetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: celery-worker-indexing
|
||||
minReplicas: 1
|
||||
maxReplicas: 10
|
||||
metrics:
|
||||
- type: Resource
|
||||
resource:
|
||||
name: cpu
|
||||
target:
|
||||
type: Utilization
|
||||
averageUtilization: 70
|
||||
---
|
||||
apiVersion: autoscaling/v2
|
||||
kind: HorizontalPodAutoscaler
|
||||
metadata:
|
||||
name: celery-worker-monitoring-hpa
|
||||
spec:
|
||||
scaleTargetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: celery-worker-indexing
|
||||
minReplicas: 1
|
||||
maxReplicas: 4
|
||||
metrics:
|
||||
- type: Resource
|
||||
resource:
|
||||
name: cpu
|
||||
target:
|
||||
type: Utilization
|
||||
averageUtilization: 70
|
||||
13
deployment/cloud_kubernetes/keda/celery-worker-auth.yaml
Normal file
@@ -0,0 +1,13 @@
apiVersion: keda.sh/v1alpha1
kind: TriggerAuthentication
metadata:
  name: celery-worker-auth
  namespace: onyx
spec:
  secretTargetRef:
    - parameter: host
      name: keda-redis-secret
      key: host
    - parameter: password
      name: keda-redis-secret
      key: password
@@ -0,0 +1,53 @@
|
||||
apiVersion: keda.sh/v1alpha1
|
||||
kind: ScaledObject
|
||||
metadata:
|
||||
name: celery-worker-indexing-scaledobject
|
||||
namespace: onyx
|
||||
labels:
|
||||
app: celery-worker-indexing
|
||||
spec:
|
||||
scaleTargetRef:
|
||||
name: celery-worker-indexing
|
||||
minReplicaCount: 1
|
||||
maxReplicaCount: 30
|
||||
triggers:
|
||||
- type: redis
|
||||
metadata:
|
||||
sslEnabled: "true"
|
||||
port: "6379"
|
||||
enableTLS: "true"
|
||||
listName: connector_indexing
|
||||
listLength: "1"
|
||||
databaseIndex: "15"
|
||||
authenticationRef:
|
||||
name: celery-worker-auth
|
||||
|
||||
- type: redis
|
||||
metadata:
|
||||
sslEnabled: "true"
|
||||
port: "6379"
|
||||
enableTLS: "true"
|
||||
listName: connector_indexing:2
|
||||
listLength: "1"
|
||||
databaseIndex: "15"
|
||||
authenticationRef:
|
||||
name: celery-worker-auth
|
||||
- type: redis
|
||||
metadata:
|
||||
sslEnabled: "true"
|
||||
port: "6379"
|
||||
enableTLS: "true"
|
||||
listName: connector_indexing:3
|
||||
listLength: "1"
|
||||
databaseIndex: "15"
|
||||
authenticationRef:
|
||||
name: celery-worker-auth
|
||||
- type: cpu
|
||||
metadata:
|
||||
type: Utilization
|
||||
value: "70"
|
||||
|
||||
- type: memory
|
||||
metadata:
|
||||
type: Utilization
|
||||
value: "70"
|
||||
@@ -0,0 +1,58 @@
|
||||
apiVersion: keda.sh/v1alpha1
|
||||
kind: ScaledObject
|
||||
metadata:
|
||||
name: celery-worker-light-scaledobject
|
||||
namespace: onyx
|
||||
labels:
|
||||
app: celery-worker-light
|
||||
spec:
|
||||
scaleTargetRef:
|
||||
name: celery-worker-light
|
||||
minReplicaCount: 5
|
||||
maxReplicaCount: 20
|
||||
triggers:
|
||||
- type: redis
|
||||
metadata:
|
||||
port: "6379"
|
||||
enableTLS: "true"
|
||||
listName: vespa_metadata_sync
|
||||
listLength: "1"
|
||||
databaseIndex: "15"
|
||||
authenticationRef:
|
||||
name: celery-worker-auth
|
||||
- type: redis
|
||||
metadata:
|
||||
port: "6379"
|
||||
enableTLS: "true"
|
||||
listName: vespa_metadata_sync:2
|
||||
listLength: "1"
|
||||
databaseIndex: "15"
|
||||
authenticationRef:
|
||||
name: celery-worker-auth
|
||||
- type: redis
|
||||
metadata:
|
||||
port: "6379"
|
||||
enableTLS: "true"
|
||||
listName: vespa_metadata_sync:3
|
||||
listLength: "1"
|
||||
databaseIndex: "15"
|
||||
authenticationRef:
|
||||
name: celery-worker-auth
|
||||
- type: redis
|
||||
metadata:
|
||||
port: "6379"
|
||||
enableTLS: "true"
|
||||
listName: connector_deletion
|
||||
listLength: "1"
|
||||
databaseIndex: "15"
|
||||
authenticationRef:
|
||||
name: celery-worker-auth
|
||||
- type: redis
|
||||
metadata:
|
||||
port: "6379"
|
||||
enableTLS: "true"
|
||||
listName: connector_deletion:2
|
||||
listLength: "1"
|
||||
databaseIndex: "15"
|
||||
authenticationRef:
|
||||
name: celery-worker-auth
|
||||
@@ -0,0 +1,70 @@
|
||||
apiVersion: keda.sh/v1alpha1
|
||||
kind: ScaledObject
|
||||
metadata:
|
||||
name: celery-worker-primary-scaledobject
|
||||
namespace: onyx
|
||||
labels:
|
||||
app: celery-worker-primary
|
||||
spec:
|
||||
scaleTargetRef:
|
||||
name: celery-worker-primary
|
||||
pollingInterval: 15 # Check every 15 seconds
|
||||
cooldownPeriod: 30 # Wait 30 seconds before scaling down
|
||||
minReplicaCount: 4
|
||||
maxReplicaCount: 4
|
||||
triggers:
|
||||
- type: redis
|
||||
metadata:
|
||||
port: "6379"
|
||||
enableTLS: "true"
|
||||
listName: celery
|
||||
listLength: "1"
|
||||
databaseIndex: "15"
|
||||
authenticationRef:
|
||||
name: celery-worker-auth
|
||||
|
||||
- type: redis
|
||||
metadata:
|
||||
port: "6379"
|
||||
enableTLS: "true"
|
||||
listName: celery:1
|
||||
listLength: "1"
|
||||
databaseIndex: "15"
|
||||
authenticationRef:
|
||||
name: celery-worker-auth
|
||||
- type: redis
|
||||
metadata:
|
||||
port: "6379"
|
||||
enableTLS: "true"
|
||||
listName: celery:2
|
||||
listLength: "1"
|
||||
databaseIndex: "15"
|
||||
authenticationRef:
|
||||
name: celery-worker-auth
|
||||
- type: redis
|
||||
metadata:
|
||||
port: "6379"
|
||||
enableTLS: "true"
|
||||
listName: celery:3
|
||||
listLength: "1"
|
||||
databaseIndex: "15"
|
||||
authenticationRef:
|
||||
name: celery-worker-auth
|
||||
- type: redis
|
||||
metadata:
|
||||
port: "6379"
|
||||
enableTLS: "true"
|
||||
listName: periodic_tasks
|
||||
listLength: "1"
|
||||
databaseIndex: "15"
|
||||
authenticationRef:
|
||||
name: celery-worker-auth
|
||||
- type: redis
|
||||
metadata:
|
||||
port: "6379"
|
||||
enableTLS: "true"
|
||||
listName: periodic_tasks:2
|
||||
listLength: "1"
|
||||
databaseIndex: "15"
|
||||
authenticationRef:
|
||||
name: celery-worker-auth
|
||||
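
These ScaledObjects all scale on the length of Celery's Redis lists (database 15, TLS on port 6379). A quick way to spot-check those queue depths outside of KEDA — a minimal sketch assuming the redis-py client and hypothetical REDIS_HOST / REDIS_PASSWORD environment variables:

import os

import redis  # redis-py client, assumed to be installed

r = redis.Redis(
    host=os.environ["REDIS_HOST"],          # hypothetical env var
    password=os.environ["REDIS_PASSWORD"],  # hypothetical env var
    port=6379,
    db=15,
    ssl=True,
)

for queue in ("celery", "periodic_tasks", "connector_indexing", "vespa_metadata_sync", "connector_deletion"):
    print(f"{queue}: {r.llen(queue)} pending task(s)")
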
@@ -0,0 +1,19 @@
|
||||
apiVersion: keda.sh/v1alpha1
|
||||
kind: ScaledObject
|
||||
metadata:
|
||||
name: indexing-model-server-scaledobject
|
||||
namespace: onyx
|
||||
labels:
|
||||
app: indexing-model-server
|
||||
spec:
|
||||
scaleTargetRef:
|
||||
name: indexing-model-server-deployment
|
||||
pollingInterval: 15 # Check every 15 seconds
|
||||
cooldownPeriod: 30 # Wait 30 seconds before scaling down
|
||||
minReplicaCount: 10
|
||||
maxReplicaCount: 10
|
||||
triggers:
|
||||
- type: cpu
|
||||
metadata:
|
||||
type: Utilization
|
||||
value: "70"
|
||||
9
deployment/cloud_kubernetes/keda/keda-redis-secret.yaml
Normal file
@@ -0,0 +1,9 @@
apiVersion: v1
kind: Secret
metadata:
  name: keda-redis-secret
  namespace: onyx
type: Opaque
data:
  host: { base64 encoded host here }
  password: { base64 encoded password here }
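
The host and password values in the Secret above must be base64 encoded before they are placed in the data block. A small sketch of producing those values (the host and password strings here are placeholders, not real credentials):

import base64


def encode_secret_value(raw: str) -> str:
    """Base64-encode a string for a Kubernetes Secret `data` field."""
    return base64.b64encode(raw.encode("utf-8")).decode("ascii")


print(encode_secret_value("redis.internal.example.com"))  # placeholder host
print(encode_secret_value("example-redis-password"))      # placeholder password
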
44
deployment/cloud_kubernetes/workers/beat.yaml
Normal file
@@ -0,0 +1,44 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: celery-beat
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: celery-beat
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: celery-beat
|
||||
spec:
|
||||
containers:
|
||||
- name: celery-beat
|
||||
image: onyxdotapp/onyx-backend-cloud:v0.14.0-cloud.beta.21
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
[
|
||||
"celery",
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.beat",
|
||||
"beat",
|
||||
"--loglevel=INFO",
|
||||
]
|
||||
env:
|
||||
- name: REDIS_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: onyx-secrets
|
||||
key: redis_password
|
||||
- name: ONYX_VERSION
|
||||
value: "v0.11.0-cloud.beta.8"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: env-configmap
|
||||
resources:
|
||||
requests:
|
||||
cpu: "250m"
|
||||
memory: "512Mi"
|
||||
limits:
|
||||
cpu: "500m"
|
||||
memory: "1Gi"
|
||||
60
deployment/cloud_kubernetes/workers/heavy_worker.yaml
Normal file
@@ -0,0 +1,60 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: celery-worker-heavy
|
||||
spec:
|
||||
replicas: 2
|
||||
selector:
|
||||
matchLabels:
|
||||
app: celery-worker-heavy
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: celery-worker-heavy
|
||||
spec:
|
||||
containers:
|
||||
- name: celery-worker-heavy
|
||||
image: onyxdotapp/onyx-backend-cloud:v0.14.0-cloud.beta.21
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
[
|
||||
"celery",
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.heavy",
|
||||
"worker",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=heavy@%n",
|
||||
"-Q",
|
||||
"connector_pruning,connector_doc_permissions_sync,connector_external_group_sync",
|
||||
]
|
||||
env:
|
||||
- name: REDIS_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: onyx-secrets
|
||||
key: redis_password
|
||||
- name: ONYX_VERSION
|
||||
value: "v0.11.0-cloud.beta.8"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: env-configmap
|
||||
volumeMounts:
|
||||
- name: vespa-certificates
|
||||
mountPath: "/app/certs"
|
||||
readOnly: true
|
||||
resources:
|
||||
requests:
|
||||
cpu: "1000m"
|
||||
memory: "2Gi"
|
||||
limits:
|
||||
cpu: "2000m"
|
||||
memory: "4Gi"
|
||||
volumes:
|
||||
- name: vespa-certificates
|
||||
secret:
|
||||
secretName: vespa-certificates
|
||||
items:
|
||||
- key: cert.pem
|
||||
path: cert.pem
|
||||
- key: key.pem
|
||||
path: key.pem
|
||||
62
deployment/cloud_kubernetes/workers/indexing_worker.yaml
Normal file
@@ -0,0 +1,62 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: celery-worker-indexing
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: celery-worker-indexing
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: celery-worker-indexing
|
||||
spec:
|
||||
containers:
|
||||
- name: celery-worker-indexing
|
||||
image: onyxdotapp/onyx-backend-cloud:v0.14.0-cloud.beta.21
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
[
|
||||
"celery",
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.indexing",
|
||||
"worker",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=indexing@%n",
|
||||
"-Q",
|
||||
"connector_indexing",
|
||||
"--prefetch-multiplier=1",
|
||||
"--concurrency=10",
|
||||
]
|
||||
env:
|
||||
- name: REDIS_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: onyx-secrets
|
||||
key: redis_password
|
||||
- name: ONYX_VERSION
|
||||
value: "v0.11.0-cloud.beta.8"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: env-configmap
|
||||
volumeMounts:
|
||||
- name: vespa-certificates
|
||||
mountPath: "/app/certs"
|
||||
readOnly: true
|
||||
resources:
|
||||
requests:
|
||||
cpu: "500m"
|
||||
memory: "4Gi"
|
||||
limits:
|
||||
cpu: "1000m"
|
||||
memory: "8Gi"
|
||||
volumes:
|
||||
- name: vespa-certificates
|
||||
secret:
|
||||
secretName: vespa-certificates
|
||||
items:
|
||||
- key: cert.pem
|
||||
path: cert.pem
|
||||
- key: key.pem
|
||||
path: key.pem
|
||||
62
deployment/cloud_kubernetes/workers/light_worker.yaml
Normal file
@@ -0,0 +1,62 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: celery-worker-light
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: celery-worker-light
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: celery-worker-light
|
||||
spec:
|
||||
containers:
|
||||
- name: celery-worker-light
|
||||
image: onyxdotapp/onyx-backend-cloud:v0.14.0-cloud.beta.21
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
[
|
||||
"celery",
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.light",
|
||||
"worker",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=light@%n",
|
||||
"-Q",
|
||||
"vespa_metadata_sync,connector_deletion,doc_permissions_upsert",
|
||||
"--prefetch-multiplier=1",
|
||||
"--concurrency=10",
|
||||
]
|
||||
env:
|
||||
- name: REDIS_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: onyx-secrets
|
||||
key: redis_password
|
||||
- name: ONYX_VERSION
|
||||
value: "v0.11.0-cloud.beta.8"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: env-configmap
|
||||
volumeMounts:
|
||||
- name: vespa-certificates
|
||||
mountPath: "/app/certs"
|
||||
readOnly: true
|
||||
resources:
|
||||
requests:
|
||||
cpu: "500m"
|
||||
memory: "1Gi"
|
||||
limits:
|
||||
cpu: "1000m"
|
||||
memory: "2Gi"
|
||||
volumes:
|
||||
- name: vespa-certificates
|
||||
secret:
|
||||
secretName: vespa-certificates
|
||||
items:
|
||||
- key: cert.pem
|
||||
path: cert.pem
|
||||
- key: key.pem
|
||||
path: key.pem
|
||||
62
deployment/cloud_kubernetes/workers/monitoring.yaml
Normal file
@@ -0,0 +1,62 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: celery-worker-monitoring
|
||||
spec:
|
||||
replicas: 2
|
||||
selector:
|
||||
matchLabels:
|
||||
app: celery-worker-monitoring
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: celery-worker-monitoring
|
||||
spec:
|
||||
containers:
|
||||
- name: celery-worker-monitoring
|
||||
image: onyxdotapp/onyx-backend-cloud:v0.14.0-cloud.beta.21
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
[
|
||||
"celery",
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.monitoring",
|
||||
"worker",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=monitoring@%n",
|
||||
"-Q",
|
||||
"monitoring",
|
||||
"--prefetch-multiplier=8",
|
||||
"--concurrency=8",
|
||||
]
|
||||
env:
|
||||
- name: REDIS_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: onyx-secrets
|
||||
key: redis_password
|
||||
- name: ONYX_VERSION
|
||||
value: "v0.11.0-cloud.beta.8"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: env-configmap
|
||||
volumeMounts:
|
||||
- name: vespa-certificates
|
||||
mountPath: "/app/certs"
|
||||
readOnly: true
|
||||
resources:
|
||||
requests:
|
||||
cpu: "1000m"
|
||||
memory: "1Gi"
|
||||
limits:
|
||||
cpu: "1000m"
|
||||
memory: "1Gi"
|
||||
volumes:
|
||||
- name: vespa-certificates
|
||||
secret:
|
||||
secretName: vespa-certificates
|
||||
items:
|
||||
- key: cert.pem
|
||||
path: cert.pem
|
||||
- key: key.pem
|
||||
path: key.pem
|
||||
62
deployment/cloud_kubernetes/workers/primary.yaml
Normal file
@@ -0,0 +1,62 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: celery-worker-primary
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: celery-worker-primary
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: celery-worker-primary
|
||||
spec:
|
||||
containers:
|
||||
- name: celery-worker-primary
|
||||
image: onyxdotapp/onyx-backend-cloud:v0.14.0-cloud.beta.21
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
[
|
||||
"celery",
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.primary",
|
||||
"worker",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=primary@%n",
|
||||
"-Q",
|
||||
"celery,periodic_tasks",
|
||||
"--prefetch-multiplier=1",
|
||||
"--concurrency=10",
|
||||
]
|
||||
env:
|
||||
- name: REDIS_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: onyx-secrets
|
||||
key: redis_password
|
||||
- name: ONYX_VERSION
|
||||
value: "v0.11.0-cloud.beta.8"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: env-configmap
|
||||
volumeMounts:
|
||||
- name: vespa-certificates
|
||||
mountPath: "/app/certs"
|
||||
readOnly: true
|
||||
resources:
|
||||
requests:
|
||||
cpu: "500m"
|
||||
memory: "1Gi"
|
||||
limits:
|
||||
cpu: "1000m"
|
||||
memory: "2Gi"
|
||||
volumes:
|
||||
- name: vespa-certificates
|
||||
secret:
|
||||
secretName: vespa-certificates
|
||||
items:
|
||||
- key: cert.pem
|
||||
path: cert.pem
|
||||
- key: key.pem
|
||||
path: key.pem
|
||||
183
package-lock.json
generated
Normal file
@@ -0,0 +1,183 @@
|
||||
{
|
||||
"name": "onyx",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"dependencies": {
|
||||
"react-datepicker": "^7.6.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/react-datepicker": "^6.2.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@floating-ui/core": {
|
||||
"version": "1.6.9",
|
||||
"resolved": "https://registry.npmjs.org/@floating-ui/core/-/core-1.6.9.tgz",
|
||||
"integrity": "sha512-uMXCuQ3BItDUbAMhIXw7UPXRfAlOAvZzdK9BWpE60MCn+Svt3aLn9jsPTi/WNGlRUu2uI0v5S7JiIUsbsvh3fw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@floating-ui/utils": "^0.2.9"
|
||||
}
|
||||
},
|
||||
"node_modules/@floating-ui/dom": {
|
||||
"version": "1.6.13",
|
||||
"resolved": "https://registry.npmjs.org/@floating-ui/dom/-/dom-1.6.13.tgz",
|
||||
"integrity": "sha512-umqzocjDgNRGTuO7Q8CU32dkHkECqI8ZdMZ5Swb6QAM0t5rnlrN3lGo1hdpscRd3WS8T6DKYK4ephgIH9iRh3w==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@floating-ui/core": "^1.6.0",
|
||||
"@floating-ui/utils": "^0.2.9"
|
||||
}
|
||||
},
|
||||
"node_modules/@floating-ui/react": {
|
||||
"version": "0.27.3",
|
||||
"resolved": "https://registry.npmjs.org/@floating-ui/react/-/react-0.27.3.tgz",
|
||||
"integrity": "sha512-CLHnes3ixIFFKVQDdICjel8muhFLOBdQH7fgtHNPY8UbCNqbeKZ262G7K66lGQOUQWWnYocf7ZbUsLJgGfsLHg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@floating-ui/react-dom": "^2.1.2",
|
||||
"@floating-ui/utils": "^0.2.9",
|
||||
"tabbable": "^6.0.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"react": ">=17.0.0",
|
||||
"react-dom": ">=17.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@floating-ui/react-dom": {
|
||||
"version": "2.1.2",
|
||||
"resolved": "https://registry.npmjs.org/@floating-ui/react-dom/-/react-dom-2.1.2.tgz",
|
||||
"integrity": "sha512-06okr5cgPzMNBy+Ycse2A6udMi4bqwW/zgBF/rwjcNqWkyr82Mcg8b0vjX8OJpZFy/FKjJmw6wV7t44kK6kW7A==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@floating-ui/dom": "^1.0.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"react": ">=16.8.0",
|
||||
"react-dom": ">=16.8.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@floating-ui/utils": {
|
||||
"version": "0.2.9",
|
||||
"resolved": "https://registry.npmjs.org/@floating-ui/utils/-/utils-0.2.9.tgz",
|
||||
"integrity": "sha512-MDWhGtE+eHw5JW7lq4qhc5yRLS11ERl1c7Z6Xd0a58DozHES6EnNNwUWbMiG4J9Cgj053Bhk8zvlhFYKVhULwg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@types/react": {
|
||||
"version": "19.0.4",
|
||||
"resolved": "https://registry.npmjs.org/@types/react/-/react-19.0.4.tgz",
|
||||
"integrity": "sha512-3O4QisJDYr1uTUMZHA2YswiQZRq+Pd8D+GdVFYikTutYsTz+QZgWkAPnP7rx9txoI6EXKcPiluMqWPFV3tT9Wg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"csstype": "^3.0.2"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/react-datepicker": {
|
||||
"version": "6.2.0",
|
||||
"resolved": "https://registry.npmjs.org/@types/react-datepicker/-/react-datepicker-6.2.0.tgz",
|
||||
"integrity": "sha512-+JtO4Fm97WLkJTH8j8/v3Ldh7JCNRwjMYjRaKh4KHH0M3jJoXtwiD3JBCsdlg3tsFIw9eQSqyAPeVDN2H2oM9Q==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@floating-ui/react": "^0.26.2",
|
||||
"@types/react": "*",
|
||||
"date-fns": "^3.3.1"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/react-datepicker/node_modules/@floating-ui/react": {
|
||||
"version": "0.26.28",
|
||||
"resolved": "https://registry.npmjs.org/@floating-ui/react/-/react-0.26.28.tgz",
|
||||
"integrity": "sha512-yORQuuAtVpiRjpMhdc0wJj06b9JFjrYF4qp96j++v2NBpbi6SEGF7donUJ3TMieerQ6qVkAv1tgr7L4r5roTqw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@floating-ui/react-dom": "^2.1.2",
|
||||
"@floating-ui/utils": "^0.2.8",
|
||||
"tabbable": "^6.0.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"react": ">=16.8.0",
|
||||
"react-dom": ">=16.8.0"
|
||||
}
|
||||
},
|
||||
"node_modules/clsx": {
|
||||
"version": "2.1.1",
|
||||
"resolved": "https://registry.npmjs.org/clsx/-/clsx-2.1.1.tgz",
|
||||
"integrity": "sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=6"
|
||||
}
|
||||
},
|
||||
"node_modules/csstype": {
|
||||
"version": "3.1.3",
|
||||
"resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.3.tgz",
|
||||
"integrity": "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/date-fns": {
|
||||
"version": "3.6.0",
|
||||
"resolved": "https://registry.npmjs.org/date-fns/-/date-fns-3.6.0.tgz",
|
||||
"integrity": "sha512-fRHTG8g/Gif+kSh50gaGEdToemgfj74aRX3swtiouboip5JDLAyDE9F11nHMIcvOaXeOC6D7SpNhi7uFyB7Uww==",
|
||||
"license": "MIT",
|
||||
"funding": {
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/kossnocorp"
|
||||
}
|
||||
},
|
||||
"node_modules/react": {
|
||||
"version": "19.0.0",
|
||||
"resolved": "https://registry.npmjs.org/react/-/react-19.0.0.tgz",
|
||||
"integrity": "sha512-V8AVnmPIICiWpGfm6GLzCR/W5FXLchHop40W4nXBmdlEceh16rCN8O8LNWm5bh5XUX91fh7KpA+W0TgMKmgTpQ==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/react-datepicker": {
|
||||
"version": "7.6.0",
|
||||
"resolved": "https://registry.npmjs.org/react-datepicker/-/react-datepicker-7.6.0.tgz",
|
||||
"integrity": "sha512-9cQH6Z/qa4LrGhzdc3XoHbhrxNcMi9MKjZmYgF/1MNNaJwvdSjv3Xd+jjvrEEbKEf71ZgCA3n7fQbdwd70qCRw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@floating-ui/react": "^0.27.0",
|
||||
"clsx": "^2.1.1",
|
||||
"date-fns": "^3.6.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"react": "^16.9.0 || ^17 || ^18 || ^19 || ^19.0.0-rc",
|
||||
"react-dom": "^16.9.0 || ^17 || ^18 || ^19 || ^19.0.0-rc"
|
||||
}
|
||||
},
|
||||
"node_modules/react-dom": {
|
||||
"version": "19.0.0",
|
||||
"resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.0.0.tgz",
|
||||
"integrity": "sha512-4GV5sHFG0e/0AD4X+ySy6UJd3jVl1iNsNHdpad0qhABJ11twS3TTBnseqsKurKcsNqCEFeGL3uLpVChpIO3QfQ==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"scheduler": "^0.25.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"react": "^19.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/scheduler": {
|
||||
"version": "0.25.0",
|
||||
"resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.25.0.tgz",
|
||||
"integrity": "sha512-xFVuu11jh+xcO7JOAGJNOXld8/TcEHK/4CituBUeUb5hqxJLj9YuemAEuvm9gQ/+pgXYfbQuqAkiYu+u7YEsNA==",
|
||||
"license": "MIT",
|
||||
"peer": true
|
||||
},
|
||||
"node_modules/tabbable": {
|
||||
"version": "6.2.0",
|
||||
"resolved": "https://registry.npmjs.org/tabbable/-/tabbable-6.2.0.tgz",
|
||||
"integrity": "sha512-Cat63mxsVJlzYvN51JmVXIgNoUokrIaT2zLclCXjRd8boZ0004U4KCs/sToJ75C6sdlByWxpYnb5Boif1VSFew==",
|
||||
"license": "MIT"
|
||||
}
|
||||
}
|
||||
}
|
||||
8
package.json
Normal file
@@ -0,0 +1,8 @@
{
  "dependencies": {
    "react-datepicker": "^7.6.0"
  },
  "devDependencies": {
    "@types/react-datepicker": "^6.2.0"
  }
}
132
web/package-lock.json
generated
@@ -16,7 +16,6 @@
|
||||
"@headlessui/tailwindcss": "^0.2.1",
|
||||
"@phosphor-icons/react": "^2.0.8",
|
||||
"@radix-ui/react-checkbox": "^1.1.2",
|
||||
"@radix-ui/react-collapsible": "^1.1.2",
|
||||
"@radix-ui/react-dialog": "^1.1.2",
|
||||
"@radix-ui/react-dropdown-menu": "^2.1.4",
|
||||
"@radix-ui/react-label": "^2.1.1",
|
||||
@@ -3508,137 +3507,6 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/@radix-ui/react-collapsible": {
|
||||
"version": "1.1.2",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-collapsible/-/react-collapsible-1.1.2.tgz",
|
||||
"integrity": "sha512-PliMB63vxz7vggcyq0IxNYk8vGDrLXVWw4+W4B8YnwI1s18x7YZYqlG9PLX7XxAJUi0g2DxP4XKJMFHh/iVh9A==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@radix-ui/primitive": "1.1.1",
|
||||
"@radix-ui/react-compose-refs": "1.1.1",
|
||||
"@radix-ui/react-context": "1.1.1",
|
||||
"@radix-ui/react-id": "1.1.0",
|
||||
"@radix-ui/react-presence": "1.1.2",
|
||||
"@radix-ui/react-primitive": "2.0.1",
|
||||
"@radix-ui/react-use-controllable-state": "1.1.0",
|
||||
"@radix-ui/react-use-layout-effect": "1.1.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"@types/react-dom": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
|
||||
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
},
|
||||
"@types/react-dom": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/@radix-ui/react-collapsible/node_modules/@radix-ui/primitive": {
|
||||
"version": "1.1.1",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.1.tgz",
|
||||
"integrity": "sha512-SJ31y+Q/zAyShtXJc8x83i9TYdbAfHZ++tUZnvjJJqFjzsdUnKsxPL6IEtBlxKkU7yzer//GQtZSV4GbldL3YA==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@radix-ui/react-collapsible/node_modules/@radix-ui/react-compose-refs": {
|
||||
"version": "1.1.1",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-compose-refs/-/react-compose-refs-1.1.1.tgz",
|
||||
"integrity": "sha512-Y9VzoRDSJtgFMUCoiZBDVo084VQ5hfpXxVE+NgkdNsjiDBByiImMZKKhxMwCbdHvhlENG6a833CbFkOQvTricw==",
|
||||
"license": "MIT",
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/@radix-ui/react-collapsible/node_modules/@radix-ui/react-context": {
|
||||
"version": "1.1.1",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-context/-/react-context-1.1.1.tgz",
|
||||
"integrity": "sha512-UASk9zi+crv9WteK/NU4PLvOoL3OuE6BWVKNF6hPRBtYBDXQ2u5iu3O59zUlJiTVvkyuycnqrztsHVJwcK9K+Q==",
|
||||
"license": "MIT",
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/@radix-ui/react-collapsible/node_modules/@radix-ui/react-presence": {
|
||||
"version": "1.1.2",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-presence/-/react-presence-1.1.2.tgz",
|
||||
"integrity": "sha512-18TFr80t5EVgL9x1SwF/YGtfG+l0BS0PRAlCWBDoBEiDQjeKgnNZRVJp/oVBl24sr3Gbfwc/Qpj4OcWTQMsAEg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@radix-ui/react-compose-refs": "1.1.1",
|
||||
"@radix-ui/react-use-layout-effect": "1.1.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"@types/react-dom": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
|
||||
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
},
|
||||
"@types/react-dom": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/@radix-ui/react-collapsible/node_modules/@radix-ui/react-primitive": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.0.1.tgz",
|
||||
"integrity": "sha512-sHCWTtxwNn3L3fH8qAfnF3WbUZycW93SM1j3NFDzXBiz8D6F5UTTy8G1+WFEaiCdvCVRJWj6N2R4Xq6HdiHmDg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@radix-ui/react-slot": "1.1.1"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"@types/react-dom": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
|
||||
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
},
|
||||
"@types/react-dom": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/@radix-ui/react-collapsible/node_modules/@radix-ui/react-slot": {
|
||||
"version": "1.1.1",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.1.1.tgz",
|
||||
"integrity": "sha512-RApLLOcINYJA+dMVbOju7MYv1Mb2EBp2nH4HdDzXTSyaR5optlm6Otrz1euW3HbdOR8UmmFK06TD+A9frYWv+g==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@radix-ui/react-compose-refs": "1.1.1"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/@radix-ui/react-collection": {
|
||||
"version": "1.1.0",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-collection/-/react-collection-1.1.0.tgz",
|
||||
|
||||
@@ -19,7 +19,6 @@
|
||||
"@headlessui/tailwindcss": "^0.2.1",
|
||||
"@phosphor-icons/react": "^2.0.8",
|
||||
"@radix-ui/react-checkbox": "^1.1.2",
|
||||
"@radix-ui/react-collapsible": "^1.1.2",
|
||||
"@radix-ui/react-dialog": "^1.1.2",
|
||||
"@radix-ui/react-dropdown-menu": "^2.1.4",
|
||||
"@radix-ui/react-label": "^2.1.1",
|
||||
|
||||
@@ -1,16 +1,41 @@
|
||||
import { defineConfig, devices } from "@playwright/test";
|
||||
|
||||
export default defineConfig({
|
||||
globalSetup: require.resolve("./tests/e2e/global-setup"),
|
||||
|
||||
workers: 1, // temporary change to see if single threaded testing stabilizes the tests
|
||||
testDir: "./tests/e2e", // Folder for test files
|
||||
reporter: "list",
|
||||
// Configure paths for screenshots
|
||||
// expect: {
|
||||
// toMatchSnapshot: {
|
||||
// threshold: 0.2, // Adjust the threshold for visual diffs
|
||||
// },
|
||||
// },
|
||||
// reporter: [["html", { outputFolder: "test-results/output/report" }]], // HTML report location
|
||||
// outputDir: "test-results/output/screenshots", // Set output folder for test artifacts
|
||||
projects: [
|
||||
{
|
||||
name: "admin",
|
||||
// dependency for admin workflows
|
||||
name: "admin_setup",
|
||||
testMatch: /.*\admin_auth\.setup\.ts/,
|
||||
},
|
||||
{
|
||||
// tests admin workflows
|
||||
name: "chromium-admin",
|
||||
grep: /@admin/,
|
||||
use: {
|
||||
...devices["Desktop Chrome"],
|
||||
// Use prepared auth state.
|
||||
storageState: "admin_auth.json",
|
||||
},
|
||||
testIgnore: ["**/codeUtils.test.ts"],
|
||||
dependencies: ["admin_setup"],
|
||||
},
|
||||
{
|
||||
// tests logged out / guest workflows
|
||||
name: "chromium-guest",
|
||||
grep: /@guest/,
|
||||
use: {
|
||||
...devices["Desktop Chrome"],
|
||||
},
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
@@ -40,7 +40,14 @@ import * as Yup from "yup";
|
||||
import CollapsibleSection from "./CollapsibleSection";
|
||||
import { SuccessfulPersonaUpdateRedirectType } from "./enums";
|
||||
import { Persona, PersonaLabel, StarterMessage } from "./interfaces";
|
||||
import { PersonaUpsertParameters, createPersona, updatePersona } from "./lib";
|
||||
import {
|
||||
createPersonaLabel,
|
||||
PersonaUpsertParameters,
|
||||
createPersona,
|
||||
deletePersonaLabel,
|
||||
updatePersonaLabel,
|
||||
updatePersona,
|
||||
} from "./lib";
|
||||
import {
|
||||
CameraIcon,
|
||||
GroupsIconSkeleton,
|
||||
@@ -73,10 +80,9 @@ import { errorHandlingFetcher } from "@/lib/fetcher";
|
||||
import { DeleteEntityModal } from "@/components/modals/DeleteEntityModal";
|
||||
import { DeletePersonaButton } from "./[id]/DeletePersonaButton";
|
||||
import Title from "@/components/ui/title";
|
||||
import { SEARCH_TOOL_ID } from "@/app/chat/tools/constants";
|
||||
|
||||
function findSearchTool(tools: ToolSnapshot[]) {
|
||||
return tools.find((tool) => tool.in_code_tool_id === SEARCH_TOOL_ID);
|
||||
return tools.find((tool) => tool.in_code_tool_id === "SearchTool");
|
||||
}
|
||||
|
||||
function findImageGenerationTool(tools: ToolSnapshot[]) {
|
||||
@@ -232,9 +238,11 @@ export function AssistantEditor({
|
||||
existingPersona?.llm_model_provider_override ?? null,
|
||||
llm_model_version_override:
|
||||
existingPersona?.llm_model_version_override ?? null,
|
||||
starter_messages: existingPersona?.starter_messages?.length
|
||||
? existingPersona.starter_messages
|
||||
: [{ message: "" }],
|
||||
starter_messages: existingPersona?.starter_messages ?? [
|
||||
{
|
||||
message: "",
|
||||
},
|
||||
],
|
||||
enabled_tools_map: enabledToolsMap,
|
||||
icon_color: existingPersona?.icon_color ?? defautIconColor,
|
||||
icon_shape: existingPersona?.icon_shape ?? defaultIconShape,
|
||||
@@ -902,11 +910,28 @@ export function AssistantEditor({
|
||||
|
||||
{internetSearchTool && (
|
||||
<>
|
||||
<BooleanFormField
|
||||
name={`enabled_tools_map.${internetSearchTool.id}`}
|
||||
label={internetSearchTool.display_name}
|
||||
subtext="Access real-time information and search the web for up-to-date results"
|
||||
/>
|
||||
<div className="flex items-center content-start mb-2">
|
||||
<Checkbox
|
||||
size="sm"
|
||||
id={`enabled_tools_map.${internetSearchTool.id}`}
|
||||
checked={
|
||||
values.enabled_tools_map[internetSearchTool.id]
|
||||
}
|
||||
onCheckedChange={() => {
|
||||
toggleToolInValues(internetSearchTool.id);
|
||||
}}
|
||||
name={`enabled_tools_map.${internetSearchTool.id}`}
|
||||
/>
|
||||
<div className="flex flex-col ml-2">
|
||||
<span className="text-sm">
|
||||
{internetSearchTool.display_name}
|
||||
</span>
|
||||
<span className="text-xs text-subtle">
|
||||
Access real-time information and search the web
|
||||
for up-to-date results
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
</>
|
||||
)}
|
||||
|
||||
@@ -1097,9 +1122,7 @@ export function AssistantEditor({
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<Separator />
|
||||
|
||||
<div className="w-full flex flex-col">
|
||||
<div className="flex gap-x-2 items-center">
|
||||
<div className="block font-medium text-sm">
|
||||
@@ -1110,7 +1133,6 @@ export function AssistantEditor({
|
||||
<SubLabel>
|
||||
Sample messages that help users understand what this
|
||||
assistant can do and how to interact with it effectively.
|
||||
New input fields will appear automatically as you type.
|
||||
</SubLabel>
|
||||
|
||||
<div className="w-full">
|
||||
|
||||
@@ -64,16 +64,19 @@ export default function StarterMessagesList({
|
||||
size="icon"
|
||||
onClick={() => {
|
||||
arrayHelpers.remove(index);
|
||||
if (
|
||||
index === values.length - 2 &&
|
||||
!values[values.length - 1].message
|
||||
) {
|
||||
arrayHelpers.pop();
|
||||
}
|
||||
}}
|
||||
className={`text-gray-400 hover:text-red-500 ${
|
||||
index === values.length - 1 && !starterMessage.message
|
||||
? "opacity-50 cursor-not-allowed"
|
||||
: ""
|
||||
}`}
|
||||
disabled={
|
||||
(index === values.length - 1 && !starterMessage.message) ||
|
||||
(values.length === 1 && index === 0) // should never happen, but just in case
|
||||
}
|
||||
disabled={index === values.length - 1 && !starterMessage.message}
|
||||
>
|
||||
<FiTrash2 className="h-4 w-4" />
|
||||
</Button>
|
||||
|
||||
@@ -1,21 +1,38 @@
"use client";

import React, { useMemo } from "react";
import { Formik } from "formik";
import { ArrayHelpers, FieldArray, Form, Formik } from "formik";
import * as Yup from "yup";
import { usePopup } from "@/components/admin/connectors/Popup";
import { DocumentSet, SlackChannelConfig } from "@/lib/types";
import {
BooleanFormField,
Label,
SelectorFormField,
SubLabel,
TextArrayField,
TextFormField,
} from "@/components/admin/connectors/Field";
import {
createSlackChannelConfig,
isPersonaASlackBotPersona,
updateSlackChannelConfig,
} from "../lib";
import CardSection from "@/components/admin/CardSection";
import { Button } from "@/components/ui/button";
import { useRouter } from "next/navigation";
import { Persona } from "@/app/admin/assistants/interfaces";
import { useState } from "react";
import { AdvancedOptionsToggle } from "@/components/AdvancedOptionsToggle";
import { DocumentSetSelectable } from "@/components/documentSet/DocumentSetSelectable";
import CollapsibleSection from "@/app/admin/assistants/CollapsibleSection";
import { StandardAnswerCategoryResponse } from "@/components/standardAnswers/getStandardAnswerCategoriesIfEE";
import { SEARCH_TOOL_ID, SEARCH_TOOL_NAME } from "@/app/chat/tools/constants";
import { SlackChannelConfigFormFields } from "./SlackChannelConfigFormFields";
import { StandardAnswerCategoryDropdownField } from "@/components/standardAnswers/StandardAnswerCategoryDropdown";
import {
Tabs,
TabsList,
TabsTrigger,
TabsContent,
} from "@/components/ui/fully_wrapped_tabs";

export const SlackChannelConfigCreationForm = ({
slack_bot_id,
@@ -30,175 +47,353 @@ export const SlackChannelConfigCreationForm = ({
standardAnswerCategoryResponse: StandardAnswerCategoryResponse;
existingSlackChannelConfig?: SlackChannelConfig;
}) => {
const isUpdate = existingSlackChannelConfig !== undefined;
const { popup, setPopup } = usePopup();
const router = useRouter();
const isUpdate = Boolean(existingSlackChannelConfig);
const existingSlackBotUsesPersona = existingSlackChannelConfig?.persona
? !isPersonaASlackBotPersona(existingSlackChannelConfig.persona)
: false;
const [usingPersonas, setUsingPersonas] = useState(
existingSlackBotUsesPersona
);
const [showAdvancedOptions, setShowAdvancedOptions] = useState(false);

const searchEnabledAssistants = useMemo(() => {
return personas.filter((persona) => {
return persona.tools.some(
(tool) => tool.in_code_tool_id == SEARCH_TOOL_ID
);
});
}, [personas]);
const knowledgePersona = personas.find((persona) => persona.id === 0);

return (
<CardSection className="max-w-4xl">
{popup}
<Formik
initialValues={{
slack_bot_id: slack_bot_id,
channel_name:
existingSlackChannelConfig?.channel_config.channel_name || "",
answer_validity_check_enabled: (
existingSlackChannelConfig?.channel_config?.answer_filters || []
).includes("well_answered_postfilter"),
questionmark_prefilter_enabled: (
existingSlackChannelConfig?.channel_config?.answer_filters || []
).includes("questionmark_prefilter"),
respond_tag_only:
existingSlackChannelConfig?.channel_config?.respond_tag_only ||
false,
respond_to_bots:
existingSlackChannelConfig?.channel_config?.respond_to_bots ||
false,
show_continue_in_web_ui:
existingSlackChannelConfig?.channel_config
?.show_continue_in_web_ui ?? !isUpdate,
enable_auto_filters:
existingSlackChannelConfig?.enable_auto_filters || false,
respond_member_group_list:
existingSlackChannelConfig?.channel_config
?.respond_member_group_list || [],
still_need_help_enabled:
existingSlackChannelConfig?.channel_config?.follow_up_tags !==
undefined,
follow_up_tags:
existingSlackChannelConfig?.channel_config?.follow_up_tags ||
undefined,
document_sets:
existingSlackChannelConfig && existingSlackChannelConfig.persona
? existingSlackChannelConfig.persona.document_sets.map(
(documentSet) => documentSet.id
)
: ([] as number[]),
persona_id:
existingSlackChannelConfig?.persona &&
!isPersonaASlackBotPersona(existingSlackChannelConfig.persona)
? existingSlackChannelConfig.persona.id
: null,
response_type:
existingSlackChannelConfig?.response_type || "citations",
standard_answer_categories:
existingSlackChannelConfig?.standard_answer_categories || [],
knowledge_source: existingSlackBotUsesPersona
? "assistant"
: existingSlackChannelConfig?.persona
? "document_sets"
: "all_public",
}}
validationSchema={Yup.object().shape({
slack_bot_id: Yup.number().required(),
channel_name: Yup.string().required("Channel Name is required"),
response_type: Yup.string()
.oneOf(["quotes", "citations"])
.required("Response type is required"),
answer_validity_check_enabled: Yup.boolean().required(),
questionmark_prefilter_enabled: Yup.boolean().required(),
respond_tag_only: Yup.boolean().required(),
respond_to_bots: Yup.boolean().required(),
show_continue_in_web_ui: Yup.boolean().required(),
enable_auto_filters: Yup.boolean().required(),
respond_member_group_list: Yup.array().of(Yup.string()).required(),
still_need_help_enabled: Yup.boolean().required(),
follow_up_tags: Yup.array().of(Yup.string()),
document_sets: Yup.array()
.of(Yup.number())
.when("knowledge_source", {
is: "document_sets",
then: (schema) =>
schema.min(
1,
"At least one Document Set is required when using the 'Document Sets' knowledge source"
),
}),
persona_id: Yup.number()
.nullable()
.when("knowledge_source", {
is: "assistant",
then: (schema) =>
schema.required(
"A persona is required when using the'Assistant' knowledge source"
),
}),
standard_answer_categories: Yup.array(),
knowledge_source: Yup.string()
.oneOf(["all_public", "document_sets", "assistant"])
.required(),
})}
onSubmit={async (values, formikHelpers) => {
formikHelpers.setSubmitting(true);

const cleanedValues = {
...values,
slack_bot_id,
channel_name: values.channel_name,
respond_member_group_list: values.respond_member_group_list,
usePersona: values.knowledge_source === "assistant",
<div>
<CardSection>
{popup}
<Formik
initialValues={{
slack_bot_id: slack_bot_id,
channel_name:
existingSlackChannelConfig?.channel_config.channel_name,
answer_validity_check_enabled: (
existingSlackChannelConfig?.channel_config?.answer_filters || []
).includes("well_answered_postfilter"),
questionmark_prefilter_enabled: (
existingSlackChannelConfig?.channel_config?.answer_filters || []
).includes("questionmark_prefilter"),
respond_tag_only:
existingSlackChannelConfig?.channel_config?.respond_tag_only ||
false,
respond_to_bots:
existingSlackChannelConfig?.channel_config?.respond_to_bots ||
false,
show_continue_in_web_ui:
// If we're updating, we want to keep the existing value
// Otherwise, we want to default to true
existingSlackChannelConfig?.channel_config
?.show_continue_in_web_ui ?? !isUpdate,
enable_auto_filters:
existingSlackChannelConfig?.enable_auto_filters || false,
respond_member_group_list:
existingSlackChannelConfig?.channel_config
?.respond_member_group_list ?? [],
still_need_help_enabled:
existingSlackChannelConfig?.channel_config?.follow_up_tags !==
undefined,
follow_up_tags:
existingSlackChannelConfig?.channel_config?.follow_up_tags,
document_sets:
values.knowledge_source === "document_sets"
? values.document_sets
: [],
existingSlackChannelConfig && existingSlackChannelConfig.persona
? existingSlackChannelConfig.persona.document_sets.map(
(documentSet) => documentSet.id
)
: ([] as number[]),
// prettier-ignore
persona_id:
values.knowledge_source === "assistant"
? values.persona_id
: null,
standard_answer_categories: values.standard_answer_categories.map(
(category: any) => category.id
),
};
existingSlackChannelConfig?.persona &&
!isPersonaASlackBotPersona(existingSlackChannelConfig.persona)
? existingSlackChannelConfig.persona.id
: knowledgePersona?.id ?? null,
response_type:
existingSlackChannelConfig?.response_type || "citations",
standard_answer_categories: existingSlackChannelConfig
? existingSlackChannelConfig.standard_answer_categories
: [],
}}
validationSchema={Yup.object().shape({
slack_bot_id: Yup.number().required(),
channel_name: Yup.string(),
response_type: Yup.string()
.oneOf(["quotes", "citations"])
.required(),
answer_validity_check_enabled: Yup.boolean().required(),
questionmark_prefilter_enabled: Yup.boolean().required(),
respond_tag_only: Yup.boolean().required(),
respond_to_bots: Yup.boolean().required(),
show_continue_in_web_ui: Yup.boolean().required(),
enable_auto_filters: Yup.boolean().required(),
respond_member_group_list: Yup.array().of(Yup.string()).required(),
still_need_help_enabled: Yup.boolean().required(),
follow_up_tags: Yup.array().of(Yup.string()),
document_sets: Yup.array().of(Yup.number()),
persona_id: Yup.number().nullable(),
standard_answer_categories: Yup.array(),
})}
onSubmit={async (values, formikHelpers) => {
formikHelpers.setSubmitting(true);

if (!cleanedValues.still_need_help_enabled) {
cleanedValues.follow_up_tags = undefined;
} else {
if (!cleanedValues.follow_up_tags) {
cleanedValues.follow_up_tags = [];
const cleanedValues = {
...values,
slack_bot_id: slack_bot_id,
channel_name: values.channel_name!,
respond_member_group_list: values.respond_member_group_list,
usePersona: usingPersonas,
standard_answer_categories: values.standard_answer_categories.map(
(category) => category.id
),
};
if (!cleanedValues.still_need_help_enabled) {
cleanedValues.follow_up_tags = undefined;
} else {
if (!cleanedValues.follow_up_tags) {
cleanedValues.follow_up_tags = [];
}
}
}

const response = isUpdate
? await updateSlackChannelConfig(
existingSlackChannelConfig!.id,
let response;
if (isUpdate) {
response = await updateSlackChannelConfig(
existingSlackChannelConfig.id,
cleanedValues
)
: await createSlackChannelConfig(cleanedValues);
);
} else {
response = await createSlackChannelConfig(cleanedValues);
}
formikHelpers.setSubmitting(false);
if (response.ok) {
router.push(`/admin/bots/${slack_bot_id}`);
} else {
const responseJson = await response.json();
const errorMsg = responseJson.detail || responseJson.message;
setPopup({
message: isUpdate
? `Error updating OnyxBot config - ${errorMsg}`
: `Error creating OnyxBot config - ${errorMsg}`,
type: "error",
});
}
}}
>
{({ isSubmitting, values, setFieldValue }) => (
<Form>
<div className="px-6 pb-6 pt-4 w-full">
<TextFormField
name="channel_name"
label="Slack Channel Name:"
/>

formikHelpers.setSubmitting(false);
if (response.ok) {
router.push(`/admin/bots/${slack_bot_id}`);
} else {
const responseJson = await response.json();
const errorMsg = responseJson.detail || responseJson.message;
setPopup({
message: `Error ${
isUpdate ? "updating" : "creating"
} OnyxBot config - ${errorMsg}`,
type: "error",
});
}
}}
>
<SlackChannelConfigFormFields
isUpdate={isUpdate}
documentSets={documentSets}
searchEnabledAssistants={searchEnabledAssistants}
standardAnswerCategoryResponse={standardAnswerCategoryResponse}
setPopup={setPopup}
/>
</Formik>
</CardSection>
<div className="mt-6">
<Label>Knowledge Sources</Label>
<SubLabel>
Controls which information OnyxBot will pull from when
answering questions.
</SubLabel>

<Tabs
defaultValue="document_sets"
className="w-full mt-4"
value={usingPersonas ? "assistants" : "document_sets"}
onValueChange={(value) =>
setUsingPersonas(value === "assistants")
}
>
<TabsList>
<TabsTrigger value="document_sets">
Document Sets
</TabsTrigger>
<TabsTrigger value="assistants">Assistants</TabsTrigger>
</TabsList>

<TabsContent value="assistants">
<SubLabel>
Select the assistant OnyxBot will use while answering
questions in Slack.
</SubLabel>
<SelectorFormField
name="persona_id"
options={personas.map((persona) => {
return {
name: persona.name,
value: persona.id,
};
})}
/>
</TabsContent>

<TabsContent value="document_sets">
<SubLabel>
Select the document sets OnyxBot will use while
answering questions in Slack.
</SubLabel>
<SubLabel>
Note: If No Document Sets are selected, OnyxBot will
search through all connected documents.
</SubLabel>
<FieldArray
name="document_sets"
render={(arrayHelpers: ArrayHelpers) => (
<div>
<div className="mb-3 mt-2 flex gap-2 flex-wrap text-sm">
{documentSets.map((documentSet) => {
const ind = values.document_sets.indexOf(
documentSet.id
);
const isSelected = ind !== -1;

return (
<DocumentSetSelectable
key={documentSet.id}
documentSet={documentSet}
isSelected={isSelected}
onSelect={() => {
if (isSelected) {
arrayHelpers.remove(ind);
} else {
arrayHelpers.push(documentSet.id);
}
}}
/>
);
})}
</div>
<div></div>
</div>
)}
/>
</TabsContent>
</Tabs>
</div>

<div className="mt-6">
<AdvancedOptionsToggle
showAdvancedOptions={showAdvancedOptions}
setShowAdvancedOptions={setShowAdvancedOptions}
/>
</div>

{showAdvancedOptions && (
<div className="mt-4">
<div className="w-64 mb-4">
<SelectorFormField
name="response_type"
label="Answer Type"
tooltip="Controls the format of OnyxBot's responses."
options={[
{ name: "Standard", value: "citations" },
{ name: "Detailed", value: "quotes" },
]}
/>
</div>

<BooleanFormField
name="show_continue_in_web_ui"
removeIndent
label="Show Continue in Web UI button"
tooltip="If set, will show a button at the bottom of the response that allows the user to continue the conversation in the Onyx Web UI"
/>
<div className="flex flex-col space-y-3 mt-2">
<BooleanFormField
name="still_need_help_enabled"
removeIndent
label={'Give a "Still need help?" button'}
tooltip={`OnyxBot's response will include a button at the bottom
of the response that asks the user if they still need help.`}
/>
{values.still_need_help_enabled && (
<CollapsibleSection prompt="Configure Still Need Help Button">
<TextArrayField
name="follow_up_tags"
label="(Optional) Users / Groups to Tag"
values={values}
subtext={
<div>
The Slack users / groups we should tag if the
user clicks the &quot;Still need help?&quot;
button. If no emails are provided, we will not
tag anyone and will just react with a 🆘 emoji
to the original message.
</div>
}
placeholder="User email or user group name..."
/>
</CollapsibleSection>
)}

<BooleanFormField
name="answer_validity_check_enabled"
removeIndent
label="Only respond if citations found"
tooltip="If set, will only answer questions where the model successfully produces citations"
/>
<BooleanFormField
name="questionmark_prefilter_enabled"
removeIndent
label="Only respond to questions"
tooltip="If set, will only respond to messages that contain a question mark"
/>
<BooleanFormField
name="respond_tag_only"
removeIndent
label="Respond to @OnyxBot Only"
tooltip="If set, OnyxBot will only respond when directly tagged"
/>
<BooleanFormField
name="respond_to_bots"
removeIndent
label="Respond to Bot messages"
tooltip="If not set, OnyxBot will always ignore messages from Bots"
/>
<BooleanFormField
name="enable_auto_filters"
removeIndent
label="Enable LLM Autofiltering"
tooltip="If set, the LLM will generate source and time filters based on the user's query"
/>

<div className="mt-12">
<TextArrayField
name="respond_member_group_list"
label="(Optional) Respond to Certain Users / Groups"
subtext={
"If specified, OnyxBot responses will only " +
"be visible to the members or groups in this list."
}
values={values}
placeholder="User email or user group name..."
/>
</div>
</div>

<StandardAnswerCategoryDropdownField
standardAnswerCategoryResponse={
standardAnswerCategoryResponse
}
categories={values.standard_answer_categories}
setCategories={(categories) =>
setFieldValue("standard_answer_categories", categories)
}
/>
</div>
)}

<div className="flex">
<Button
type="submit"
variant="submit"
disabled={isSubmitting || !values.channel_name}
className="mx-auto w-64"
>
{isUpdate ? "Update!" : "Create!"}
</Button>
</div>
</div>
</Form>
)}
</Formik>
</CardSection>
</div>
);
};

@@ -1,530 +0,0 @@
"use client";

import React, { useState, useEffect, useMemo } from "react";
import { FieldArray, Form, useFormikContext, ErrorMessage } from "formik";
import { CCPairDescriptor, DocumentSet } from "@/lib/types";
import {
BooleanFormField,
Label,
SelectorFormField,
SubLabel,
TextArrayField,
TextFormField,
} from "@/components/admin/connectors/Field";
import { Button } from "@/components/ui/button";
import { Persona } from "@/app/admin/assistants/interfaces";
import { AdvancedOptionsToggle } from "@/components/AdvancedOptionsToggle";
import { DocumentSetSelectable } from "@/components/documentSet/DocumentSetSelectable";
import CollapsibleSection from "@/app/admin/assistants/CollapsibleSection";
import { StandardAnswerCategoryResponse } from "@/components/standardAnswers/getStandardAnswerCategoriesIfEE";
import { StandardAnswerCategoryDropdownField } from "@/components/standardAnswers/StandardAnswerCategoryDropdown";
import { RadioGroup } from "@/components/ui/radio-group";
import { RadioGroupItemField } from "@/components/ui/RadioGroupItemField";
import { AlertCircle, View } from "lucide-react";
import { useRouter } from "next/navigation";
import {
Tooltip,
TooltipContent,
TooltipTrigger,
} from "@/components/ui/tooltip";
import { TooltipProvider } from "@radix-ui/react-tooltip";
import { SourceIcon } from "@/components/SourceIcon";
import Link from "next/link";
import { AssistantIcon } from "@/components/assistants/AssistantIcon";

interface SlackChannelConfigFormFieldsProps {
isUpdate: boolean;
documentSets: DocumentSet[];
searchEnabledAssistants: Persona[];
standardAnswerCategoryResponse: StandardAnswerCategoryResponse;
setPopup: (popup: {
message: string;
type: "error" | "success" | "warning";
}) => void;
}

export function SlackChannelConfigFormFields({
isUpdate,
documentSets,
searchEnabledAssistants,
standardAnswerCategoryResponse,
setPopup,
}: SlackChannelConfigFormFieldsProps) {
const router = useRouter();
const { values, setFieldValue } = useFormikContext<any>();
const [showAdvancedOptions, setShowAdvancedOptions] = useState(false);
const [viewUnselectableSets, setViewUnselectableSets] = useState(false);
const [viewSyncEnabledAssistants, setViewSyncEnabledAssistants] =
useState(false);

const documentSetContainsSync = (documentSet: DocumentSet) =>
documentSet.cc_pair_descriptors.some(
(descriptor) => descriptor.access_type === "sync"
);

const [syncEnabledAssistants, availableAssistants] = useMemo(() => {
const sync: Persona[] = [];
const available: Persona[] = [];

searchEnabledAssistants.forEach((persona) => {
const hasSyncSet = persona.document_sets.some(documentSetContainsSync);
if (hasSyncSet) {
sync.push(persona);
} else {
available.push(persona);
}
});

return [sync, available];
}, [searchEnabledAssistants]);

const unselectableSets = useMemo(() => {
return documentSets.filter((ds) =>
ds.cc_pair_descriptors.some(
(descriptor) => descriptor.access_type === "sync"
)
);
}, [documentSets]);
const memoizedPrivateConnectors = useMemo(() => {
const uniqueDescriptors = new Map();
documentSets.forEach((ds) => {
ds.cc_pair_descriptors.forEach((descriptor) => {
if (
descriptor.access_type === "private" &&
!uniqueDescriptors.has(descriptor.id)
) {
uniqueDescriptors.set(descriptor.id, descriptor);
}
});
});
return Array.from(uniqueDescriptors.values());
}, [documentSets]);

useEffect(() => {
const invalidSelected = values.document_sets.filter((dsId: number) =>
unselectableSets.some((us) => us.id === dsId)
);
if (invalidSelected.length > 0) {
setFieldValue(
"document_sets",
values.document_sets.filter(
(dsId: number) => !invalidSelected.includes(dsId)
)
);
setPopup({
message:
"We removed one or more document sets from your selection because they are no longer valid. Please review and update your configuration.",
type: "warning",
});
}
}, [unselectableSets, values.document_sets, setFieldValue, setPopup]);

const documentSetContainsPrivate = (documentSet: DocumentSet) => {
return documentSet.cc_pair_descriptors.some(
(descriptor) => descriptor.access_type === "private"
);
};

const shouldShowPrivacyAlert = useMemo(() => {
if (values.knowledge_source === "document_sets") {
const selectedSets = documentSets.filter((ds) =>
values.document_sets.includes(ds.id)
);
return selectedSets.some((ds) => documentSetContainsPrivate(ds));
} else if (values.knowledge_source === "assistant") {
const chosenAssistant = searchEnabledAssistants.find(
(p) => p.id == values.persona_id
);
return chosenAssistant?.document_sets.some((ds) =>
documentSetContainsPrivate(ds)
);
}
return false;
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [values.knowledge_source, values.document_sets, values.persona_id]);

const selectableSets = useMemo(() => {
return documentSets.filter(
(ds) =>
!ds.cc_pair_descriptors.some(
(descriptor) => descriptor.access_type === "sync"
)
);
}, [documentSets]);

return (
<Form className="px-6 max-w-4xl">
<div className="pt-4 w-full">
<TextFormField name="channel_name" label="Slack Channel Name:" />

<div className="space-y-2 mt-4">
<Label>Knowledge Source</Label>
<RadioGroup
className="flex flex-col gap-y-4"
value={values.knowledge_source}
onValueChange={(value: string) => {
setFieldValue("knowledge_source", value);
}}
>
<RadioGroupItemField
value="all_public"
id="all_public"
label="All Public Knowledge"
sublabel="Let OnyxBot respond based on information from all public connectors "
/>
{selectableSets.length + unselectableSets.length > 0 && (
<RadioGroupItemField
value="document_sets"
id="document_sets"
label="Specific Document Sets"
sublabel="Control which documents to use for answering questions"
/>
)}
<RadioGroupItemField
value="assistant"
id="assistant"
label="Specific Assistant"
sublabel="Control both the documents and the prompt to use for answering questions"
/>
</RadioGroup>
</div>

{values.knowledge_source === "document_sets" &&
documentSets.length > 0 && (
<div className="mt-4">
<SubLabel>
<>
Select the document sets OnyxBot will use while answering
questions in Slack.
<br />
{unselectableSets.length > 0 ? (
<span>
Some incompatible document sets are{" "}
{viewUnselectableSets ? "visible" : "hidden"}.{" "}
<button
type="button"
onClick={() =>
setViewUnselectableSets(
(viewUnselectableSets) => !viewUnselectableSets
)
}
className="text-sm text-link"
>
{viewUnselectableSets
? "Hide un-selectable "
: "View all "}
document sets
</button>
</span>
) : (
""
)}
</>
</SubLabel>
<FieldArray
name="document_sets"
render={(arrayHelpers) => (
<>
{selectableSets.length > 0 && (
<div className="mb-3 mt-2 flex gap-2 flex-wrap text-sm">
{selectableSets.map((documentSet) => {
const selectedIndex = values.document_sets.indexOf(
documentSet.id
);
const isSelected = selectedIndex !== -1;

return (
<DocumentSetSelectable
key={documentSet.id}
documentSet={documentSet}
isSelected={isSelected}
onSelect={() => {
if (isSelected) {
arrayHelpers.remove(selectedIndex);
} else {
arrayHelpers.push(documentSet.id);
}
}}
/>
);
})}
</div>
)}

{viewUnselectableSets && unselectableSets.length > 0 && (
<div className="mt-4">
<p className="text-sm text-text-dark/80">
These document sets cannot be attached as they have
auto-synced docs:
</p>
<div className="mb-3 mt-2 flex gap-2 flex-wrap text-sm">
{unselectableSets.map((documentSet) => (
<DocumentSetSelectable
key={documentSet.id}
documentSet={documentSet}
disabled
disabledTooltip="Unable to use this document set because it contains a connector with auto-sync permissions. OnyxBot's responses in this channel are visible to all Slack users, so mirroring the asker's permissions could inadvertently expose private information."
isSelected={false}
onSelect={() => {}}
/>
))}
</div>
</div>
)}
<ErrorMessage
className="text-red-500 text-sm mt-1"
name="document_sets"
component="div"
/>
</>
)}
/>
</div>
)}

{values.knowledge_source === "assistant" && (
<div className="mt-4">
<SubLabel>
<>
Select the search-enabled assistant OnyxBot will use while
answering questions in Slack.
{syncEnabledAssistants.length > 0 && (
<>
<br />
<span className="text-sm text-text-dark/80">
Note: Some of your assistants have auto-synced connectors
in their document sets. You cannot select these assistants
as they will not be able to answer questions in Slack.{" "}
<button
type="button"
onClick={() =>
setViewSyncEnabledAssistants(
(viewSyncEnabledAssistants) =>
!viewSyncEnabledAssistants
)
}
className="text-sm text-link"
>
{viewSyncEnabledAssistants
? "Hide un-selectable "
: "View all "}
assistants
</button>
</span>
</>
)}
</>
</SubLabel>

<SelectorFormField
name="persona_id"
options={availableAssistants.map((persona) => ({
name: persona.name,
value: persona.id,
}))}
/>
{viewSyncEnabledAssistants && syncEnabledAssistants.length > 0 && (
<div className="mt-4">
<p className="text-sm text-text-dark/80">
Un-selectable assistants:
</p>
<div className="mb-3 mt-2 flex gap-2 flex-wrap text-sm">
{syncEnabledAssistants.map((persona: Persona) => (
<button
type="button"
onClick={() =>
router.push(`/admin/assistants/${persona.id}`)
}
key={persona.id}
className="p-2 bg-background-100 cursor-pointer rounded-md flex items-center gap-2"
>
<AssistantIcon
assistant={persona}
size={16}
className="flex-none"
/>
{persona.name}
</button>
))}
</div>
</div>
)}
</div>
)}
</div>

<div className="mt-2">
|
||||
<AdvancedOptionsToggle
|
||||
showAdvancedOptions={showAdvancedOptions}
|
||||
setShowAdvancedOptions={setShowAdvancedOptions}
|
||||
/>
|
||||
</div>
|
||||
{showAdvancedOptions && (
|
||||
<div className="mt-4">
|
||||
<div className="w-64 mb-4">
|
||||
<SelectorFormField
|
||||
name="response_type"
|
||||
label="Answer Type"
|
||||
tooltip="Controls the format of OnyxBot's responses."
|
||||
options={[
|
||||
{ name: "Standard", value: "citations" },
|
||||
{ name: "Detailed", value: "quotes" },
|
||||
]}
|
||||
/>
|
||||
</div>
|
||||
|
||||
<BooleanFormField
|
||||
name="show_continue_in_web_ui"
|
||||
removeIndent
|
||||
label="Show Continue in Web UI button"
|
||||
tooltip="If set, will show a button at the bottom of the response that allows the user to continue the conversation in the Onyx Web UI"
|
||||
/>
|
||||
|
||||
<div className="flex flex-col space-y-3 mt-2">
|
||||
<BooleanFormField
|
||||
name="still_need_help_enabled"
|
||||
removeIndent
|
||||
onChange={(checked: boolean) => {
|
||||
setFieldValue("still_need_help_enabled", checked);
|
||||
if (!checked) {
|
||||
setFieldValue("follow_up_tags", []);
|
||||
}
|
||||
}}
|
||||
label={'Give a "Still need help?" button'}
|
||||
tooltip={`OnyxBot's response will include a button at the bottom
|
||||
of the response that asks the user if they still need help.`}
|
||||
/>
|
||||
{values.still_need_help_enabled && (
|
||||
<CollapsibleSection prompt="Configure Still Need Help Button">
|
||||
<TextArrayField
|
||||
name="follow_up_tags"
|
||||
label="(Optional) Users / Groups to Tag"
|
||||
values={values}
|
||||
subtext={
|
||||
<div>
|
||||
The Slack users / groups we should tag if the user clicks
|
||||
the "Still need help?" button. If no emails are
|
||||
provided, we will not tag anyone and will just react with
|
||||
a 🆘 emoji to the original message.
|
||||
</div>
|
||||
}
|
||||
placeholder="User email or user group name..."
|
||||
/>
|
||||
</CollapsibleSection>
|
||||
)}
|
||||
|
||||
<BooleanFormField
|
||||
name="answer_validity_check_enabled"
|
||||
removeIndent
|
||||
label="Only respond if citations found"
|
||||
tooltip="If set, will only answer questions where the model successfully produces citations"
|
||||
/>
|
||||
<BooleanFormField
|
||||
name="questionmark_prefilter_enabled"
|
||||
removeIndent
|
||||
label="Only respond to questions"
|
||||
tooltip="If set, OnyxBot will only respond to messages that contain a question mark"
|
||||
/>
|
||||
<BooleanFormField
|
||||
name="respond_tag_only"
|
||||
removeIndent
|
||||
label="Respond to @OnyxBot Only"
|
||||
tooltip="If set, OnyxBot will only respond when directly tagged"
|
||||
/>
|
||||
<BooleanFormField
|
||||
name="respond_to_bots"
|
||||
removeIndent
|
||||
label="Respond to Bot messages"
|
||||
tooltip="If not set, OnyxBot will always ignore messages from Bots"
|
||||
/>
|
||||
<BooleanFormField
|
||||
name="enable_auto_filters"
|
||||
removeIndent
|
||||
label="Enable LLM Autofiltering"
|
||||
tooltip="If set, the LLM will generate source and time filters based on the user's query"
|
||||
/>
|
||||
|
||||
<div className="mt-12">
|
||||
<TextArrayField
|
||||
name="respond_member_group_list"
|
||||
label="(Optional) Respond to Certain Users / Groups"
|
||||
subtext={
|
||||
"If specified, OnyxBot responses will only " +
|
||||
"be visible to the members or groups in this list."
|
||||
}
|
||||
values={values}
|
||||
placeholder="User email or user group name..."
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<StandardAnswerCategoryDropdownField
|
||||
standardAnswerCategoryResponse={standardAnswerCategoryResponse}
|
||||
categories={values.standard_answer_categories}
|
||||
setCategories={(categories: any) =>
|
||||
setFieldValue("standard_answer_categories", categories)
|
||||
}
|
||||
/>
|
||||
</div>
|
||||
)}
|
||||
|
||||
<div className="flex mt-2 gap-x-2 w-full justify-end flex">
|
||||
{shouldShowPrivacyAlert && (
|
||||
<TooltipProvider>
|
||||
<Tooltip>
|
||||
<TooltipTrigger asChild>
|
||||
<div className="flex hover:bg-background-150 cursor-pointer p-2 rounded-lg items-center">
|
||||
<AlertCircle className="h-5 w-5 text-alert" />
|
||||
</div>
|
||||
</TooltipTrigger>
|
||||
<TooltipContent side="top" className="bg-white p-4 w-80">
|
||||
<Label className="text-text mb-2 font-semibold">
|
||||
Privacy Alert
|
||||
</Label>
|
||||
<p className="text-sm text-text-darker mb-4">
|
||||
Please note that at least one of the documents accessible by
|
||||
your OnyxBot is marked as private and may contain sensitive
|
||||
information. These documents will be accessible to all users
|
||||
of this OnyxBot. Ensure this aligns with your intended
|
||||
document sharing policy.
|
||||
</p>
|
||||
<div className="space-y-2">
|
||||
<h4 className="text-sm text-text font-medium">
|
||||
Relevant Connectors:
|
||||
</h4>
|
||||
<div className="max-h-40 overflow-y-auto border-t border-text-subtle flex-col gap-y-2">
|
||||
{memoizedPrivateConnectors.map(
|
||||
(ccpairinfo: CCPairDescriptor<any, any>) => (
|
||||
<Link
|
||||
key={ccpairinfo.id}
|
||||
href={`/admin/connector/${ccpairinfo.id}`}
|
||||
className="flex items-center p-2 rounded-md hover:bg-gray-100 transition-colors"
|
||||
>
|
||||
<div className="mr-2">
|
||||
<SourceIcon
|
||||
iconSize={16}
|
||||
sourceType={ccpairinfo.connector.source}
|
||||
/>
|
||||
</div>
|
||||
<span className="text-sm text-text-darker font-medium">
|
||||
{ccpairinfo.name}
|
||||
</span>
|
||||
</Link>
|
||||
)
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
</TooltipContent>
|
||||
</Tooltip>
|
||||
</TooltipProvider>
|
||||
)}
|
||||
<Button onClick={() => {}} type="submit">
|
||||
{isUpdate ? "Update" : "Create"}
|
||||
</Button>
|
||||
<Button type="button" variant="outline" onClick={() => router.back()}>
|
||||
Cancel
|
||||
</Button>
|
||||
</div>
|
||||
</Form>
|
||||
);
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff.