Compare commits


1 Commit

Author SHA1 Message Date
pablonyx 25d9266da4 update 2025-02-26 08:48:35 -08:00
162 changed files with 1924 additions and 7014 deletions

View File

@@ -18,13 +18,12 @@ depends_on = None
def upgrade() -> None:
# Create a basic index on the lowercase message column for direct text matching
# Limit to 1500 characters to stay well under the 2704 byte limit of btree version 4
# op.execute(
# """
# CREATE INDEX idx_chat_message_message_lower
# ON chat_message (LOWER(substring(message, 1, 1500)))
# """
# )
pass
op.execute(
"""
CREATE INDEX idx_chat_message_message_lower
ON chat_message (LOWER(substring(message, 1, 1500)))
"""
)
def downgrade() -> None:
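Reviewer note: a btree expression index only serves queries that repeat the indexed expression verbatim. A hedged sketch of a matching query (db_session and the search text are illustrative, not part of this migration):

rows = db_session.execute(
    sa.text(
        "SELECT id FROM chat_message "
        "WHERE LOWER(substring(message, 1, 1500)) = LOWER(:needle)"
    ),
    {"needle": "exact message text"},
)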

View File

@@ -1,107 +0,0 @@
"""add user files
Revision ID: 9aadf32dfeb4
Revises: 8f43500ee275
Create Date: 2025-01-26 16:08:21.551022
"""
from alembic import op
import sqlalchemy as sa
import datetime
# revision identifiers, used by Alembic.
revision = "9aadf32dfeb4"
down_revision = "8f43500ee275"
branch_labels = None
depends_on = None
def upgrade() -> None:
# Create user_folder table without parent_id
op.create_table(
"user_folder",
sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True),
sa.Column("name", sa.String(length=255), nullable=True),
sa.Column("description", sa.String(length=255), nullable=True),
sa.Column("display_priority", sa.Integer(), nullable=True, default=0),
sa.Column(
"created_at", sa.DateTime(timezone=True), server_default=sa.func.now()
),
)
# Create user_file table with folder_id instead of parent_folder_id
op.create_table(
"user_file",
sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True),
sa.Column(
"folder_id",
sa.Integer(),
sa.ForeignKey("user_folder.id"),
nullable=True,
),
sa.Column("token_count", sa.Integer(), nullable=True),
sa.Column("file_type", sa.String(), nullable=True),
sa.Column("file_id", sa.String(length=255), nullable=False),
sa.Column("document_id", sa.String(length=255), nullable=False),
sa.Column("name", sa.String(length=255), nullable=False),
sa.Column(
"created_at",
sa.DateTime(),
default=datetime.datetime.utcnow,
),
sa.Column(
"cc_pair_id",
sa.Integer(),
sa.ForeignKey("connector_credential_pair.id"),
nullable=True,
unique=True,
),
)
# Create persona__user_file table
op.create_table(
"persona__user_file",
sa.Column(
"persona_id", sa.Integer(), sa.ForeignKey("persona.id"), primary_key=True
),
sa.Column(
"user_file_id",
sa.Integer(),
sa.ForeignKey("user_file.id"),
primary_key=True,
),
)
# Create persona__user_folder table
op.create_table(
"persona__user_folder",
sa.Column(
"persona_id", sa.Integer(), sa.ForeignKey("persona.id"), primary_key=True
),
sa.Column(
"user_folder_id",
sa.Integer(),
sa.ForeignKey("user_folder.id"),
primary_key=True,
),
)
op.add_column(
"connector_credential_pair",
sa.Column("is_user_file", sa.Boolean(), nullable=True),
)
def downgrade() -> None:
# Drop the persona__user_folder table
op.drop_table("persona__user_folder")
# Drop the persona__user_file table
op.drop_table("persona__user_file")
# Drop the user_file table
op.drop_table("user_file")
# Drop the user_folder table
op.drop_table("user_folder")
op.drop_column("connector_credential_pair", "is_user_file")

View File

@@ -5,9 +5,11 @@ from onyx.background.celery.apps.primary import celery_app
from onyx.background.task_utils import build_celery_task_wrapper
from onyx.configs.app_configs import JOB_TIMEOUT
from onyx.db.chat import delete_chat_sessions_older_than
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.engine import get_session_with_tenant
from onyx.server.settings.store import load_settings
from onyx.utils.logger import setup_logger
from shared_configs.configs import MULTI_TENANT
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
logger = setup_logger()
@@ -16,8 +18,10 @@ logger = setup_logger()
@build_celery_task_wrapper(name_chat_ttl_task)
@celery_app.task(soft_time_limit=JOB_TIMEOUT)
def perform_ttl_management_task(retention_limit_days: int, *, tenant_id: str) -> None:
with get_session_with_current_tenant() as db_session:
def perform_ttl_management_task(
retention_limit_days: int, *, tenant_id: str | None
) -> None:
with get_session_with_tenant(tenant_id=tenant_id) as db_session:
delete_chat_sessions_older_than(retention_limit_days, db_session)
@@ -31,19 +35,24 @@ def perform_ttl_management_task(retention_limit_days: int, *, tenant_id: str) ->
ignore_result=True,
soft_time_limit=JOB_TIMEOUT,
)
def check_ttl_management_task(*, tenant_id: str) -> None:
def check_ttl_management_task(*, tenant_id: str | None) -> None:
"""Runs periodically to check if any ttl tasks should be run and adds them
to the queue"""
token = None
if MULTI_TENANT and tenant_id is not None:
token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)
settings = load_settings()
retention_limit_days = settings.maximum_chat_retention_days
with get_session_with_current_tenant() as db_session:
with get_session_with_tenant(tenant_id=tenant_id) as db_session:
if should_perform_chat_ttl_check(retention_limit_days, db_session):
perform_ttl_management_task.apply_async(
kwargs=dict(
retention_limit_days=retention_limit_days, tenant_id=tenant_id
),
)
if token is not None:
CURRENT_TENANT_ID_CONTEXTVAR.reset(token)
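Reviewer note: the token set/reset above is the standard way to scope a ContextVar for the duration of a task. A minimal self-contained sketch of the pattern (the names here are illustrative, not the shared_configs ones):

from contextvars import ContextVar

CURRENT_TENANT_ID: ContextVar[str] = ContextVar("current_tenant_id", default="public")

def run_with_tenant(tenant_id: str | None) -> None:
    token = None
    if tenant_id is not None:
        token = CURRENT_TENANT_ID.set(tenant_id)
    try:
        ...  # work that reads CURRENT_TENANT_ID.get()
    finally:
        if token is not None:
            CURRENT_TENANT_ID.reset(token)  # restore the previous value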
@celery_app.task(
@@ -51,9 +60,9 @@ def check_ttl_management_task(*, tenant_id: str) -> None:
ignore_result=True,
soft_time_limit=JOB_TIMEOUT,
)
def autogenerate_usage_report_task(*, tenant_id: str) -> None:
def autogenerate_usage_report_task(*, tenant_id: str | None) -> None:
"""This generates usage report under the /admin/generate-usage/report endpoint"""
with get_session_with_current_tenant() as db_session:
with get_session_with_tenant(tenant_id=tenant_id) as db_session:
create_new_usage_report(
db_session=db_session,
user_id=None,

View File

@@ -18,7 +18,7 @@ logger = setup_logger()
def monitor_usergroup_taskset(
tenant_id: str, key_bytes: bytes, r: Redis, db_session: Session
tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session
) -> None:
"""This function is likely to move in the worker refactor happening next."""
fence_key = key_bytes.decode("utf-8")

View File

@@ -28,7 +28,7 @@ def get_tenant_id_for_email(email: str) -> str:
def user_owns_a_tenant(email: str) -> bool:
with get_session_with_tenant(tenant_id=POSTGRES_DEFAULT_SCHEMA) as db_session:
with get_session_with_tenant(tenant_id=None) as db_session:
result = (
db_session.query(UserTenantMapping)
.filter(UserTenantMapping.email == email)
@@ -38,7 +38,7 @@ def user_owns_a_tenant(email: str) -> bool:
def add_users_to_tenant(emails: list[str], tenant_id: str) -> None:
with get_session_with_tenant(tenant_id=POSTGRES_DEFAULT_SCHEMA) as db_session:
with get_session_with_tenant(tenant_id=None) as db_session:
try:
for email in emails:
db_session.add(UserTenantMapping(email=email, tenant_id=tenant_id))
@@ -48,7 +48,7 @@ def add_users_to_tenant(emails: list[str], tenant_id: str) -> None:
def remove_users_from_tenant(emails: list[str], tenant_id: str) -> None:
with get_session_with_tenant(tenant_id=POSTGRES_DEFAULT_SCHEMA) as db_session:
with get_session_with_tenant(tenant_id=None) as db_session:
try:
mappings_to_delete = (
db_session.query(UserTenantMapping)
@@ -71,7 +71,7 @@ def remove_users_from_tenant(emails: list[str], tenant_id: str) -> None:
def remove_all_users_from_tenant(tenant_id: str) -> None:
with get_session_with_tenant(tenant_id=POSTGRES_DEFAULT_SCHEMA) as db_session:
with get_session_with_tenant(tenant_id=None) as db_session:
db_session.query(UserTenantMapping).filter(
UserTenantMapping.tenant_id == tenant_id
).delete()
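Reviewer note: all four helpers above now pass tenant_id=None, which get_session_with_tenant resolves to the default (public) schema where UserTenantMapping rows live. A hedged usage sketch built from the code above (the email is illustrative):

with get_session_with_tenant(tenant_id=None) as db_session:
    mapping = (
        db_session.query(UserTenantMapping)
        .filter(UserTenantMapping.email == "user@example.com")
        .first()
    )
    print(mapping is not None)  # True if the email is mapped to any tenant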

View File

@@ -319,10 +319,8 @@ def dispatch_separated(
sep: str = DISPATCH_SEP_CHAR,
) -> list[BaseMessage_Content]:
num = 1
accumulated_tokens = ""
streamed_tokens: list[BaseMessage_Content] = []
for token in tokens:
accumulated_tokens += cast(str, token.content)
content = cast(str, token.content)
if sep in content:
sub_question_parts = content.split(sep)

View File

@@ -10,7 +10,6 @@ from pydantic import BaseModel
from onyx.auth.schemas import UserRole
from onyx.configs.app_configs import API_KEY_HASH_ROUNDS
from shared_configs.configs import MULTI_TENANT
_API_KEY_HEADER_NAME = "Authorization"
@@ -36,7 +35,8 @@ class ApiKeyDescriptor(BaseModel):
def generate_api_key(tenant_id: str | None = None) -> str:
if not MULTI_TENANT or not tenant_id:
# For backwards compatibility, if no tenant_id, generate old style key
if not tenant_id:
return _API_KEY_PREFIX + secrets.token_urlsafe(_API_KEY_LEN)
encoded_tenant = quote(tenant_id) # URL encode the tenant ID
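Reviewer note: for orientation, a sketch of the two key shapes this branch produces. The prefix, length, and separator below are assumptions for illustration; only the branching mirrors the diff:

import secrets
from urllib.parse import quote

_API_KEY_PREFIX = "dn_"  # hypothetical value; the real constant lives in this module
_API_KEY_LEN = 192       # hypothetical value

def sketch_generate_api_key(tenant_id: str | None = None) -> str:
    if not tenant_id:
        # legacy-style key: no tenant information embedded
        return _API_KEY_PREFIX + secrets.token_urlsafe(_API_KEY_LEN)
    # tenant-scoped key: URL-encoded tenant id embedded (separator is hypothetical)
    return _API_KEY_PREFIX + quote(tenant_id) + "." + secrets.token_urlsafe(_API_KEY_LEN)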

View File

@@ -2,8 +2,6 @@ import smtplib
from datetime import datetime
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formatdate
from email.utils import make_msgid
from onyx.configs.app_configs import EMAIL_CONFIGURED
from onyx.configs.app_configs import EMAIL_FROM
@@ -15,7 +13,6 @@ from onyx.configs.app_configs import WEB_DOMAIN
from onyx.configs.constants import AuthType
from onyx.configs.constants import TENANT_ID_COOKIE_NAME
from onyx.db.models import User
from shared_configs.configs import MULTI_TENANT
HTML_EMAIL_TEMPLATE = """\
<!DOCTYPE html>
@@ -153,9 +150,8 @@ def send_email(
msg = MIMEMultipart("alternative")
msg["Subject"] = subject
msg["To"] = user_email
msg["From"] = mail_from
msg["Date"] = formatdate(localtime=True)
msg["Message-ID"] = make_msgid(domain="onyx.app")
if mail_from:
msg["From"] = mail_from
part_text = MIMEText(text_body, "plain")
part_html = MIMEText(html_body, "html")
@@ -177,7 +173,7 @@ def send_subscription_cancellation_email(user_email: str) -> None:
subject = "Your Onyx Subscription Has Been Canceled"
heading = "Subscription Canceled"
message = (
"<p>We're sorry to see you go.</p>"
"<p>Were sorry to see you go.</p>"
"<p>Your subscription has been canceled and will end on your next billing date.</p>"
"<p>If you change your mind, you can always come back!</p>"
)
@@ -243,13 +239,13 @@ def send_user_email_invite(
def send_forgot_password_email(
user_email: str,
token: str,
tenant_id: str,
mail_from: str = EMAIL_FROM,
tenant_id: str | None = None,
) -> None:
# Builds a forgot password email with or without fancy HTML
subject = "Onyx Forgot Password"
link = f"{WEB_DOMAIN}/auth/reset-password?token={token}"
if MULTI_TENANT:
if tenant_id:
link += f"&{TENANT_ID_COOKIE_NAME}={tenant_id}"
message = f"<p>Click the following link to reset your password:</p><p>{link}</p>"
html_content = build_html_email("Reset Your Password", message)
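Reviewer note: the Date and Message-ID headers added in send_email above come straight from the standard library; a minimal sketch (the domain value is taken from the diff):

from email.mime.multipart import MIMEMultipart
from email.utils import formatdate, make_msgid

msg = MIMEMultipart("alternative")
msg["Date"] = formatdate(localtime=True)           # RFC 2822 date in local time
msg["Message-ID"] = make_msgid(domain="onyx.app")  # globally unique message id
print(msg["Date"], msg["Message-ID"])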

View File

@@ -214,7 +214,7 @@ def verify_email_is_invited(email: str) -> None:
raise PermissionError("User not on allowed user whitelist")
def verify_email_in_whitelist(email: str, tenant_id: str) -> None:
def verify_email_in_whitelist(email: str, tenant_id: str | None = None) -> None:
with get_session_with_tenant(tenant_id=tenant_id) as db_session:
if not get_user_by_email(email, db_session):
verify_email_is_invited(email)
@@ -553,7 +553,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
async_return_default_schema,
)(email=user.email)
send_forgot_password_email(user.email, tenant_id=tenant_id, token=token)
send_forgot_password_email(user.email, token, tenant_id=tenant_id)
async def on_after_request_verify(
self, user: User, token: str, request: Optional[Request] = None

View File

@@ -2,7 +2,6 @@ import logging
import multiprocessing
import time
from typing import Any
from typing import cast
import sentry_sdk
from celery import Task
@@ -132,9 +131,9 @@ def on_task_postrun(
# Get tenant_id directly from kwargs - each celery task has a tenant_id kwarg
if not kwargs:
logger.error(f"Task {task.name} (ID: {task_id}) is missing kwargs")
tenant_id = POSTGRES_DEFAULT_SCHEMA
tenant_id = None
else:
tenant_id = cast(str, kwargs.get("tenant_id", POSTGRES_DEFAULT_SCHEMA))
tenant_id = kwargs.get("tenant_id")
task_logger.debug(
f"Task {task.name} (ID: {task_id}) completed with state: {state} "

View File

@@ -34,7 +34,7 @@ def _get_deletion_status(
connector_id: int,
credential_id: int,
db_session: Session,
tenant_id: str,
tenant_id: str | None = None,
) -> TaskQueueState | None:
"""We no longer store TaskQueueState in the DB for a deletion attempt.
This function populates TaskQueueState by just checking redis.
@@ -67,7 +67,7 @@ def get_deletion_attempt_snapshot(
connector_id: int,
credential_id: int,
db_session: Session,
tenant_id: str,
tenant_id: str | None = None,
) -> DeletionAttemptSnapshot | None:
deletion_task = _get_deletion_status(
connector_id, credential_id, db_session, tenant_id

View File

@@ -109,7 +109,9 @@ def revoke_tasks_blocking_deletion(
trail=False,
bind=True,
)
def check_for_connector_deletion_task(self: Task, *, tenant_id: str) -> bool | None:
def check_for_connector_deletion_task(
self: Task, *, tenant_id: str | None
) -> bool | None:
r = get_redis_client()
r_replica = get_redis_replica_client()
r_celery: Redis = self.app.broker_connection().channel().client # type: ignore
@@ -222,7 +224,7 @@ def try_generate_document_cc_pair_cleanup_tasks(
cc_pair_id: int,
db_session: Session,
lock_beat: RedisLock,
tenant_id: str,
tenant_id: str | None,
) -> int | None:
"""Returns an int if syncing is needed. The int represents the number of sync tasks generated.
Note that syncing can still be required even if the number of sync tasks generated is zero.
@@ -343,7 +345,7 @@ def try_generate_document_cc_pair_cleanup_tasks(
def monitor_connector_deletion_taskset(
tenant_id: str, key_bytes: bytes, r: Redis
tenant_id: str | None, key_bytes: bytes, r: Redis
) -> None:
fence_key = key_bytes.decode("utf-8")
cc_pair_id_str = RedisConnector.get_id_from_fence_key(fence_key)
@@ -498,7 +500,7 @@ def monitor_connector_deletion_taskset(
def validate_connector_deletion_fences(
tenant_id: str,
tenant_id: str | None,
r: Redis,
r_replica: Redis,
r_celery: Redis,
@@ -538,7 +540,7 @@ def validate_connector_deletion_fences(
def validate_connector_deletion_fence(
tenant_id: str,
tenant_id: str | None,
key_bytes: bytes,
queued_tasks: set[str],
r: Redis,

View File

@@ -221,7 +221,7 @@ def try_creating_permissions_sync_task(
app: Celery,
cc_pair_id: int,
r: Redis,
tenant_id: str,
tenant_id: str | None,
) -> str | None:
"""Returns a randomized payload id on success.
Returns None if no syncing is required."""
@@ -320,7 +320,7 @@ def try_creating_permissions_sync_task(
def connector_permission_sync_generator_task(
self: Task,
cc_pair_id: int,
tenant_id: str,
tenant_id: str | None,
) -> None:
"""
Permission sync task that handles document permission syncing for a given connector credential pair
@@ -410,6 +410,7 @@ def connector_permission_sync_generator_task(
cc_pair.connector.id,
cc_pair.credential.id,
db_session,
tenant_id,
enforce_creation=False,
)
if not created:
@@ -509,7 +510,7 @@ def connector_permission_sync_generator_task(
)
def update_external_document_permissions_task(
self: Task,
tenant_id: str,
tenant_id: str | None,
serialized_doc_external_access: dict,
source_string: str,
connector_id: int,
@@ -584,7 +585,7 @@ def update_external_document_permissions_task(
def validate_permission_sync_fences(
tenant_id: str,
tenant_id: str | None,
r: Redis,
r_replica: Redis,
r_celery: Redis,
@@ -631,7 +632,7 @@ def validate_permission_sync_fences(
def validate_permission_sync_fence(
tenant_id: str,
tenant_id: str | None,
key_bytes: bytes,
queued_tasks: set[str],
reserved_tasks: set[str],
@@ -841,7 +842,7 @@ class PermissionSyncCallback(IndexingHeartbeatInterface):
def monitor_ccpair_permissions_taskset(
tenant_id: str, key_bytes: bytes, r: Redis, db_session: Session
tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session
) -> None:
fence_key = key_bytes.decode("utf-8")
cc_pair_id_str = RedisConnector.get_id_from_fence_key(fence_key)

View File

@@ -123,7 +123,7 @@ def _is_external_group_sync_due(cc_pair: ConnectorCredentialPair) -> bool:
soft_time_limit=JOB_TIMEOUT,
bind=True,
)
def check_for_external_group_sync(self: Task, *, tenant_id: str) -> bool | None:
def check_for_external_group_sync(self: Task, *, tenant_id: str | None) -> bool | None:
# we need to use celery's redis client to access its redis data
# (which lives on a different db number)
r = get_redis_client()
@@ -220,7 +220,7 @@ def try_creating_external_group_sync_task(
app: Celery,
cc_pair_id: int,
r: Redis,
tenant_id: str,
tenant_id: str | None,
) -> str | None:
"""Returns an int if syncing is needed. The int represents the number of sync tasks generated.
Returns None if no syncing is required."""
@@ -306,7 +306,7 @@ def try_creating_external_group_sync_task(
def connector_external_group_sync_generator_task(
self: Task,
cc_pair_id: int,
tenant_id: str,
tenant_id: str | None,
) -> None:
"""
External group sync task for a given connector credential pair
@@ -392,6 +392,7 @@ def connector_external_group_sync_generator_task(
cc_pair.connector.id,
cc_pair.credential.id,
db_session,
tenant_id,
enforce_creation=False,
)
if not created:
@@ -493,7 +494,7 @@ def connector_external_group_sync_generator_task(
def validate_external_group_sync_fences(
tenant_id: str,
tenant_id: str | None,
celery_app: Celery,
r: Redis,
r_replica: Redis,
@@ -525,7 +526,7 @@ def validate_external_group_sync_fences(
def validate_external_group_sync_fence(
tenant_id: str,
tenant_id: str | None,
key_bytes: bytes,
reserved_tasks: set[str],
r_celery: Redis,

View File

@@ -182,7 +182,7 @@ class SimpleJobResult:
class ConnectorIndexingContext(BaseModel):
tenant_id: str
tenant_id: str | None
cc_pair_id: int
search_settings_id: int
index_attempt_id: int
@@ -210,7 +210,7 @@ class ConnectorIndexingLogBuilder:
def monitor_ccpair_indexing_taskset(
tenant_id: str, key_bytes: bytes, r: Redis, db_session: Session
tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session
) -> None:
# if the fence doesn't exist, there's nothing to do
fence_key = key_bytes.decode("utf-8")
@@ -358,7 +358,7 @@ def monitor_ccpair_indexing_taskset(
soft_time_limit=300,
bind=True,
)
def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
"""a lightweight task used to kick off indexing tasks.
Occasionally does some validation of existing state to clear up error conditions"""
@@ -598,7 +598,7 @@ def connector_indexing_task(
cc_pair_id: int,
search_settings_id: int,
is_ee: bool,
tenant_id: str,
tenant_id: str | None,
) -> int | None:
"""Indexing task. For a cc pair, this task pulls all document IDs from the source
and compares those IDs to locally stored documents and deletes all locally stored IDs missing
@@ -890,7 +890,7 @@ def connector_indexing_proxy_task(
index_attempt_id: int,
cc_pair_id: int,
search_settings_id: int,
tenant_id: str,
tenant_id: str | None,
) -> None:
"""celery out of process task execution strategy is pool=prefork, but it uses fork,
and forking is inherently unstable.
@@ -1170,7 +1170,7 @@ def connector_indexing_proxy_task(
name=OnyxCeleryTask.CHECK_FOR_CHECKPOINT_CLEANUP,
soft_time_limit=300,
)
def check_for_checkpoint_cleanup(*, tenant_id: str) -> None:
def check_for_checkpoint_cleanup(*, tenant_id: str | None) -> None:
"""Clean up old checkpoints that are older than 7 days."""
locked = False
redis_client = get_redis_client(tenant_id=tenant_id)

View File

@@ -187,7 +187,7 @@ class IndexingCallback(IndexingCallbackBase):
def validate_indexing_fence(
tenant_id: str,
tenant_id: str | None,
key_bytes: bytes,
reserved_tasks: set[str],
r_celery: Redis,
@@ -311,7 +311,7 @@ def validate_indexing_fence(
def validate_indexing_fences(
tenant_id: str,
tenant_id: str | None,
r_replica: Redis,
r_celery: Redis,
lock_beat: RedisLock,
@@ -442,7 +442,7 @@ def try_creating_indexing_task(
reindex: bool,
db_session: Session,
r: Redis,
tenant_id: str,
tenant_id: str | None,
) -> int | None:
"""Checks for any conditions that should block the indexing task from being
created, then creates the task.

View File

@@ -59,7 +59,7 @@ def _process_model_list_response(model_list_json: Any) -> list[str]:
trail=False,
bind=True,
)
def check_for_llm_model_update(self: Task, *, tenant_id: str) -> bool | None:
def check_for_llm_model_update(self: Task, *, tenant_id: str | None) -> bool | None:
if not LLM_MODEL_UPDATE_API_URL:
raise ValueError("LLM model update API URL not configured")

View File

@@ -91,7 +91,7 @@ class Metric(BaseModel):
}
task_logger.info(json.dumps(data))
def emit(self, tenant_id: str) -> None:
def emit(self, tenant_id: str | None) -> None:
# Convert value to appropriate type based on the input value
bool_value = None
float_value = None
@@ -656,7 +656,7 @@ def build_job_id(
queue=OnyxCeleryQueues.MONITORING,
bind=True,
)
def monitor_background_processes(self: Task, *, tenant_id: str) -> None:
def monitor_background_processes(self: Task, *, tenant_id: str | None) -> None:
"""Collect and emit metrics about background processes.
This task runs periodically to gather metrics about:
- Queue lengths for different Celery queues
@@ -864,7 +864,7 @@ def cloud_monitor_celery_queues(
@shared_task(name=OnyxCeleryTask.MONITOR_CELERY_QUEUES, ignore_result=True, bind=True)
def monitor_celery_queues(self: Task, *, tenant_id: str) -> None:
def monitor_celery_queues(self: Task, *, tenant_id: str | None) -> None:
return monitor_celery_queues_helper(self)

View File

@@ -24,7 +24,7 @@ from onyx.db.engine import get_session_with_current_tenant
bind=True,
base=AbortableTask,
)
def kombu_message_cleanup_task(self: Any, tenant_id: str) -> int:
def kombu_message_cleanup_task(self: Any, tenant_id: str | None) -> int:
"""Runs periodically to clean up the kombu_message table"""
# we will select messages older than this amount to clean up

View File

@@ -114,7 +114,7 @@ def _is_pruning_due(cc_pair: ConnectorCredentialPair) -> bool:
soft_time_limit=JOB_TIMEOUT,
bind=True,
)
def check_for_pruning(self: Task, *, tenant_id: str) -> bool | None:
def check_for_pruning(self: Task, *, tenant_id: str | None) -> bool | None:
r = get_redis_client()
r_replica = get_redis_replica_client()
r_celery: Redis = self.app.broker_connection().channel().client # type: ignore
@@ -211,7 +211,7 @@ def try_creating_prune_generator_task(
cc_pair: ConnectorCredentialPair,
db_session: Session,
r: Redis,
tenant_id: str,
tenant_id: str | None,
) -> str | None:
"""Checks for any conditions that should block the pruning generator task from being
created, then creates the task.
@@ -333,7 +333,7 @@ def connector_pruning_generator_task(
cc_pair_id: int,
connector_id: int,
credential_id: int,
tenant_id: str,
tenant_id: str | None,
) -> None:
"""connector pruning task. For a cc pair, this task pulls all document IDs from the source
and compares those IDs to locally stored documents and deletes all locally stored IDs missing
@@ -521,7 +521,7 @@ def connector_pruning_generator_task(
def monitor_ccpair_pruning_taskset(
tenant_id: str, key_bytes: bytes, r: Redis, db_session: Session
tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session
) -> None:
fence_key = key_bytes.decode("utf-8")
cc_pair_id_str = RedisConnector.get_id_from_fence_key(fence_key)
@@ -567,7 +567,7 @@ def monitor_ccpair_pruning_taskset(
def validate_pruning_fences(
tenant_id: str,
tenant_id: str | None,
r: Redis,
r_replica: Redis,
r_celery: Redis,
@@ -615,7 +615,7 @@ def validate_pruning_fences(
def validate_pruning_fence(
tenant_id: str,
tenant_id: str | None,
key_bytes: bytes,
reserved_tasks: set[str],
queued_tasks: set[str],

View File

@@ -32,7 +32,7 @@ class RetryDocumentIndex:
self,
doc_id: str,
*,
tenant_id: str,
tenant_id: str | None,
chunk_count: int | None,
) -> int:
return self.index.delete_single(
@@ -50,7 +50,7 @@ class RetryDocumentIndex:
self,
doc_id: str,
*,
tenant_id: str,
tenant_id: str | None,
chunk_count: int | None,
fields: VespaDocumentFields,
) -> int:

View File

@@ -76,7 +76,7 @@ def document_by_cc_pair_cleanup_task(
document_id: str,
connector_id: int,
credential_id: int,
tenant_id: str,
tenant_id: str | None,
) -> bool:
"""A lightweight subtask used to clean up document to cc pair relationships.
Created by connector deletion and connector pruning parent tasks."""
@@ -297,7 +297,7 @@ def cloud_beat_task_generator(
return None
last_lock_time = time.monotonic()
tenant_ids: list[str] = []
tenant_ids: list[str] | list[None] = []
try:
tenant_ids = get_all_tenant_ids()

View File

@@ -76,7 +76,7 @@ logger = setup_logger()
trail=False,
bind=True,
)
def check_for_vespa_sync_task(self: Task, *, tenant_id: str) -> bool | None:
def check_for_vespa_sync_task(self: Task, *, tenant_id: str | None) -> bool | None:
"""Runs periodically to check if any document needs syncing.
Generates sets of tasks for Celery if syncing is needed."""
@@ -208,7 +208,7 @@ def try_generate_stale_document_sync_tasks(
db_session: Session,
r: Redis,
lock_beat: RedisLock,
tenant_id: str,
tenant_id: str | None,
) -> int | None:
# the fence is up, do nothing
@@ -284,7 +284,7 @@ def try_generate_document_set_sync_tasks(
db_session: Session,
r: Redis,
lock_beat: RedisLock,
tenant_id: str,
tenant_id: str | None,
) -> int | None:
lock_beat.reacquire()
@@ -361,7 +361,7 @@ def try_generate_user_group_sync_tasks(
db_session: Session,
r: Redis,
lock_beat: RedisLock,
tenant_id: str,
tenant_id: str | None,
) -> int | None:
lock_beat.reacquire()
@@ -448,7 +448,7 @@ def monitor_connector_taskset(r: Redis) -> None:
def monitor_document_set_taskset(
tenant_id: str, key_bytes: bytes, r: Redis, db_session: Session
tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session
) -> None:
fence_key = key_bytes.decode("utf-8")
document_set_id_str = RedisDocumentSet.get_id_from_fence_key(fence_key)
@@ -523,7 +523,9 @@ def monitor_document_set_taskset(
time_limit=LIGHT_TIME_LIMIT,
max_retries=3,
)
def vespa_metadata_sync_task(self: Task, document_id: str, *, tenant_id: str) -> bool:
def vespa_metadata_sync_task(
self: Task, document_id: str, *, tenant_id: str | None
) -> bool:
start = time.monotonic()
completion_status = OnyxCeleryTaskCompletionStatus.UNDEFINED

View File

@@ -55,7 +55,6 @@ from onyx.utils.logger import setup_logger
from onyx.utils.logger import TaskAttemptSingleton
from onyx.utils.telemetry import create_milestone_and_report
from onyx.utils.variable_functionality import global_version
from shared_configs.configs import MULTI_TENANT
logger = setup_logger()
@@ -68,6 +67,7 @@ def _get_connector_runner(
batch_size: int,
start_time: datetime,
end_time: datetime,
tenant_id: str | None,
leave_connector_active: bool = LEAVE_CONNECTOR_ACTIVE_ON_INITIALIZATION_FAILURE,
) -> ConnectorRunner:
"""
@@ -86,6 +86,7 @@ def _get_connector_runner(
input_type=task,
connector_specific_config=attempt.connector_credential_pair.connector.connector_specific_config,
credential=attempt.connector_credential_pair.credential,
tenant_id=tenant_id,
)
# validate the connector settings
@@ -240,7 +241,7 @@ def _check_failure_threshold(
def _run_indexing(
db_session: Session,
index_attempt_id: int,
tenant_id: str,
tenant_id: str | None,
callback: IndexingHeartbeatInterface | None = None,
) -> None:
"""
@@ -387,6 +388,7 @@ def _run_indexing(
batch_size=INDEX_BATCH_SIZE,
start_time=window_start,
end_time=window_end,
tenant_id=tenant_id,
)
# don't use a checkpoint if we're explicitly indexing from
@@ -679,7 +681,7 @@ def _run_indexing(
def run_indexing_entrypoint(
index_attempt_id: int,
tenant_id: str,
tenant_id: str | None,
connector_credential_pair_id: int,
is_ee: bool = False,
callback: IndexingHeartbeatInterface | None = None,
@@ -699,7 +701,7 @@ def run_indexing_entrypoint(
attempt = transition_attempt_to_in_progress(index_attempt_id, db_session)
tenant_str = ""
if MULTI_TENANT:
if tenant_id is not None:
tenant_str = f" for tenant {tenant_id}"
connector_name = attempt.connector_credential_pair.connector.name

View File

@@ -86,7 +86,6 @@ from onyx.document_index.factory import get_default_document_index
from onyx.file_store.models import ChatFileType
from onyx.file_store.models import FileDescriptor
from onyx.file_store.utils import load_all_chat_files
from onyx.file_store.utils import load_all_user_files
from onyx.file_store.utils import save_files
from onyx.llm.exceptions import GenAIDisabledException
from onyx.llm.factory import get_llms_for_persona
@@ -263,11 +262,8 @@ def _get_force_search_settings(
search_tool_available = any(isinstance(tool, SearchTool) for tool in tools)
if not internet_search_available and not search_tool_available:
if new_msg_req.force_user_file_search:
return ForceUseTool(force_use=True, tool_name=SearchTool._NAME)
else:
# Does not matter much which tool is set here as force is false and neither tool is available
return ForceUseTool(force_use=False, tool_name=SearchTool._NAME)
# Does not matter much which tool is set here as force is false and neither tool is available
return ForceUseTool(force_use=False, tool_name=SearchTool._NAME)
tool_name = SearchTool._NAME if search_tool_available else InternetSearchTool._NAME
# Currently, the internet search tool does not support query override
@@ -283,7 +279,6 @@ def _get_force_search_settings(
should_force_search = any(
[
new_msg_req.force_user_file_search,
new_msg_req.retrieval_options
and new_msg_req.retrieval_options.run_search
== OptionalSearchSetting.ALWAYS,
@@ -543,15 +538,6 @@ def stream_chat_message_objects(
req_file_ids = [f["id"] for f in new_msg_req.file_descriptors]
latest_query_files = [file for file in files if file.file_id in req_file_ids]
if not new_msg_req.force_user_file_search:
user_files = load_all_user_files(
new_msg_req.user_file_ids,
new_msg_req.user_folder_ids,
db_session,
)
latest_query_files += user_files
if user_message:
attach_files_to_chat_message(
chat_message=user_message,
@@ -695,7 +681,6 @@ def stream_chat_message_objects(
user=user,
llm=llm,
fast_llm=fast_llm,
use_file_search=new_msg_req.force_user_file_search,
search_tool_config=SearchToolConfig(
answer_style_config=answer_style_config,
document_pruning_config=document_pruning_config,

View File

@@ -3,7 +3,7 @@ import os
INPUT_PROMPT_YAML = "./onyx/seeding/input_prompts.yaml"
PROMPTS_YAML = "./onyx/seeding/prompts.yaml"
PERSONAS_YAML = "./onyx/seeding/personas.yaml"
USER_FOLDERS_YAML = "./onyx/seeding/user_folders.yaml"
NUM_RETURNED_HITS = 50
# Used for LLM filtering and reranking
# We want this to be approximately the number of results we want to show on the first page

View File

@@ -5,6 +5,7 @@ from sqlalchemy.orm import Session
from onyx.configs.app_configs import INTEGRATION_TESTS_MODE
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import DocumentSourceRequiringTenantContext
from onyx.connectors.airtable.airtable_connector import AirtableConnector
from onyx.connectors.asana.connector import AsanaConnector
from onyx.connectors.axero.connector import AxeroConnector
@@ -163,9 +164,13 @@ def instantiate_connector(
input_type: InputType,
connector_specific_config: dict[str, Any],
credential: Credential,
tenant_id: str | None = None,
) -> BaseConnector:
connector_class = identify_connector_class(source, input_type)
if source in DocumentSourceRequiringTenantContext:
connector_specific_config["tenant_id"] = tenant_id
connector = connector_class(**connector_specific_config)
new_credentials = connector.load_credentials(credential.credential_json)
@@ -179,6 +184,7 @@ def validate_ccpair_for_user(
connector_id: int,
credential_id: int,
db_session: Session,
tenant_id: str | None,
enforce_creation: bool = True,
) -> bool:
if INTEGRATION_TESTS_MODE:
@@ -210,6 +216,7 @@ def validate_ccpair_for_user(
input_type=connector.input_type,
connector_specific_config=connector.connector_specific_config,
credential=credential,
tenant_id=tenant_id,
)
except ConnectorValidationError as e:
raise e
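Reviewer note: a hedged call-site sketch for the updated validate_ccpair_for_user signature; the argument values are illustrative, and the keyword order follows the definition above:

validate_ccpair_for_user(
    connector_id=connector.id,
    credential_id=credential.id,
    db_session=db_session,
    tenant_id=tenant_id,  # newly threaded through to instantiate_connector
    enforce_creation=False,
)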

View File

@@ -16,7 +16,7 @@ from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.engine import get_session_with_tenant
from onyx.file_processing.extract_file_text import detect_encoding
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.file_processing.extract_file_text import get_file_ext
@@ -27,6 +27,8 @@ from onyx.file_processing.extract_file_text import read_pdf_file
from onyx.file_processing.extract_file_text import read_text_file
from onyx.file_store.file_store import get_default_file_store
from onyx.utils.logger import setup_logger
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
logger = setup_logger()
@@ -163,10 +165,12 @@ class LocalFileConnector(LoadConnector):
def __init__(
self,
file_locations: list[Path | str],
tenant_id: str = POSTGRES_DEFAULT_SCHEMA,
batch_size: int = INDEX_BATCH_SIZE,
) -> None:
self.file_locations = [Path(file_location) for file_location in file_locations]
self.batch_size = batch_size
self.tenant_id = tenant_id
self.pdf_pass: str | None = None
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
@@ -175,8 +179,9 @@ class LocalFileConnector(LoadConnector):
def load_from_state(self) -> GenerateDocumentsOutput:
documents: list[Document] = []
token = CURRENT_TENANT_ID_CONTEXTVAR.set(self.tenant_id)
with get_session_with_current_tenant() as db_session:
with get_session_with_tenant(tenant_id=self.tenant_id) as db_session:
for file_path in self.file_locations:
current_datetime = datetime.now(timezone.utc)
files = _read_files_and_metadata(
@@ -198,6 +203,8 @@ class LocalFileConnector(LoadConnector):
if documents:
yield documents
CURRENT_TENANT_ID_CONTEXTVAR.reset(token)
if __name__ == "__main__":
connector = LocalFileConnector(file_locations=[os.environ["TEST_FILE"]])

View File

@@ -1,9 +1,7 @@
import io
from datetime import datetime
from datetime import timezone
from tempfile import NamedTemporaryFile
import openpyxl # type: ignore
from googleapiclient.discovery import build # type: ignore
from googleapiclient.errors import HttpError # type: ignore
@@ -45,15 +43,12 @@ def _extract_sections_basic(
) -> list[Section]:
mime_type = file["mimeType"]
link = file["webViewLink"]
supported_file_types = set(item.value for item in GDriveMimeType)
if mime_type not in supported_file_types:
if mime_type not in set(item.value for item in GDriveMimeType):
# Unsupported file types can still have a title; finding it this way is still useful
return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
try:
# ---------------------------
# Google Sheets extraction
if mime_type == GDriveMimeType.SPREADSHEET.value:
try:
sheets_service = build(
@@ -114,53 +109,7 @@ def _extract_sections_basic(
f"Ran into exception '{e}' when pulling data from Google Sheet '{file['name']}'."
" Falling back to basic extraction."
)
# ---------------------------
# Microsoft Excel (.xlsx or .xls) extraction branch
elif mime_type in [
GDriveMimeType.SPREADSHEET_OPEN_FORMAT.value,
GDriveMimeType.SPREADSHEET_MS_EXCEL.value,
]:
try:
response = service.files().get_media(fileId=file["id"]).execute()
with NamedTemporaryFile(suffix=".xlsx", delete=True) as tmp:
tmp.write(response)
tmp_path = tmp.name
section_separator = "\n\n"
workbook = openpyxl.load_workbook(tmp_path, read_only=True)
# Work similarly to the xlsx_to_text function used for file connector
# but returns Sections instead of a string
sections = [
Section(
link=link,
text=(
f"Sheet: {sheet.title}\n\n"
+ section_separator.join(
",".join(map(str, row))
for row in sheet.iter_rows(
min_row=1, values_only=True
)
if row
)
),
)
for sheet in workbook.worksheets
]
return sections
except Exception as e:
logger.warning(
f"Error extracting data from Excel file '{file['name']}': {e}"
)
return [
Section(link=link, text="Error extracting data from Excel file")
]
# ---------------------------
# Export for Google Docs, PPT, and fallback for spreadsheets
if mime_type in [
GDriveMimeType.DOC.value,
GDriveMimeType.PPT.value,
@@ -179,8 +128,6 @@ def _extract_sections_basic(
)
return [Section(link=link, text=text)]
# ---------------------------
# Plain text and Markdown files
elif mime_type in [
GDriveMimeType.PLAIN_TEXT.value,
GDriveMimeType.MARKDOWN.value,
@@ -194,8 +141,6 @@ def _extract_sections_basic(
.decode("utf-8"),
)
]
# ---------------------------
# Word, PowerPoint, PDF files
if mime_type in [
GDriveMimeType.WORD_DOC.value,
GDriveMimeType.POWERPOINT.value,
@@ -225,11 +170,7 @@ def _extract_sections_basic(
Section(link=link, text=pptx_to_text(file=io.BytesIO(response)))
]
# Catch-all case, should not happen since there should be specific handling
# for each of the supported file types
error_message = f"Unsupported file type: {mime_type}"
logger.error(error_message)
raise ValueError(error_message)
return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
except Exception:
return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]

View File

@@ -5,10 +5,6 @@ from typing import Any
class GDriveMimeType(str, Enum):
DOC = "application/vnd.google-apps.document"
SPREADSHEET = "application/vnd.google-apps.spreadsheet"
SPREADSHEET_OPEN_FORMAT = (
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
SPREADSHEET_MS_EXCEL = "application/vnd.ms-excel"
PDF = "application/pdf"
WORD_DOC = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
PPT = "application/vnd.google-apps.presentation"

View File

@@ -98,7 +98,6 @@ class BaseFilters(BaseModel):
document_set: list[str] | None = None
time_cutoff: datetime | None = None
tags: list[Tag] | None = None
user_file_ids: list[int] | None = None
class IndexFilters(BaseFilters):

View File

@@ -160,16 +160,7 @@ def retrieval_preprocessing(
user_acl_filters = (
None if bypass_acl else build_access_filters_for_user(user, db_session)
)
user_file_ids = preset_filters.user_file_ids
if persona and persona.user_files:
user_file_ids = user_file_ids + [
file.id
for file in persona.user_files
if file.id not in preset_filters.user_file_ids
]
final_filters = IndexFilters(
user_file_ids=user_file_ids,
source_type=preset_filters.source_type or predicted_source_filters,
document_set=preset_filters.document_set,
time_cutoff=time_filter or predicted_time_cutoff,

View File

@@ -16,6 +16,7 @@ from onyx.configs.constants import UNNAMED_KEY_PLACEHOLDER
from onyx.db.models import ApiKey
from onyx.db.models import User
from onyx.server.api_key.models import APIKeyArgs
from shared_configs.configs import MULTI_TENANT
from shared_configs.contextvars import get_current_tenant_id
@@ -72,7 +73,7 @@ def insert_api_key(
# Get tenant_id from context var (will be default schema for single tenant)
tenant_id = get_current_tenant_id()
api_key = generate_api_key(tenant_id)
api_key = generate_api_key(tenant_id if MULTI_TENANT else None)
api_key_user_id = uuid.uuid4()
display_name = api_key_args.name or UNNAMED_KEY_PLACEHOLDER

View File

@@ -104,7 +104,6 @@ def get_connector_credential_pairs_for_user(
get_editable: bool = True,
ids: list[int] | None = None,
eager_load_connector: bool = False,
include_user_files: bool = False,
eager_load_credential: bool = False,
eager_load_user: bool = False,
) -> list[ConnectorCredentialPair]:
@@ -127,9 +126,6 @@ def get_connector_credential_pairs_for_user(
if ids:
stmt = stmt.where(ConnectorCredentialPair.id.in_(ids))
if not include_user_files:
stmt = stmt.where(ConnectorCredentialPair.is_user_file != True) # noqa: E712
return list(db_session.scalars(stmt).unique().all())
@@ -157,16 +153,14 @@ def get_connector_credential_pairs_for_user_parallel(
def get_connector_credential_pairs(
db_session: Session, ids: list[int] | None = None, include_user_files: bool = False
db_session: Session,
ids: list[int] | None = None,
) -> list[ConnectorCredentialPair]:
stmt = select(ConnectorCredentialPair).distinct()
if ids:
stmt = stmt.where(ConnectorCredentialPair.id.in_(ids))
if not include_user_files:
stmt = stmt.where(ConnectorCredentialPair.is_user_file != True) # noqa: E712
return list(db_session.scalars(stmt).all())
@@ -452,7 +446,6 @@ def add_credential_to_connector(
initial_status: ConnectorCredentialPairStatus = ConnectorCredentialPairStatus.ACTIVE,
last_successful_index_time: datetime | None = None,
seeding_flow: bool = False,
is_user_file: bool = False,
) -> StatusResponse:
connector = fetch_connector_by_id(connector_id, db_session)
@@ -518,7 +511,6 @@ def add_credential_to_connector(
access_type=access_type,
auto_sync_options=auto_sync_options,
last_successful_index_time=last_successful_index_time,
is_user_file=is_user_file,
)
db_session.add(association)
db_session.flush() # make sure the association has an id

View File

@@ -274,7 +274,7 @@ def get_document_counts_for_cc_pairs_parallel(
def get_access_info_for_document(
db_session: Session,
document_id: str,
) -> tuple[str, list[str | None], bool, list[int], list[int]] | None:
) -> tuple[str, list[str | None], bool] | None:
"""Gets access info for a single document by calling the get_access_info_for_documents function
and passing a list with a single document ID.
Args:
@@ -294,7 +294,7 @@ def get_access_info_for_document(
def get_access_info_for_documents(
db_session: Session,
document_ids: list[str],
) -> Sequence[tuple[str, list[str | None], bool, list[int], list[int]]]:
) -> Sequence[tuple[str, list[str | None], bool]]:
"""Gets back all relevant access info for the given documents. This includes
the user_ids for cc pairs that the document is associated with + whether any
of the associated cc pairs are intending to make the document globally public.

View File

@@ -605,6 +605,7 @@ def fetch_document_sets_for_document(
result = fetch_document_sets_for_documents([document_id], db_session)
if not result:
return []
return result[0][1]

View File

@@ -258,11 +258,11 @@ class SqlEngine:
cls._engine = None
def get_all_tenant_ids() -> list[str]:
def get_all_tenant_ids() -> list[str] | list[None]:
"""Returning [None] means the only tenant is the 'public' or self hosted tenant."""
if not MULTI_TENANT:
return [POSTGRES_DEFAULT_SCHEMA]
return [None]
with get_session_with_shared_schema() as session:
result = session.execute(
@@ -417,7 +417,7 @@ def get_session_with_shared_schema() -> Generator[Session, None, None]:
@contextmanager
def get_session_with_tenant(*, tenant_id: str) -> Generator[Session, None, None]:
def get_session_with_tenant(*, tenant_id: str | None) -> Generator[Session, None, None]:
"""
Generate a database session for a specific tenant.
"""

View File

@@ -205,11 +205,6 @@ class User(SQLAlchemyBaseUserTableUUID, Base):
primaryjoin="User.id == foreign(ConnectorCredentialPair.creator_id)",
)
folders: Mapped[list["UserFolder"]] = relationship(
"UserFolder", back_populates="user"
)
files: Mapped[list["UserFile"]] = relationship("UserFile", back_populates="user")
@property
def password_configured(self) -> bool:
"""
@@ -412,7 +407,6 @@ class ConnectorCredentialPair(Base):
"""
__tablename__ = "connector_credential_pair"
is_user_file: Mapped[bool] = mapped_column(Boolean, default=False)
# NOTE: this `id` column has to use `Sequence` instead of `autoincrement=True`
# due to some SQLAlchemy quirks + this not being a primary key column
id: Mapped[int] = mapped_column(
@@ -499,10 +493,6 @@ class ConnectorCredentialPair(Base):
primaryjoin="foreign(ConnectorCredentialPair.creator_id) == remote(User.id)",
)
user_file: Mapped["UserFile"] = relationship(
"UserFile", back_populates="cc_pair", uselist=False
)
background_errors: Mapped[list["BackgroundError"]] = relationship(
"BackgroundError", back_populates="cc_pair", cascade="all, delete-orphan"
)
@@ -1723,17 +1713,6 @@ class Persona(Base):
secondary="persona__user_group",
viewonly=True,
)
# Relationship to UserFile
user_files: Mapped[list["UserFile"]] = relationship(
"UserFile",
secondary="persona__user_file",
back_populates="assistants",
)
user_folders: Mapped[list["UserFolder"]] = relationship(
"UserFolder",
secondary="persona__user_folder",
back_populates="assistants",
)
labels: Mapped[list["PersonaLabel"]] = relationship(
"PersonaLabel",
secondary=Persona__PersonaLabel.__table__,
@@ -1750,24 +1729,6 @@ class Persona(Base):
)
class Persona__UserFolder(Base):
__tablename__ = "persona__user_folder"
persona_id: Mapped[int] = mapped_column(ForeignKey("persona.id"), primary_key=True)
user_folder_id: Mapped[int] = mapped_column(
ForeignKey("user_folder.id"), primary_key=True
)
class Persona__UserFile(Base):
__tablename__ = "persona__user_file"
persona_id: Mapped[int] = mapped_column(ForeignKey("persona.id"), primary_key=True)
user_file_id: Mapped[int] = mapped_column(
ForeignKey("user_file.id"), primary_key=True
)
class PersonaLabel(Base):
__tablename__ = "persona_label"
@@ -2289,68 +2250,6 @@ class InputPrompt__User(Base):
disabled: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
class UserFolder(Base):
__tablename__ = "user_folder"
id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
user_id: Mapped[int] = mapped_column(ForeignKey("user.id"), nullable=False)
name: Mapped[str] = mapped_column(nullable=False)
description: Mapped[str] = mapped_column(nullable=False)
created_at: Mapped[datetime.datetime] = mapped_column(
DateTime(timezone=True), server_default=func.now()
)
# Mapped[datetime.datetime] = mapped_column(
# DateTime(timezone=True), server_default=func.now()
# )
user: Mapped["User"] = relationship(back_populates="folders")
files: Mapped[list["UserFile"]] = relationship(back_populates="folder")
assistants: Mapped[list["Persona"]] = relationship(
"Persona",
secondary=Persona__UserFolder.__table__,
back_populates="user_folders",
)
class UserDocument(str, Enum):
CHAT = "chat"
RECENT = "recent"
FILE = "file"
class UserFile(Base):
__tablename__ = "user_file"
id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
user_id: Mapped[int | None] = mapped_column(ForeignKey("user.id"), nullable=False)
assistants: Mapped[list["Persona"]] = relationship(
"Persona",
secondary=Persona__UserFile.__table__,
back_populates="user_files",
)
folder_id: Mapped[int | None] = mapped_column(
ForeignKey("user_folder.id"), nullable=True
)
file_id: Mapped[str] = mapped_column(nullable=False)
document_id: Mapped[str] = mapped_column(nullable=False)
name: Mapped[str] = mapped_column(nullable=False)
created_at: Mapped[datetime.datetime] = mapped_column(
default=datetime.datetime.utcnow
)
user: Mapped["User"] = relationship(back_populates="files")
folder: Mapped["UserFolder"] = relationship(back_populates="files")
token_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
cc_pair_id: Mapped[int | None] = mapped_column(
ForeignKey("connector_credential_pair.id"), nullable=True, unique=True
)
cc_pair: Mapped["ConnectorCredentialPair"] = relationship(
"ConnectorCredentialPair", back_populates="user_file"
)
"""
Multi-tenancy related tables
"""

View File

@@ -33,8 +33,6 @@ from onyx.db.models import StarterMessage
from onyx.db.models import Tool
from onyx.db.models import User
from onyx.db.models import User__UserGroup
from onyx.db.models import UserFile
from onyx.db.models import UserFolder
from onyx.db.models import UserGroup
from onyx.db.notification import create_notification
from onyx.server.features.persona.models import PersonaSharedNotificationData
@@ -239,8 +237,6 @@ def create_update_persona(
llm_relevance_filter=create_persona_request.llm_relevance_filter,
llm_filter_extraction=create_persona_request.llm_filter_extraction,
is_default_persona=create_persona_request.is_default_persona,
user_file_ids=create_persona_request.user_file_ids,
user_folder_ids=create_persona_request.user_folder_ids,
)
versioned_make_persona_private = fetch_versioned_implementation(
@@ -335,8 +331,6 @@ def get_personas_for_user(
selectinload(Persona.groups),
selectinload(Persona.users),
selectinload(Persona.labels),
selectinload(Persona.user_files),
selectinload(Persona.user_folders),
)
results = db_session.execute(stmt).scalars().all()
@@ -431,8 +425,6 @@ def upsert_persona(
builtin_persona: bool = False,
is_default_persona: bool = False,
label_ids: list[int] | None = None,
user_file_ids: list[int] | None = None,
user_folder_ids: list[int] | None = None,
chunks_above: int = CONTEXT_CHUNKS_ABOVE,
chunks_below: int = CONTEXT_CHUNKS_BELOW,
) -> Persona:
@@ -458,7 +450,6 @@ def upsert_persona(
user=user,
get_editable=True,
)
# Fetch and attach tools by IDs
tools = None
if tool_ids is not None:
@@ -477,26 +468,6 @@ def upsert_persona(
if not document_sets and document_set_ids:
raise ValueError("document_sets not found")
# Fetch and attach user_files by IDs
user_files = None
if user_file_ids is not None:
user_files = (
db_session.query(UserFile).filter(UserFile.id.in_(user_file_ids)).all()
)
if not user_files and user_file_ids:
raise ValueError("user_files not found")
# Fetch and attach user_folders by IDs
user_folders = None
if user_folder_ids is not None:
user_folders = (
db_session.query(UserFolder)
.filter(UserFolder.id.in_(user_folder_ids))
.all()
)
if not user_folders and user_folder_ids:
raise ValueError("user_folders not found")
# Fetch and attach prompts by IDs
prompts = None
if prompt_ids is not None:
@@ -561,14 +532,6 @@ def upsert_persona(
if tools is not None:
existing_persona.tools = tools or []
if user_file_ids is not None:
existing_persona.user_files.clear()
existing_persona.user_files = user_files or []
if user_folder_ids is not None:
existing_persona.user_folders.clear()
existing_persona.user_folders = user_folders or []
# We should only update display priority if it is not already set
if existing_persona.display_priority is None:
existing_persona.display_priority = display_priority

View File

@@ -1,175 +0,0 @@
import datetime
from typing import List
from fastapi import UploadFile
from sqlalchemy import and_
from sqlalchemy.orm import Session
from onyx.connectors.file.connector import _read_files_and_metadata
from onyx.db.models import Persona
from onyx.db.models import Persona__UserFile
from onyx.db.models import User
from onyx.db.models import UserFile
from onyx.db.models import UserFolder
from onyx.file_processing.extract_file_text import read_text_file
from onyx.llm.factory import get_default_llms
from onyx.natural_language_processing.utils import get_tokenizer
from onyx.server.documents.connector import upload_files
USER_FILE_CONSTANT = "USER_FILE_CONNECTOR"
def create_user_files(
files: List[UploadFile],
folder_id: int | None,
user: User | None,
db_session: Session,
) -> list[UserFile]:
upload_response = upload_files(files, db_session)
user_files = []
context_files = _read_files_and_metadata(
file_name=str(upload_response.file_paths[0]), db_session=db_session
)
content, _ = read_text_file(next(context_files)[1])
llm, _ = get_default_llms()
llm_tokenizer = get_tokenizer(
model_name=llm.config.model_name,
provider_type=llm.config.model_provider,
)
token_count = len(llm_tokenizer.encode(content))
for file_path, file in zip(upload_response.file_paths, files):
new_file = UserFile(
user_id=user.id if user else None,
folder_id=folder_id,
file_id=file_path,
document_id="USER_FILE_CONNECTOR__" + file_path,
name=file.filename,
token_count=token_count,
)
db_session.add(new_file)
user_files.append(new_file)
db_session.commit()
return user_files
def get_user_files_from_folder(folder_id: int, db_session: Session) -> list[UserFile]:
return db_session.query(UserFile).filter(UserFile.folder_id == folder_id).all()
def share_file_with_assistant(
file_id: int, assistant_id: int, db_session: Session
) -> None:
file = db_session.query(UserFile).filter(UserFile.id == file_id).first()
assistant = db_session.query(Persona).filter(Persona.id == assistant_id).first()
if file and assistant:
file.assistants.append(assistant)
db_session.commit()
def unshare_file_with_assistant(
file_id: int, assistant_id: int, db_session: Session
) -> None:
db_session.query(Persona__UserFile).filter(
and_(
Persona__UserFile.user_file_id == file_id,
Persona__UserFile.persona_id == assistant_id,
)
).delete()
db_session.commit()
def share_folder_with_assistant(
folder_id: int, assistant_id: int, db_session: Session
) -> None:
folder = db_session.query(UserFolder).filter(UserFolder.id == folder_id).first()
assistant = db_session.query(Persona).filter(Persona.id == assistant_id).first()
if folder and assistant:
for file in folder.files:
share_file_with_assistant(file.id, assistant_id, db_session)
def unshare_folder_with_assistant(
folder_id: int, assistant_id: int, db_session: Session
) -> None:
folder = db_session.query(UserFolder).filter(UserFolder.id == folder_id).first()
if folder:
for file in folder.files:
unshare_file_with_assistant(file.id, assistant_id, db_session)
def fetch_user_files_for_documents(
document_ids: list[str],
db_session: Session,
) -> dict[str, None | int]:
# Query UserFile objects for the given document_ids
user_files = (
db_session.query(UserFile).filter(UserFile.document_id.in_(document_ids)).all()
)
# Create a dictionary mapping document_ids to UserFile objects
result = {doc_id: None for doc_id in document_ids}
for user_file in user_files:
result[user_file.document_id] = user_file.id
return result
def upsert_user_folder(
db_session: Session,
id: int | None = None,
user_id: int | None = None,
name: str | None = None,
description: str | None = None,
created_at: datetime.datetime | None = None,
user: User | None = None,
files: list[UserFile] | None = None,
assistants: list[Persona] | None = None,
) -> UserFolder:
if id is not None:
user_folder = db_session.query(UserFolder).filter_by(id=id).first()
else:
user_folder = (
db_session.query(UserFolder).filter_by(name=name, user_id=user_id).first()
)
if user_folder:
if user_id is not None:
user_folder.user_id = user_id
if name is not None:
user_folder.name = name
if description is not None:
user_folder.description = description
if created_at is not None:
user_folder.created_at = created_at
if user is not None:
user_folder.user = user
if files is not None:
user_folder.files = files
if assistants is not None:
user_folder.assistants = assistants
else:
user_folder = UserFolder(
id=id,
user_id=user_id,
name=name,
description=description,
created_at=created_at or datetime.datetime.utcnow(),
user=user,
files=files or [],
assistants=assistants or [],
)
db_session.add(user_folder)
db_session.flush()
return user_folder
def get_user_folder_by_name(db_session: Session, name: str) -> UserFolder | None:
return db_session.query(UserFolder).filter(UserFolder.name == name).first()

View File

@@ -81,7 +81,7 @@ def translate_boost_count_to_multiplier(boost: int) -> float:
# Vespa's Document API.
def get_document_chunk_ids(
enriched_document_info_list: list[EnrichedDocumentIndexingInfo],
tenant_id: str,
tenant_id: str | None,
large_chunks_enabled: bool,
) -> list[UUID]:
doc_chunk_ids = []
@@ -139,7 +139,7 @@ def get_uuid_from_chunk_info(
*,
document_id: str,
chunk_id: int,
tenant_id: str,
tenant_id: str | None,
large_chunk_id: int | None = None,
) -> UUID:
"""NOTE: be VERY carefuly about changing this function. If changed without a migration,
@@ -154,7 +154,7 @@ def get_uuid_from_chunk_info(
"large_" + str(large_chunk_id) if large_chunk_id is not None else str(chunk_id)
)
unique_identifier_string = "_".join([doc_str, chunk_index])
if MULTI_TENANT:
if tenant_id and MULTI_TENANT:
unique_identifier_string += "_" + tenant_id
uuid_value = uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)
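Reviewer note: a minimal sketch of the deterministic chunk-UUID scheme above, reusing its construction with illustrative values:

import uuid

doc_str = "document-123"
chunk_index = "4"  # or "large_0" for a large chunk
unique_identifier_string = "_".join([doc_str, chunk_index])
tenant_id = "tenant_abc"
if tenant_id:  # appended only when MULTI_TENANT is set in the real code
    unique_identifier_string += "_" + tenant_id
print(uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string))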

View File

@@ -43,7 +43,7 @@ class IndexBatchParams:
doc_id_to_previous_chunk_cnt: dict[str, int | None]
doc_id_to_new_chunk_cnt: dict[str, int]
tenant_id: str
tenant_id: str | None
large_chunks_enabled: bool
@@ -222,7 +222,7 @@ class Deletable(abc.ABC):
self,
doc_id: str,
*,
tenant_id: str,
tenant_id: str | None,
chunk_count: int | None,
) -> int:
"""
@@ -249,7 +249,7 @@ class Updatable(abc.ABC):
self,
doc_id: str,
*,
tenant_id: str,
tenant_id: str | None,
chunk_count: int | None,
fields: VespaDocumentFields,
) -> int:
@@ -270,7 +270,9 @@ class Updatable(abc.ABC):
raise NotImplementedError
@abc.abstractmethod
def update(self, update_requests: list[UpdateRequest], *, tenant_id: str) -> None:
def update(
self, update_requests: list[UpdateRequest], *, tenant_id: str | None
) -> None:
"""
Updates some set of chunks. The document and fields to update are specified in the update
requests. Each update request in the list applies its changes to a list of document ids.

View File

@@ -112,16 +112,6 @@ schema DANSWER_CHUNK_NAME {
rank: filter
attribute: fast-search
}
field user_file type int {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
field user_folders type weightedset<int> {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
}
# If using different tokenization settings, the fieldset has to be removed, and the field must

View File

@@ -468,7 +468,9 @@ class VespaIndex(DocumentIndex):
failure_msg = f"Failed to update document: {future_to_document_id[future]}"
raise requests.HTTPError(failure_msg) from e
def update(self, update_requests: list[UpdateRequest], *, tenant_id: str) -> None:
def update(
self, update_requests: list[UpdateRequest], *, tenant_id: str | None
) -> None:
logger.debug(f"Updating {len(update_requests)} documents in Vespa")
# Handle Vespa character limitations
@@ -616,7 +618,7 @@ class VespaIndex(DocumentIndex):
doc_id: str,
*,
chunk_count: int | None,
tenant_id: str,
tenant_id: str | None,
fields: VespaDocumentFields,
) -> int:
"""Note: if the document id does not exist, the update will be a no-op and the
@@ -645,8 +647,6 @@ class VespaIndex(DocumentIndex):
tenant_id=tenant_id,
large_chunks_enabled=large_chunks_enabled,
)
logger.error("CHECKing chunks")
logger.error(doc_chunk_ids)
doc_chunk_count += len(doc_chunk_ids)
@@ -661,7 +661,7 @@ class VespaIndex(DocumentIndex):
self,
doc_id: str,
*,
tenant_id: str,
tenant_id: str | None,
chunk_count: int | None,
) -> int:
total_chunks_deleted = 0
@@ -693,7 +693,6 @@ class VespaIndex(DocumentIndex):
tenant_id=tenant_id,
large_chunks_enabled=large_chunks_enabled,
)
for doc_chunk_ids_batch in batch_generator(
chunks_to_delete, BATCH_SIZE
):
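The deletion loop above leans on batch_generator, whose implementation is outside this diff; a helper with the observed behavior can be sketched as:

from collections.abc import Iterable, Iterator
from itertools import islice
from typing import TypeVar

T = TypeVar("T")

def batch_generator(items: Iterable[T], batch_size: int) -> Iterator[list[T]]:
    # Yield successive lists of at most batch_size elements.
    it = iter(items)
    while batch := list(islice(it, batch_size)):
        yield batch

# e.g. list(batch_generator(range(5), 2)) -> [[0, 1], [2, 3], [4]]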

View File

@@ -47,7 +47,6 @@ from onyx.document_index.vespa_constants import SOURCE_TYPE
from onyx.document_index.vespa_constants import TENANT_ID
from onyx.document_index.vespa_constants import TITLE
from onyx.document_index.vespa_constants import TITLE_EMBEDDING
from onyx.document_index.vespa_constants import USER_FILE
from onyx.indexing.models import DocMetadataAwareIndexChunk
from onyx.utils.logger import setup_logger
@@ -199,8 +198,6 @@ def _index_vespa_chunk(
# which only calls VespaIndex.update
ACCESS_CONTROL_LIST: {acl_entry: 1 for acl_entry in chunk.access.to_acl()},
DOCUMENT_SETS: {document_set: 1 for document_set in chunk.document_sets},
USER_FILE: chunk.user_file if chunk.user_file is not None else None,
# USER_FOLDERS: {user_folder: 1 for user_folder in chunk.user_folders},
BOOST: chunk.boost,
}

View File

@@ -5,6 +5,7 @@ from datetime import timezone
from onyx.configs.constants import INDEX_SEPARATOR
from onyx.context.search.models import IndexFilters
from onyx.document_index.interfaces import VespaChunkRequest
from onyx.document_index.vespa_constants import ACCESS_CONTROL_LIST
from onyx.document_index.vespa_constants import CHUNK_ID
from onyx.document_index.vespa_constants import DOC_UPDATED_AT
from onyx.document_index.vespa_constants import DOCUMENT_ID
@@ -13,7 +14,6 @@ from onyx.document_index.vespa_constants import HIDDEN
from onyx.document_index.vespa_constants import METADATA_LIST
from onyx.document_index.vespa_constants import SOURCE_TYPE
from onyx.document_index.vespa_constants import TENANT_ID
from onyx.document_index.vespa_constants import USER_FILE
from onyx.utils.logger import setup_logger
from shared_configs.configs import MULTI_TENANT
@@ -27,26 +27,14 @@ def build_vespa_filters(
remove_trailing_and: bool = False, # Set to True when using as a complete Vespa query
) -> str:
def _build_or_filters(key: str, vals: list[str] | None) -> str:
"""For string-based 'contains' filters, e.g. WSET fields or array<string> fields."""
if not key or not vals:
return ""
eq_elems = [f'{key} contains "{val}"' for val in vals if val]
if not eq_elems:
return ""
or_clause = " or ".join(eq_elems)
return f"({or_clause}) and "
def _build_int_or_filters(key: str, vals: list[int] | None) -> str:
"""
For an integer field filter.
If vals is not None, we want *only* docs whose key matches one of vals.
"""
# If `vals` is None => skip the filter entirely
if vals is None or not vals:
if vals is None:
return ""
# Otherwise build the OR filter
eq_elems = [f"{key} = {val}" for val in vals]
valid_vals = [val for val in vals if val]
if not key or not valid_vals:
return ""
eq_elems = [f'{key} contains "{elem}"' for elem in valid_vals]
or_clause = " or ".join(eq_elems)
result = f"({or_clause}) and "
@@ -54,55 +42,53 @@ def build_vespa_filters(
def _build_time_filter(
cutoff: datetime | None,
# Slightly over 3 Months, approximately 1 fiscal quarter
untimed_doc_cutoff: timedelta = timedelta(days=92),
) -> str:
if not cutoff:
return ""
# For documents that don't have an updated_at, filter them out for queries asking for
# very recent documents (3 months by default). Documents that don't have an updated_at
# are assigned an age of 3 months for the time decay value.
include_untimed = datetime.now(timezone.utc) - untimed_doc_cutoff > cutoff
cutoff_secs = int(cutoff.timestamp())
if include_untimed:
# Documents without updated_at are assigned -1 as their date
return f"!({DOC_UPDATED_AT} < {cutoff_secs}) and "
return f"({DOC_UPDATED_AT} >= {cutoff_secs}) and "
# Start building the filter string
filter_str = f"!({HIDDEN}=true) and " if not include_hidden else ""
# If running in multi-tenant mode
# If running in multi-tenant mode, we may want to filter by tenant_id
if filters.tenant_id and MULTI_TENANT:
filter_str += f'({TENANT_ID} contains "{filters.tenant_id}") and '
# ACL filters
# if filters.access_control_list is not None:
# filter_str += _build_or_filters(ACCESS_CONTROL_LIST, filters.access_control_list)
# CAREFUL touching this one, currently there is no second ACL double-check post retrieval
if filters.access_control_list is not None:
filter_str += _build_or_filters(
ACCESS_CONTROL_LIST, filters.access_control_list
)
# Source type filters
source_strs = (
[s.value for s in filters.source_type] if filters.source_type else None
)
filter_str += _build_or_filters(SOURCE_TYPE, source_strs)
# Tag filters
tag_attributes = None
if filters.tags:
# build e.g. "tag_key|tag_value"
tag_attributes = [
f"{tag.tag_key}{INDEX_SEPARATOR}{tag.tag_value}" for tag in filters.tags
]
tags = filters.tags
if tags:
tag_attributes = [tag.tag_key + INDEX_SEPARATOR + tag.tag_value for tag in tags]
filter_str += _build_or_filters(METADATA_LIST, tag_attributes)
# Document sets
filter_str += _build_or_filters(DOCUMENT_SETS, filters.document_set)
# New: user_file_ids as integer filters
filter_str += _build_int_or_filters(USER_FILE, filters.user_file_ids)
# Time filter
filter_str += _build_time_filter(filters.time_cutoff)
# Trim trailing " and "
if remove_trailing_and and filter_str.endswith(" and "):
filter_str = filter_str[:-5]
filter_str = filter_str[:-5] # We remove the trailing " and "
return filter_str
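To make the assembled filter concrete, here is a standalone copy of the string "contains" helper above together with the kind of fragment it emits (the field name and values are illustrative):

def build_or_filter(key: str, vals: list[str] | None) -> str:
    # Standalone copy of _build_or_filters' logic, for illustration only.
    if not key or not vals:
        return ""
    eq_elems = [f'{key} contains "{val}"' for val in vals if val]
    if not eq_elems:
        return ""
    return "(" + " or ".join(eq_elems) + ") and "

print(build_or_filter("source_type", ["web", "slack"]))
# prints: (source_type contains "web" or source_type contains "slack") and
# (the trailing " and " is trimmed later when remove_trailing_and is set)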

View File

@@ -66,8 +66,6 @@ EMBEDDINGS = "embeddings"
TITLE_EMBEDDING = "title_embedding"
ACCESS_CONTROL_LIST = "access_control_list"
DOCUMENT_SETS = "document_sets"
USER_FILE = "user_file"
USER_FOLDERS = "user_folders"
LARGE_CHUNK_REFERENCE_IDS = "large_chunk_reference_ids"
METADATA = "metadata"
METADATA_LIST = "metadata_list"

View File

@@ -37,7 +37,6 @@ def delete_unstructured_api_key() -> None:
def _sdk_partition_request(
file: IO[Any], file_name: str, **kwargs: Any
) -> operations.PartitionRequest:
file.seek(0, 0)
try:
request = operations.PartitionRequest(
partition_parameters=shared.PartitionParameters(

View File

@@ -10,10 +10,7 @@ from sqlalchemy.orm import Session
from onyx.configs.constants import FileOrigin
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.models import ChatMessage
from onyx.db.models import UserFile
from onyx.db.models import UserFolder
from onyx.file_store.file_store import get_default_file_store
from onyx.file_store.models import ChatFileType
from onyx.file_store.models import FileDescriptor
from onyx.file_store.models import InMemoryChatFile
from onyx.utils.b64 import get_image_type
@@ -56,53 +53,6 @@ def load_all_chat_files(
return files
def load_user_folder(folder_id: int, db_session: Session) -> list[InMemoryChatFile]:
user_files = (
db_session.query(UserFile).filter(UserFile.folder_id == folder_id).all()
)
return [load_user_file(file.id, db_session) for file in user_files]
def load_user_file(file_id: int, db_session: Session) -> InMemoryChatFile:
user_file = db_session.query(UserFile).filter(UserFile.id == file_id).first()
if not user_file:
raise ValueError(f"User file with id {file_id} not found")
file_io = get_default_file_store(db_session).read_file(
user_file.document_id, mode="b"
)
return InMemoryChatFile(
file_id=str(user_file.id),
content=file_io.read(),
file_type=ChatFileType.PLAIN_TEXT,
filename=user_file.name,
)
def load_all_user_files(
user_file_ids: list[int],
user_folder_ids: list[int],
db_session: Session,
) -> list[InMemoryChatFile]:
return cast(
list[InMemoryChatFile],
run_functions_tuples_in_parallel(
[(load_user_file, (file_id, db_session)) for file_id in user_file_ids]
)
+ [
file
for folder_id in user_folder_ids
for file in load_user_folder(folder_id, db_session)
],
)
def save_file_from_url(url: str) -> str:
"""NOTE: using multiple sessions here, since this is often called
using multithreading. In practice, sharing a session has resulted in
@@ -178,39 +128,3 @@ def save_files(urls: list[str], base64_files: list[str]) -> list[str]:
]
return run_functions_tuples_in_parallel(funcs)
def load_all_persona_files_for_chat(
persona_id: int, db_session: Session
) -> tuple[list[InMemoryChatFile], list[int]]:
from onyx.db.models import Persona
from sqlalchemy.orm import joinedload
persona = (
db_session.query(Persona)
.filter(Persona.id == persona_id)
.options(
joinedload(Persona.user_files),
joinedload(Persona.user_folders).joinedload(UserFolder.files),
)
.one()
)
persona_file_calls = [
(load_user_file, (user_file.id, db_session)) for user_file in persona.user_files
]
persona_loaded_files = run_functions_tuples_in_parallel(persona_file_calls)
persona_folder_files = []
persona_folder_file_ids = []
for user_folder in persona.user_folders:
folder_files = load_user_folder(user_folder.id, db_session)
persona_folder_files.extend(folder_files)
persona_folder_file_ids.extend([file.id for file in user_folder.files])
persona_files = list(persona_loaded_files) + persona_folder_files
persona_file_ids = [
file.id for file in persona.user_files
] + persona_folder_file_ids
return persona_files, persona_file_ids
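The loaders above fan out through run_functions_tuples_in_parallel. Its definition is outside this diff, but the calling convention visible here, a list of (function, args-tuple) pairs with results returned in order, can be sketched as:

from concurrent.futures import ThreadPoolExecutor
from typing import Any, Callable

def run_functions_tuples_in_parallel(
    function_calls: list[tuple[Callable[..., Any], tuple[Any, ...]]],
) -> list[Any]:
    # Sketch only: submit every (fn, args) pair to a thread pool and
    # collect the results in submission order.
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(fn, *args) for fn, args in function_calls]
        return [future.result() for future in futures]

# e.g. run_functions_tuples_in_parallel([(pow, (2, 8)), (len, ("abc",))]) -> [256, 3]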

View File

@@ -31,7 +31,6 @@ from onyx.db.models import Document as DBDocument
from onyx.db.search_settings import get_current_search_settings
from onyx.db.tag import create_or_add_document_tag
from onyx.db.tag import create_or_add_document_tag_list
from onyx.db.user_documents import fetch_user_files_for_documents
from onyx.document_index.document_index_utils import (
get_multipass_config,
)
@@ -159,8 +158,8 @@ def index_doc_batch_with_handler(
document_batch: list[Document],
index_attempt_metadata: IndexAttemptMetadata,
db_session: Session,
tenant_id: str,
ignore_time_skip: bool = False,
tenant_id: str | None = None,
) -> IndexingPipelineResult:
try:
index_pipeline_result = index_doc_batch(
@@ -318,8 +317,8 @@ def index_doc_batch(
document_index: DocumentIndex,
index_attempt_metadata: IndexAttemptMetadata,
db_session: Session,
tenant_id: str,
ignore_time_skip: bool = False,
tenant_id: str | None = None,
filter_fnc: Callable[[list[Document]], list[Document]] = filter_documents,
) -> IndexingPipelineResult:
"""Takes different pieces of the indexing pipeline and applies it to a batch of documents
@@ -403,10 +402,6 @@ def index_doc_batch(
)
}
doc_id_to_user_file_id: dict[str, int | None] = fetch_user_files_for_documents(
document_ids=updatable_ids, db_session=db_session
)
doc_id_to_previous_chunk_cnt: dict[str, int | None] = {
document_id: chunk_count
for document_id, chunk_count in fetch_chunk_counts_for_documents(
@@ -438,7 +433,6 @@ def index_doc_batch(
document_sets=set(
doc_id_to_document_set.get(chunk.source_document.id, [])
),
user_file=doc_id_to_user_file_id.get(chunk.source_document.id, None),
boost=(
ctx.id_to_db_doc_map[chunk.source_document.id].boost
if chunk.source_document.id in ctx.id_to_db_doc_map
@@ -531,9 +525,9 @@ def build_indexing_pipeline(
embedder: IndexingEmbedder,
document_index: DocumentIndex,
db_session: Session,
tenant_id: str,
chunker: Chunker | None = None,
ignore_time_skip: bool = False,
tenant_id: str | None = None,
callback: IndexingHeartbeatInterface | None = None,
) -> IndexingPipelineProtocol:
"""Builds a pipeline which takes in a list (batch) of docs and indexes them."""

View File

@@ -84,11 +84,9 @@ class DocMetadataAwareIndexChunk(IndexChunk):
negative -> ranked lower.
"""
tenant_id: str
tenant_id: str | None = None
access: "DocumentAccess"
document_sets: set[str]
user_file: int | None
# user_folders: list[int]
boost: int
@classmethod
@@ -97,18 +95,14 @@ class DocMetadataAwareIndexChunk(IndexChunk):
index_chunk: IndexChunk,
access: "DocumentAccess",
document_sets: set[str],
user_file: int | None,
# user_folder: list[int],
boost: int,
tenant_id: str,
tenant_id: str | None,
) -> "DocMetadataAwareIndexChunk":
index_chunk_data = index_chunk.model_dump()
return cls(
**index_chunk_data,
access=access,
document_sets=document_sets,
user_file=user_file,
# user_folders=user_folders,
boost=boost,
tenant_id=tenant_id,
)
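A hedged usage sketch of the classmethod above; raw_chunk and document_access stand in for a real IndexChunk and DocumentAccess, and the document-set name is illustrative:

# Hypothetical call site inside the indexing pipeline:
chunk = DocMetadataAwareIndexChunk.from_index_chunk(
    index_chunk=raw_chunk,            # an IndexChunk produced by the embedder
    access=document_access,           # the resolved DocumentAccess
    document_sets={"engineering"},    # illustrative document-set name
    boost=1,
    tenant_id=None,                   # single-tenant deployments now pass None
)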

View File

@@ -97,7 +97,6 @@ from onyx.server.settings.api import basic_router as settings_router
from onyx.server.token_rate_limits.api import (
router as token_rate_limit_settings_router,
)
from onyx.server.user_documents.api import router as user_documents_router
from onyx.server.utils import BasicAuthenticationError
from onyx.setup import setup_multitenant_onyx
from onyx.setup import setup_onyx
@@ -220,7 +219,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
# If we are multi-tenant, we need to only set up initial public tables
with Session(engine) as db_session:
setup_onyx(db_session, POSTGRES_DEFAULT_SCHEMA)
setup_onyx(db_session, None)
else:
setup_multitenant_onyx()
@@ -296,7 +295,6 @@ def get_application() -> FastAPI:
include_router_with_global_prefix_prepended(application, input_prompt_router)
include_router_with_global_prefix_prepended(application, admin_input_prompt_router)
include_router_with_global_prefix_prepended(application, cc_pair_router)
include_router_with_global_prefix_prepended(application, user_documents_router)
include_router_with_global_prefix_prepended(application, folder_router)
include_router_with_global_prefix_prepended(application, document_set_router)
include_router_with_global_prefix_prepended(application, search_settings_router)

View File

@@ -410,7 +410,7 @@ def _build_qa_response_blocks(
def _build_continue_in_web_ui_block(
tenant_id: str,
tenant_id: str | None,
message_id: int | None,
) -> Block:
if message_id is None:
@@ -482,7 +482,7 @@ def build_follow_up_resolved_blocks(
def build_slack_response_blocks(
answer: ChatOnyxBotResponse,
tenant_id: str,
tenant_id: str | None,
message_info: SlackMessageInfo,
channel_conf: ChannelConfig | None,
use_citations: bool,

View File

@@ -151,7 +151,7 @@ def handle_slack_feedback(
user_id_to_post_confirmation: str,
channel_id_to_post_confirmation: str,
thread_ts_to_post_confirmation: str,
tenant_id: str,
tenant_id: str | None,
) -> None:
message_id, doc_id, doc_rank = decompose_action_id(feedback_id)

View File

@@ -109,7 +109,7 @@ def handle_message(
slack_channel_config: SlackChannelConfig,
client: WebClient,
feedback_reminder_id: str | None,
tenant_id: str,
tenant_id: str | None,
) -> bool:
"""Potentially respond to the user message depending on filters and if an answer was generated

View File

@@ -72,7 +72,7 @@ def handle_regular_answer(
channel: str,
logger: OnyxLoggingAdapter,
feedback_reminder_id: str | None,
tenant_id: str,
tenant_id: str | None,
num_retries: int = DANSWER_BOT_NUM_RETRIES,
thread_context_percent: float = MAX_THREAD_CONTEXT_PERCENTAGE,
should_respond_with_error_msgs: bool = DANSWER_BOT_DISPLAY_ERROR_MSGS,

View File

@@ -123,13 +123,13 @@ _OFFICIAL_SLACKBOT_USER_ID = "USLACKBOT"
class SlackbotHandler:
def __init__(self) -> None:
logger.info("Initializing SlackbotHandler")
self.tenant_ids: Set[str] = set()
self.tenant_ids: Set[str | None] = set()
# The keys for these dictionaries are tuples of (tenant_id, slack_bot_id)
self.socket_clients: Dict[tuple[str, int], TenantSocketModeClient] = {}
self.slack_bot_tokens: Dict[tuple[str, int], SlackBotTokens] = {}
self.socket_clients: Dict[tuple[str | None, int], TenantSocketModeClient] = {}
self.slack_bot_tokens: Dict[tuple[str | None, int], SlackBotTokens] = {}
# Store Redis lock objects here so we can release them properly
self.redis_locks: Dict[str, Lock] = {}
self.redis_locks: Dict[str | None, Lock] = {}
self.running = True
self.pod_id = self.get_pod_id()
@@ -193,7 +193,7 @@ class SlackbotHandler:
self._shutdown_event.wait(timeout=TENANT_HEARTBEAT_INTERVAL)
def _manage_clients_per_tenant(
self, db_session: Session, tenant_id: str, bot: SlackBot
self, db_session: Session, tenant_id: str | None, bot: SlackBot
) -> None:
"""
- If the tokens are missing or empty, close the socket client and remove them.
@@ -385,7 +385,7 @@ class SlackbotHandler:
finally:
CURRENT_TENANT_ID_CONTEXTVAR.reset(token)
def _remove_tenant(self, tenant_id: str) -> None:
def _remove_tenant(self, tenant_id: str | None) -> None:
"""
Helper to remove a tenant from `self.tenant_ids` and close any socket clients.
(Lock release now happens in `acquire_tenants()`, not here.)
@@ -415,7 +415,7 @@ class SlackbotHandler:
)
def start_socket_client(
self, slack_bot_id: int, tenant_id: str, slack_bot_tokens: SlackBotTokens
self, slack_bot_id: int, tenant_id: str | None, slack_bot_tokens: SlackBotTokens
) -> None:
socket_client: TenantSocketModeClient = _get_socket_client(
slack_bot_tokens, tenant_id, slack_bot_id
@@ -912,7 +912,7 @@ def create_process_slack_event() -> (
def _get_socket_client(
slack_bot_tokens: SlackBotTokens, tenant_id: str, slack_bot_id: int
slack_bot_tokens: SlackBotTokens, tenant_id: str | None, slack_bot_id: int
) -> TenantSocketModeClient:
# For more info on how to set this up, check out the docs:
# https://docs.onyx.app/slack_bot_setup

View File

@@ -570,7 +570,7 @@ def read_slack_thread(
def slack_usage_report(
action: str, sender_id: str | None, client: WebClient, tenant_id: str
action: str, sender_id: str | None, client: WebClient, tenant_id: str | None
) -> None:
if DISABLE_TELEMETRY:
return
@@ -663,7 +663,9 @@ def get_feedback_visibility() -> FeedbackVisibility:
class TenantSocketModeClient(SocketModeClient):
def __init__(self, tenant_id: str, slack_bot_id: int, *args: Any, **kwargs: Any):
def __init__(
self, tenant_id: str | None, slack_bot_id: int, *args: Any, **kwargs: Any
):
super().__init__(*args, **kwargs)
self.tenant_id = tenant_id
self.slack_bot_id = slack_bot_id

View File

@@ -16,10 +16,10 @@ class RedisConnector:
"""Composes several classes to simplify interacting with a connector and its
associated background tasks / associated redis interactions."""
def __init__(self, tenant_id: str, id: int) -> None:
def __init__(self, tenant_id: str | None, id: int) -> None:
"""id: a connector credential pair id"""
self.tenant_id: str = tenant_id
self.tenant_id: str | None = tenant_id
self.id: int = id
self.redis: redis.Redis = get_redis_client(tenant_id=tenant_id)

View File

@@ -31,7 +31,7 @@ class RedisConnectorCredentialPair(RedisObjectHelper):
PREFIX = "connectorsync"
TASKSET_PREFIX = PREFIX + "_taskset"
def __init__(self, tenant_id: str, id: int) -> None:
def __init__(self, tenant_id: str | None, id: int) -> None:
super().__init__(tenant_id, str(id))
# documents that should be skipped
@@ -60,7 +60,7 @@ class RedisConnectorCredentialPair(RedisObjectHelper):
db_session: Session,
redis_client: Redis,
lock: RedisLock,
tenant_id: str,
tenant_id: str | None,
) -> tuple[int, int] | None:
"""We can limit the number of tasks generated here, which is useful to prevent
one tenant from overwhelming the sync queue.

View File

@@ -39,8 +39,8 @@ class RedisConnectorDelete:
ACTIVE_PREFIX = PREFIX + "_active"
ACTIVE_TTL = 3600
def __init__(self, tenant_id: str, id: int, redis: redis.Redis) -> None:
self.tenant_id: str = tenant_id
def __init__(self, tenant_id: str | None, id: int, redis: redis.Redis) -> None:
self.tenant_id: str | None = tenant_id
self.id = id
self.redis = redis

View File

@@ -52,8 +52,8 @@ class RedisConnectorPermissionSync:
ACTIVE_PREFIX = PREFIX + "_active"
ACTIVE_TTL = CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT * 2
def __init__(self, tenant_id: str, id: int, redis: redis.Redis) -> None:
self.tenant_id: str = tenant_id
def __init__(self, tenant_id: str | None, id: int, redis: redis.Redis) -> None:
self.tenant_id: str | None = tenant_id
self.id = id
self.redis = redis

View File

@@ -44,8 +44,8 @@ class RedisConnectorExternalGroupSync:
ACTIVE_PREFIX = PREFIX + "_active"
ACTIVE_TTL = 3600
def __init__(self, tenant_id: str, id: int, redis: redis.Redis) -> None:
self.tenant_id: str = tenant_id
def __init__(self, tenant_id: str | None, id: int, redis: redis.Redis) -> None:
self.tenant_id: str | None = tenant_id
self.id = id
self.redis = redis

View File

@@ -52,12 +52,12 @@ class RedisConnectorIndex:
def __init__(
self,
tenant_id: str,
tenant_id: str | None,
id: int,
search_settings_id: int,
redis: redis.Redis,
) -> None:
self.tenant_id: str = tenant_id
self.tenant_id: str | None = tenant_id
self.id = id
self.search_settings_id = search_settings_id
self.redis = redis

View File

@@ -52,8 +52,8 @@ class RedisConnectorPrune:
ACTIVE_PREFIX = PREFIX + "_active"
ACTIVE_TTL = CELERY_PRUNING_LOCK_TIMEOUT * 2
def __init__(self, tenant_id: str, id: int, redis: redis.Redis) -> None:
self.tenant_id: str = tenant_id
def __init__(self, tenant_id: str | None, id: int, redis: redis.Redis) -> None:
self.tenant_id: str | None = tenant_id
self.id = id
self.redis = redis

View File

@@ -13,8 +13,8 @@ class RedisConnectorStop:
TIMEOUT_PREFIX = f"{PREFIX}_timeout"
TIMEOUT_TTL = 300
def __init__(self, tenant_id: str, id: int, redis: redis.Redis) -> None:
self.tenant_id: str = tenant_id
def __init__(self, tenant_id: str | None, id: int, redis: redis.Redis) -> None:
self.tenant_id: str | None = tenant_id
self.id: int = id
self.redis = redis

View File

@@ -23,7 +23,7 @@ class RedisDocumentSet(RedisObjectHelper):
FENCE_PREFIX = PREFIX + "_fence"
TASKSET_PREFIX = PREFIX + "_taskset"
def __init__(self, tenant_id: str, id: int) -> None:
def __init__(self, tenant_id: str | None, id: int) -> None:
super().__init__(tenant_id, str(id))
@property
@@ -58,7 +58,7 @@ class RedisDocumentSet(RedisObjectHelper):
db_session: Session,
redis_client: Redis,
lock: RedisLock,
tenant_id: str,
tenant_id: str | None,
) -> tuple[int, int] | None:
"""Max tasks is ignored for now until we can build the logic to mark the
document set up to date over multiple batches.

View File

@@ -14,8 +14,8 @@ class RedisObjectHelper(ABC):
FENCE_PREFIX = PREFIX + "_fence"
TASKSET_PREFIX = PREFIX + "_taskset"
def __init__(self, tenant_id: str, id: str):
self._tenant_id: str = tenant_id
def __init__(self, tenant_id: str | None, id: str):
self._tenant_id: str | None = tenant_id
self._id: str = id
self.redis = get_redis_client(tenant_id=tenant_id)
@@ -87,7 +87,7 @@ class RedisObjectHelper(ABC):
db_session: Session,
redis_client: Redis,
lock: RedisLock,
tenant_id: str,
tenant_id: str | None,
) -> tuple[int, int] | None:
"""First element should be the number of actual tasks generated, second should
be the number of docs that were candidates to be synced for the cc pair.
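The earlier parameters of this signature are cut off by the hunk, so the shape below is an assumption, but an implementation honoring the documented return contract would look roughly like:

from typing import Any

# Illustrative only: report (tasks_generated, candidate_docs) per the docstring.
def generate_tasks(self, *args: Any, **kwargs: Any) -> tuple[int, int] | None:
    candidate_doc_ids: list[str] = []  # docs that were candidates to be synced
    tasks_generated = 0
    for _doc_id in candidate_doc_ids:
        # enqueue one celery sync task here, then count it
        tasks_generated += 1
    return tasks_generated, len(candidate_doc_ids)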

View File

@@ -24,7 +24,7 @@ class RedisUserGroup(RedisObjectHelper):
FENCE_PREFIX = PREFIX + "_fence"
TASKSET_PREFIX = PREFIX + "_taskset"
def __init__(self, tenant_id: str, id: int) -> None:
def __init__(self, tenant_id: str | None, id: int) -> None:
super().__init__(tenant_id, str(id))
@property
@@ -59,7 +59,7 @@ class RedisUserGroup(RedisObjectHelper):
db_session: Session,
redis_client: Redis,
lock: RedisLock,
tenant_id: str,
tenant_id: str | None,
) -> tuple[int, int] | None:
"""Max tasks is ignored for now until we can build the logic to mark the
user group up to date over multiple batches.

View File

@@ -37,15 +37,13 @@ from onyx.key_value_store.interface import KvKeyNotFoundError
from onyx.server.documents.models import ConnectorBase
from onyx.utils.logger import setup_logger
from onyx.utils.variable_functionality import fetch_versioned_implementation
from shared_configs.configs import MULTI_TENANT
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
logger = setup_logger()
def _create_indexable_chunks(
preprocessed_docs: list[dict],
tenant_id: str,
tenant_id: str | None,
) -> tuple[list[Document], list[DocMetadataAwareIndexChunk]]:
ids_to_documents = {}
chunks = []
@@ -88,10 +86,9 @@ def _create_indexable_chunks(
mini_chunk_embeddings=[],
),
title_embedding=preprocessed_doc["title_embedding"],
tenant_id=tenant_id if MULTI_TENANT else POSTGRES_DEFAULT_SCHEMA,
tenant_id=tenant_id,
access=default_public_access,
document_sets=set(),
user_file=None,
boost=DEFAULT_BOOST,
large_chunk_id=None,
)
@@ -114,7 +111,7 @@ def load_processed_docs(cohere_enabled: bool) -> list[dict]:
def seed_initial_documents(
db_session: Session, tenant_id: str, cohere_enabled: bool = False
db_session: Session, tenant_id: str | None, cohere_enabled: bool = False
) -> None:
"""
Seed initial documents so users don't have an empty index to start

View File

@@ -5,7 +5,6 @@ from onyx.configs.chat_configs import INPUT_PROMPT_YAML
from onyx.configs.chat_configs import MAX_CHUNKS_FED_TO_CHAT
from onyx.configs.chat_configs import PERSONAS_YAML
from onyx.configs.chat_configs import PROMPTS_YAML
from onyx.configs.chat_configs import USER_FOLDERS_YAML
from onyx.context.search.enums import RecencyBiasSetting
from onyx.db.document_set import get_or_create_document_set_by_name
from onyx.db.input_prompt import insert_input_prompt_if_not_exists
@@ -16,30 +15,6 @@ from onyx.db.models import Tool as ToolDBModel
from onyx.db.persona import upsert_persona
from onyx.db.prompts import get_prompt_by_name
from onyx.db.prompts import upsert_prompt
from onyx.db.user_documents import upsert_user_folder
def load_user_folders_from_yaml(
db_session: Session,
user_folders_yaml: str = USER_FOLDERS_YAML,
) -> None:
with open(user_folders_yaml, "r") as file:
data = yaml.safe_load(file)
all_user_folders = data.get("user_folders", [])
for user_folder in all_user_folders:
upsert_user_folder(
db_session=db_session,
id=user_folder.get("id"),
user_id=user_folder.get("user_id"),
name=user_folder.get("name"),
description=user_folder.get("description"),
created_at=user_folder.get("created_at"),
user=user_folder.get("user"),
files=user_folder.get("files"),
assistants=user_folder.get("assistants"),
)
db_session.flush()
def load_prompts_from_yaml(
@@ -204,4 +179,3 @@ def load_chat_yamls(
load_prompts_from_yaml(db_session, prompt_yaml)
load_personas_from_yaml(db_session, personas_yaml)
load_input_prompts_from_yaml(db_session, input_prompts_yaml)
load_user_folders_from_yaml(db_session)

View File

@@ -1,6 +0,0 @@
user_folders:
- id: -1
name: "Recent Documents"
description: "Documents uploaded by the user"
files: []
assistants: []

View File

@@ -620,7 +620,7 @@ def associate_credential_to_connector(
)
try:
validate_ccpair_for_user(connector_id, credential_id, db_session)
validate_ccpair_for_user(connector_id, credential_id, db_session, tenant_id)
response = add_credential_to_connector(
db_session=db_session,

View File

@@ -389,7 +389,12 @@ def check_drive_tokens(
return AuthStatus(authenticated=True)
def upload_files(files: list[UploadFile], db_session: Session) -> FileUploadResponse:
@router.post("/admin/connector/file/upload")
def upload_files(
files: list[UploadFile],
_: User = Depends(current_curator_or_admin_user),
db_session: Session = Depends(get_session),
) -> FileUploadResponse:
for file in files:
if not file.filename:
raise HTTPException(status_code=400, detail="File name cannot be empty")
@@ -450,15 +455,6 @@ def upload_files(files: list[UploadFile], db_session: Session) -> FileUploadResp
return FileUploadResponse(file_paths=deduped_file_paths)
@router.post("/admin/connector/file/upload")
def upload_files_api(
files: list[UploadFile],
_: User = Depends(current_curator_or_admin_user),
db_session: Session = Depends(get_session),
) -> FileUploadResponse:
return upload_files(files, db_session)
@router.get("/admin/connector")
def get_connectors_by_credential(
_: User = Depends(current_curator_or_admin_user),
@@ -906,6 +902,7 @@ def create_connector_with_mock_credential(
connector_id=connector_id,
credential_id=credential_id,
db_session=db_session,
tenant_id=tenant_id,
)
response = add_credential_to_connector(
db_session=db_session,
@@ -1046,16 +1043,55 @@ def connector_run_once(
status_code=400,
detail="Connector has no valid credentials, cannot create index attempts.",
)
try:
num_triggers = trigger_indexing_for_cc_pair(
credential_ids,
connector_id,
run_info.from_beginning,
tenant_id,
db_session,
# Prevents index attempts for cc pairs that already have an index attempt currently running
skipped_credentials = [
credential_id
for credential_id in credential_ids
if get_index_attempts_for_cc_pair(
cc_pair_identifier=ConnectorCredentialPairIdentifier(
connector_id=run_info.connector_id,
credential_id=credential_id,
),
only_current=True,
db_session=db_session,
disinclude_finished=True,
)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
]
connector_credential_pairs = [
get_connector_credential_pair(
db_session=db_session,
connector_id=connector_id,
credential_id=credential_id,
)
for credential_id in credential_ids
if credential_id not in skipped_credentials
]
num_triggers = 0
for cc_pair in connector_credential_pairs:
if cc_pair is not None:
indexing_mode = IndexingMode.UPDATE
if run_info.from_beginning:
indexing_mode = IndexingMode.REINDEX
mark_ccpair_with_indexing_trigger(cc_pair.id, indexing_mode, db_session)
num_triggers += 1
logger.info(
f"connector_run_once - marking cc_pair with indexing trigger: "
f"connector={run_info.connector_id} "
f"cc_pair={cc_pair.id} "
f"indexing_trigger={indexing_mode}"
)
# run the beat task to pick up the triggers immediately
primary_app.send_task(
OnyxCeleryTask.CHECK_FOR_INDEXING,
priority=OnyxCeleryPriority.HIGH,
kwargs={"tenant_id": tenant_id},
)
logger.info("connector_run_once - running check_for_indexing")
@@ -1229,82 +1265,3 @@ def get_basic_connector_indexing_status(
for cc_pair in cc_pairs
if cc_pair.connector.source != DocumentSource.INGESTION_API
]
def trigger_indexing_for_cc_pair(
specified_credential_ids: list[int],
connector_id: int,
from_beginning: bool,
tenant_id: str,
db_session: Session,
) -> int:
try:
possible_credential_ids = get_connector_credential_ids(connector_id, db_session)
except ValueError as e:
raise ValueError(f"Connector by id {connector_id} does not exist: {str(e)}")
if not specified_credential_ids:
credential_ids = possible_credential_ids
else:
if set(specified_credential_ids).issubset(set(possible_credential_ids)):
credential_ids = specified_credential_ids
else:
raise ValueError(
"Not all specified credentials are associated with connector"
)
if not credential_ids:
raise ValueError(
"Connector has no valid credentials, cannot create index attempts."
)
# Prevents index attempts for cc pairs that already have an index attempt currently running
skipped_credentials = [
credential_id
for credential_id in credential_ids
if get_index_attempts_for_cc_pair(
cc_pair_identifier=ConnectorCredentialPairIdentifier(
connector_id=connector_id,
credential_id=credential_id,
),
only_current=True,
db_session=db_session,
disinclude_finished=True,
)
]
connector_credential_pairs = [
get_connector_credential_pair(
db_session=db_session,
connector_id=connector_id,
credential_id=credential_id,
)
for credential_id in credential_ids
if credential_id not in skipped_credentials
]
num_triggers = 0
for cc_pair in connector_credential_pairs:
if cc_pair is not None:
indexing_mode = IndexingMode.UPDATE
if from_beginning:
indexing_mode = IndexingMode.REINDEX
mark_ccpair_with_indexing_trigger(cc_pair.id, indexing_mode, db_session)
num_triggers += 1
logger.info(
f"connector_run_once - marking cc_pair with indexing trigger: "
f"connector={connector_id} "
f"cc_pair={cc_pair.id} "
f"indexing_trigger={indexing_mode}"
)
# run the beat task to pick up the triggers immediately
primary_app.send_task(
OnyxCeleryTask.CHECK_FOR_INDEXING,
priority=OnyxCeleryPriority.HIGH,
kwargs={"tenant_id": tenant_id},
)
return num_triggers

View File

@@ -18,6 +18,7 @@ from onyx.db.credentials import fetch_credentials_by_source_for_user
from onyx.db.credentials import fetch_credentials_for_user
from onyx.db.credentials import swap_credentials_connector
from onyx.db.credentials import update_credential
from onyx.db.engine import get_current_tenant_id
from onyx.db.engine import get_session
from onyx.db.models import DocumentSource
from onyx.db.models import User
@@ -99,11 +100,13 @@ def swap_credentials_for_connector(
credential_swap_req: CredentialSwapRequest,
user: User | None = Depends(current_user),
db_session: Session = Depends(get_session),
tenant_id: str | None = Depends(get_current_tenant_id),
) -> StatusResponse:
validate_ccpair_for_user(
credential_swap_req.connector_id,
credential_swap_req.new_credential_id,
db_session,
tenant_id,
)
connector_credential_pair = swap_credentials_connector(

View File

@@ -122,7 +122,6 @@ class CredentialBase(BaseModel):
name: str | None = None
curator_public: bool = False
groups: list[int] = Field(default_factory=list)
is_user_file: bool = False
class CredentialSnapshot(CredentialBase):
@@ -393,7 +392,7 @@ class FileUploadResponse(BaseModel):
class ObjectCreationIdResponse(BaseModel):
id: int
id: int | str
credential: CredentialSnapshot | None = None

View File

@@ -18,9 +18,9 @@ from onyx.db.models import User
from onyx.server.features.folder.models import DeleteFolderOptions
from onyx.server.features.folder.models import FolderChatSessionRequest
from onyx.server.features.folder.models import FolderCreationRequest
from onyx.server.features.folder.models import FolderResponse
from onyx.server.features.folder.models import FolderUpdateRequest
from onyx.server.features.folder.models import GetUserFoldersResponse
from onyx.server.features.folder.models import UserFolderSnapshot
from onyx.server.models import DisplayPriorityRequest
from onyx.server.query_and_chat.models import ChatSessionDetails
@@ -39,7 +39,7 @@ def get_folders(
folders.sort()
return GetUserFoldersResponse(
folders=[
UserFolderSnapshot(
FolderResponse(
folder_id=folder.id,
folder_name=folder.name,
display_priority=folder.display_priority,

View File

@@ -5,7 +5,7 @@ from pydantic import BaseModel
from onyx.server.query_and_chat.models import ChatSessionDetails
class UserFolderSnapshot(BaseModel):
class FolderResponse(BaseModel):
folder_id: int
folder_name: str | None
display_priority: int
@@ -13,7 +13,7 @@ class UserFolderSnapshot(BaseModel):
class GetUserFoldersResponse(BaseModel):
folders: list[UserFolderSnapshot]
folders: list[FolderResponse]
class FolderCreationRequest(BaseModel):

View File

@@ -26,7 +26,6 @@ from onyx.db.persona import create_assistant_label
from onyx.db.persona import create_update_persona
from onyx.db.persona import delete_persona_label
from onyx.db.persona import get_assistant_labels
from shared_configs.contextvars import get_current_tenant_id
from onyx.db.persona import get_persona_by_id
from onyx.db.persona import get_personas_for_user
from onyx.db.persona import mark_persona_as_deleted
@@ -56,9 +55,11 @@ from onyx.server.models import DisplayPriorityRequest
from onyx.tools.utils import is_image_generation_available
from onyx.utils.logger import setup_logger
from onyx.utils.telemetry import create_milestone_and_report
from shared_configs.contextvars import get_current_tenant_id
logger = setup_logger()
admin_router = APIRouter(prefix="/admin/persona")
basic_router = APIRouter(prefix="/persona")
@@ -209,7 +210,6 @@ def create_persona(
and len(persona_upsert_request.prompt_ids) > 0
else None
)
prompt = upsert_prompt(
db_session=db_session,
user=user,

View File

@@ -85,8 +85,6 @@ class PersonaUpsertRequest(BaseModel):
label_ids: list[int] | None = None
is_default_persona: bool = False
display_priority: int | None = None
user_file_ids: list[int] | None = None
user_folder_ids: list[int] | None = None
class PersonaSnapshot(BaseModel):

View File

@@ -4,7 +4,6 @@ from pydantic import BaseModel
from pydantic import Field
from onyx.llm.llm_provider_options import fetch_models_for_provider
from onyx.llm.utils import get_max_input_tokens
if TYPE_CHECKING:
@@ -36,36 +35,22 @@ class LLMProviderDescriptor(BaseModel):
fast_default_model_name: str | None
is_default_provider: bool | None
display_model_names: list[str] | None
model_token_limits: dict[str, int] | None = None
@classmethod
def from_model(
cls, llm_provider_model: "LLMProviderModel"
) -> "LLMProviderDescriptor":
model_names = (
llm_provider_model.model_names
or fetch_models_for_provider(llm_provider_model.provider)
or [llm_provider_model.default_model_name]
)
model_token_rate = (
{
model_name: get_max_input_tokens(
model_name, llm_provider_model.provider
)
for model_name in model_names
}
if model_names is not None
else None
)
return cls(
name=llm_provider_model.name,
provider=llm_provider_model.provider,
default_model_name=llm_provider_model.default_model_name,
fast_default_model_name=llm_provider_model.fast_default_model_name,
is_default_provider=llm_provider_model.is_default_provider,
model_names=model_names,
model_token_limits=model_token_rate,
model_names=(
llm_provider_model.model_names
or fetch_models_for_provider(llm_provider_model.provider)
or [llm_provider_model.default_model_name]
),
display_model_names=llm_provider_model.display_model_names,
)
@@ -95,7 +80,6 @@ class FullLLMProvider(LLMProvider):
id: int
is_default_provider: bool | None = None
model_names: list[str]
model_token_limits: dict[str, int] | None = None
@classmethod
def from_model(cls, llm_provider_model: "LLMProviderModel") -> "FullLLMProvider":
@@ -116,14 +100,6 @@ class FullLLMProvider(LLMProvider):
or fetch_models_for_provider(llm_provider_model.provider)
or [llm_provider_model.default_model_name]
),
model_token_limits={
model_name: get_max_input_tokens(
model_name, llm_provider_model.provider
)
for model_name in llm_provider_model.model_names
}
if llm_provider_model.model_names is not None
else None,
is_public=llm_provider_model.is_public,
groups=[group.id for group in llm_provider_model.groups],
deployment_name=llm_provider_model.deployment_name,

View File

@@ -3,7 +3,6 @@ import datetime
import io
import json
import os
import time
import uuid
from collections.abc import Callable
from collections.abc import Generator
@@ -30,12 +29,10 @@ from onyx.chat.prompt_builder.citations_prompt import (
compute_max_document_tokens_for_persona,
)
from onyx.configs.app_configs import WEB_DOMAIN
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import FileOrigin
from onyx.configs.constants import MessageType
from onyx.configs.constants import MilestoneRecordType
from onyx.configs.model_configs import LITELLM_PASS_THROUGH_HEADERS
from onyx.connectors.models import InputType
from onyx.db.chat import add_chats_to_session_from_slack_thread
from onyx.db.chat import create_chat_session
from onyx.db.chat import create_new_chat_message
@@ -50,18 +47,13 @@ from onyx.db.chat import get_or_create_root_message
from onyx.db.chat import set_as_latest_chat_message
from onyx.db.chat import translate_db_message_to_chat_message_detail
from onyx.db.chat import update_chat_session
from onyx.db.connector import create_connector
from onyx.db.connector_credential_pair import add_credential_to_connector
from onyx.db.credentials import create_credential
from onyx.db.chat_search import search_chat_sessions
from onyx.db.engine import get_session
from onyx.db.engine import get_session_with_tenant
from onyx.db.enums import AccessType
from onyx.db.feedback import create_chat_message_feedback
from onyx.db.feedback import create_doc_retrieval_feedback
from onyx.db.models import User
from onyx.db.persona import get_persona_by_id
from onyx.db.user_documents import create_user_files
from onyx.file_processing.extract_file_text import docx_to_txt_filename
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.file_store.file_store import get_default_file_store
@@ -74,8 +66,6 @@ from onyx.natural_language_processing.utils import get_tokenizer
from onyx.secondary_llm_flows.chat_session_naming import (
get_renamed_conversation_name,
)
from onyx.server.documents.models import ConnectorBase
from onyx.server.documents.models import CredentialBase
from onyx.server.query_and_chat.models import ChatFeedbackRequest
from onyx.server.query_and_chat.models import ChatMessageIdentifier
from onyx.server.query_and_chat.models import ChatRenameRequest
@@ -101,7 +91,6 @@ from onyx.utils.logger import setup_logger
from onyx.utils.telemetry import create_milestone_and_report
from shared_configs.contextvars import get_current_tenant_id
RECENT_DOCS_FOLDER_ID = -1
logger = setup_logger()
@@ -658,7 +647,7 @@ def seed_chat_from_slack(
def upload_files_for_chat(
files: list[UploadFile],
db_session: Session = Depends(get_session),
user: User | None = Depends(current_user),
_: User | None = Depends(current_user),
) -> dict[str, list[FileDescriptor]]:
image_content_types = {"image/jpeg", "image/png", "image/webp"}
csv_content_types = {"text/csv"}
@@ -696,11 +685,17 @@ def upload_files_for_chat(
if file.content_type in image_content_types:
error_detail = "Unsupported image file type. Supported image types include .jpg, .jpeg, .png, .webp."
elif file.content_type in text_content_types:
error_detail = "Unsupported text file type."
error_detail = "Unsupported text file type. Supported text types include .txt, .csv, .md, .mdx, .conf, "
".log, .tsv."
elif file.content_type in csv_content_types:
error_detail = "Unsupported CSV file type."
error_detail = (
"Unsupported CSV file type. Supported CSV types include .csv."
)
else:
error_detail = "Unsupported document file type."
error_detail = (
"Unsupported document file type. Supported document types include .pdf, .docx, .pptx, .xlsx, "
".json, .xml, .yml, .yaml, .eml, .epub."
)
raise HTTPException(status_code=400, detail=error_detail)
if (
@@ -748,12 +743,11 @@ def upload_files_for_chat(
file_type=new_content_type or file_type.value,
)
# 4) If the file is a doc, extract text and store that separately
# if the file is a doc, extract text and store that so we don't need
# to re-extract it every time we send a message
if file_type == ChatFileType.DOC:
# Re-wrap bytes in a fresh BytesIO so we start at position 0
extracted_text_io = io.BytesIO(file_content)
extracted_text = extract_file_text(
file=extracted_text_io, # use the bytes we already read
file=file_content_io, # use the bytes we already read
file_name=file.filename or "",
)
text_file_id = str(uuid.uuid4())
@@ -765,57 +759,13 @@ def upload_files_for_chat(
file_origin=FileOrigin.CHAT_UPLOAD,
file_type="text/plain",
)
# Return the text file as the "main" file descriptor for doc types
# for DOC type, just return this for the FileDescriptor
# as we would always use this as the ID to attach to the
# message
file_info.append((text_file_id, file.filename, ChatFileType.PLAIN_TEXT))
else:
file_info.append((file_id, file.filename, file_type))
# 5) Create a user file for each uploaded file
user_files = create_user_files([file], RECENT_DOCS_FOLDER_ID, user, db_session)
for user_file in user_files:
# 6) Create connector
connector_base = ConnectorBase(
name=f"UserFile-{int(time.time())}",
source=DocumentSource.FILE,
input_type=InputType.LOAD_STATE,
connector_specific_config={
"file_locations": [user_file.file_id],
},
refresh_freq=None,
prune_freq=None,
indexing_start=None,
)
connector = create_connector(
db_session=db_session,
connector_data=connector_base,
)
# 7) Create credential
credential_info = CredentialBase(
credential_json={},
admin_public=True,
source=DocumentSource.FILE,
curator_public=True,
groups=[],
name=f"UserFileCredential-{int(time.time())}",
is_user_file=True,
)
credential = create_credential(credential_info, user, db_session)
# 8) Create connector credential pair
cc_pair = add_credential_to_connector(
db_session=db_session,
user=user,
connector_id=connector.id,
credential_id=credential.id,
cc_pair_name=f"UserFileCCPair-{int(time.time())}",
access_type=AccessType.PRIVATE,
auto_sync_options=None,
groups=[],
)
user_file.cc_pair_id = cc_pair.data
db_session.commit()
return {
"files": [
{"id": file_id, "type": file_type, "name": file_name}

View File

@@ -92,8 +92,6 @@ class CreateChatMessageRequest(ChunkContext):
message: str
# Files that we should attach to this message
file_descriptors: list[FileDescriptor]
user_file_ids: list[int] = []
user_folder_ids: list[int] = []
# If no prompt provided, uses the largest prompt of the chat session
# but really this should be explicitly specified, only in the simplified APIs is this inferred
@@ -120,7 +118,7 @@ class CreateChatMessageRequest(ChunkContext):
# this does persist in the chat thread details
temperature_override: float | None = None
# allow user to specify an alternate assistant
alternate_assistant_id: int | None = None
# This takes the priority over the prompt_override
@@ -137,8 +135,6 @@ class CreateChatMessageRequest(ChunkContext):
# https://platform.openai.com/docs/guides/structured-outputs/introduction
structured_response_format: dict | None = None
force_user_file_search: bool = False
# If true, ignores most of the search options and uses pro search instead.
# TODO: decide how many of the above options we want to pass through to pro search
use_agentic_search: bool = False

View File

@@ -5,7 +5,6 @@ from pydantic import BaseModel
from onyx.configs.constants import NotificationType
from onyx.db.models import Notification as NotificationDBModel
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
class PageType(str, Enum):
@@ -55,4 +54,4 @@ class Settings(BaseModel):
class UserSettings(Settings):
notifications: list[Notification]
needs_reindexing: bool
tenant_id: str = POSTGRES_DEFAULT_SCHEMA
tenant_id: str | None = None

View File

@@ -1,443 +0,0 @@
import io
import time
from typing import List
import requests
import sqlalchemy.exc
from bs4 import BeautifulSoup
from fastapi import APIRouter
from fastapi import Depends
from fastapi import File
from fastapi import Form
from fastapi import HTTPException
from fastapi import UploadFile
from pydantic import BaseModel
from sqlalchemy.orm import Session
from onyx.auth.users import current_user
from onyx.configs.constants import DocumentSource
from onyx.connectors.models import InputType
from onyx.db.connector import create_connector
from onyx.db.connector_credential_pair import add_credential_to_connector
from onyx.db.credentials import create_credential
from onyx.db.engine import get_session
from onyx.db.enums import AccessType
from onyx.db.models import User
from onyx.db.models import UserFile
from onyx.db.models import UserFolder
from onyx.db.user_documents import create_user_files
from onyx.db.user_documents import share_file_with_assistant
from onyx.db.user_documents import share_folder_with_assistant
from onyx.db.user_documents import unshare_file_with_assistant
from onyx.db.user_documents import unshare_folder_with_assistant
from onyx.file_processing.html_utils import web_html_cleanup
from onyx.server.documents.models import ConnectorBase
from onyx.server.documents.models import CredentialBase
from onyx.server.documents.models import FileUploadResponse
from onyx.server.user_documents.models import MessageResponse
from onyx.server.user_documents.models import UserFileSnapshot
from onyx.server.user_documents.models import UserFolderSnapshot
router = APIRouter()
class FolderCreationRequest(BaseModel):
name: str
description: str
@router.post("/user/folder")
def create_folder(
request: FolderCreationRequest,
user: User = Depends(current_user),
db_session: Session = Depends(get_session),
) -> UserFolderSnapshot:
try:
new_folder = UserFolder(
user_id=user.id if user else None,
name=request.name,
description=request.description,
)
db_session.add(new_folder)
db_session.commit()
return UserFolderSnapshot.from_model(new_folder)
except sqlalchemy.exc.DataError as e:
if "StringDataRightTruncation" in str(e):
raise HTTPException(
status_code=400,
detail="Folder name or description is too long. Please use a shorter name or description.",
)
raise
@router.get(
"/user/folder",
)
def get_folders(
user: User = Depends(current_user),
db_session: Session = Depends(get_session),
) -> list[UserFolderSnapshot]:
user_id = user.id if user else None
folders = db_session.query(UserFolder).filter(UserFolder.user_id == user_id).all()
return [UserFolderSnapshot.from_model(folder) for folder in folders]
@router.get("/user/folder/{folder_id}")
def get_folder(
folder_id: int,
user: User | None = Depends(current_user),
db_session: Session = Depends(get_session),
) -> UserFolderSnapshot:
user_id = user.id if user else None
folder = (
db_session.query(UserFolder)
.filter(UserFolder.id == folder_id, UserFolder.user_id == user_id)
.first()
)
if not folder:
raise HTTPException(status_code=404, detail="Folder not found")
return UserFolderSnapshot.from_model(folder)
RECENT_DOCS_FOLDER_ID = -1
@router.post("/user/file/upload")
def upload_user_files(
files: List[UploadFile] = File(...),
folder_id: int | None = Form(None),
user: User = Depends(current_user),
db_session: Session = Depends(get_session),
) -> FileUploadResponse:
if folder_id == 0:
folder_id = None
user_files = create_user_files(files, folder_id, user, db_session)
for user_file in user_files:
connector_base = ConnectorBase(
name=f"UserFile-{user_file.file_id}-{int(time.time())}",
source=DocumentSource.FILE,
input_type=InputType.LOAD_STATE,
connector_specific_config={
"file_locations": [user_file.file_id],
},
refresh_freq=None,
prune_freq=None,
indexing_start=None,
)
connector = create_connector(
db_session=db_session,
connector_data=connector_base,
)
credential_info = CredentialBase(
credential_json={},
admin_public=True,
source=DocumentSource.FILE,
curator_public=True,
groups=[],
name=f"UserFileCredential-{user_file.file_id}-{int(time.time())}",
is_user_file=True,
)
credential = create_credential(credential_info, user, db_session)
cc_pair = add_credential_to_connector(
db_session=db_session,
user=user,
connector_id=connector.id,
credential_id=credential.id,
cc_pair_name=f"UserFileCCPair-{user_file.file_id}-{int(time.time())}",
access_type=AccessType.PRIVATE,
auto_sync_options=None,
groups=[],
is_user_file=True,
)
user_file.cc_pair_id = cc_pair.data
print("A")
db_session.commit()
db_session.commit()
# TODO: functional document indexing
# trigger_document_indexing(db_session, user.id)
return FileUploadResponse(
file_paths=[user_file.file_id for user_file in user_files],
)
@router.put("/user/folder/{folder_id}")
def update_folder(
folder_id: int,
name: str,
user: User | None = Depends(current_user),
db_session: Session = Depends(get_session),
) -> UserFolderSnapshot:
user_id = user.id if user else None
folder = (
db_session.query(UserFolder)
.filter(UserFolder.id == folder_id, UserFolder.user_id == user_id)
.first()
)
if not folder:
raise HTTPException(status_code=404, detail="Folder not found")
folder.name = name
db_session.commit()
return UserFolderSnapshot.from_model(folder)
@router.delete("/user/folder/{folder_id}")
def delete_folder(
folder_id: int,
user: User = Depends(current_user),
db_session: Session = Depends(get_session),
) -> MessageResponse:
user_id = user.id if user else None
folder = (
db_session.query(UserFolder)
.filter(UserFolder.id == folder_id, UserFolder.user_id == user_id)
.first()
)
if not folder:
raise HTTPException(status_code=404, detail="Folder not found")
db_session.delete(folder)
db_session.commit()
return MessageResponse(message="Folder deleted successfully")
@router.delete("/user/file/{file_id}")
def delete_file(
file_id: int,
user: User = Depends(current_user),
db_session: Session = Depends(get_session),
) -> MessageResponse:
user_id = user.id if user else None
file = (
db_session.query(UserFile)
.filter(UserFile.id == file_id, UserFile.user_id == user_id)
.first()
)
if not file:
raise HTTPException(status_code=404, detail="File not found")
db_session.delete(file)
db_session.commit()
return MessageResponse(message="File deleted successfully")
class FileMoveRequest(BaseModel):
new_folder_id: int | None
@router.put("/user/file/{file_id}/move")
def move_file(
file_id: int,
request: FileMoveRequest,
user: User = Depends(current_user),
db_session: Session = Depends(get_session),
) -> UserFileSnapshot:
user_id = user.id if user else None
file = (
db_session.query(UserFile)
.filter(UserFile.id == file_id, UserFile.user_id == user_id)
.first()
)
if not file:
raise HTTPException(status_code=404, detail="File not found")
file.folder_id = request.new_folder_id
db_session.commit()
return UserFileSnapshot.from_model(file)
@router.get("/user/file-system")
def get_file_system(
user: User = Depends(current_user),
db_session: Session = Depends(get_session),
) -> list[UserFolderSnapshot]:
user_id = user.id if user else None
folders = db_session.query(UserFolder).filter(UserFolder.user_id == user_id).all()
return [UserFolderSnapshot.from_model(folder) for folder in folders]
@router.put("/user/file/{file_id}/rename")
def rename_file(
file_id: int,
name: str,
user: User = Depends(current_user),
db_session: Session = Depends(get_session),
) -> UserFileSnapshot:
user_id = user.id if user else None
file = (
db_session.query(UserFile)
.filter(UserFile.id == file_id, UserFile.user_id == user_id)
.first()
)
if not file:
raise HTTPException(status_code=404, detail="File not found")
file.name = name
db_session.commit()
return UserFileSnapshot.from_model(file)
class ShareRequest(BaseModel):
assistant_id: int
@router.post("/user/file/{file_id}/share")
def share_file(
file_id: int,
request: ShareRequest,
user: User = Depends(current_user),
db_session: Session = Depends(get_session),
) -> MessageResponse:
user_id = user.id if user else None
file = (
db_session.query(UserFile)
.filter(UserFile.id == file_id, UserFile.user_id == user_id)
.first()
)
if not file:
raise HTTPException(status_code=404, detail="File not found")
share_file_with_assistant(file_id, request.assistant_id, db_session)
return MessageResponse(message="File shared successfully with the assistant")
@router.post("/user/file/{file_id}/unshare")
def unshare_file(
file_id: int,
request: ShareRequest,
user: User = Depends(current_user),
db_session: Session = Depends(get_session),
) -> MessageResponse:
user_id = user.id if user else None
file = (
db_session.query(UserFile)
.filter(UserFile.id == file_id, UserFile.user_id == user_id)
.first()
)
if not file:
raise HTTPException(status_code=404, detail="File not found")
unshare_file_with_assistant(file_id, request.assistant_id, db_session)
return MessageResponse(message="File unshared successfully from the assistant")
@router.post("/user/folder/{folder_id}/share")
def share_folder(
folder_id: int,
request: ShareRequest,
user: User = Depends(current_user),
db_session: Session = Depends(get_session),
) -> MessageResponse:
user_id = user.id if user else None
folder = (
db_session.query(UserFolder)
.filter(UserFolder.id == folder_id, UserFolder.user_id == user_id)
.first()
)
if not folder:
raise HTTPException(status_code=404, detail="Folder not found")
share_folder_with_assistant(folder_id, request.assistant_id, db_session)
return MessageResponse(
message="Folder and its files shared successfully with the assistant"
)
@router.post("/user/folder/{folder_id}/unshare")
def unshare_folder(
folder_id: int,
request: ShareRequest,
user: User = Depends(current_user),
db_session: Session = Depends(get_session),
) -> MessageResponse:
user_id = user.id if user else None
folder = (
db_session.query(UserFolder)
.filter(UserFolder.id == folder_id, UserFolder.user_id == user_id)
.first()
)
if not folder:
raise HTTPException(status_code=404, detail="Folder not found")
unshare_folder_with_assistant(folder_id, request.assistant_id, db_session)
return MessageResponse(
message="Folder and its files unshared successfully from the assistant"
)
class CreateFileFromLinkRequest(BaseModel):
url: str
folder_id: int | None
@router.post("/user/file/create-from-link")
def create_file_from_link(
request: CreateFileFromLinkRequest,
user: User = Depends(current_user),
db_session: Session = Depends(get_session),
) -> FileUploadResponse:
try:
response = requests.get(request.url)
response.raise_for_status()
content = response.text
soup = BeautifulSoup(content, "html.parser")
parsed_html = web_html_cleanup(soup, mintlify_cleanup_enabled=False)
file_name = f"{parsed_html.title or 'Untitled'}.txt"
file_content = parsed_html.cleaned_text.encode()
file = UploadFile(filename=file_name, file=io.BytesIO(file_content))
user_files = create_user_files([file], request.folder_id, user, db_session)
# Create connector and credential (same as in upload_user_files)
for user_file in user_files:
connector_base = ConnectorBase(
name=f"UserFile-{user_file.file_id}-{int(time.time())}",
source=DocumentSource.FILE,
input_type=InputType.LOAD_STATE,
connector_specific_config={
"file_locations": [user_file.file_id],
},
refresh_freq=None,
prune_freq=None,
indexing_start=None,
)
connector = create_connector(
db_session=db_session,
connector_data=connector_base,
)
credential_info = CredentialBase(
credential_json={},
admin_public=True,
source=DocumentSource.FILE,
curator_public=True,
groups=[],
name=f"UserFileCredential-{user_file.file_id}-{int(time.time())}",
)
credential = create_credential(credential_info, user, db_session)
cc_pair = add_credential_to_connector(
db_session=db_session,
user=user,
connector_id=connector.id,
credential_id=credential.id,
cc_pair_name=f"UserFileCCPair-{int(time.time())}",
access_type=AccessType.PRIVATE,
auto_sync_options=None,
groups=[],
is_user_file=True,
)
user_file.cc_pair_id = cc_pair.data
db_session.commit()
return FileUploadResponse(
file_paths=[user_file.file_id for user_file in user_files]
)
except requests.RequestException as e:
raise HTTPException(status_code=400, detail=f"Failed to fetch URL: {str(e)}")
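A hedged usage sketch for the create-from-link route follows; the base URL, auth handling, and target page are assumptions. On success the response lists the file IDs of the newly created user files.

import requests

resp = requests.post(
    "http://localhost:8080/api/user/file/create-from-link",  # assumed base URL
    json={"url": "https://example.com/article", "folder_id": None},
    timeout=30,
)
resp.raise_for_status()
print(resp.json()["file_paths"])  # file IDs created from the fetched page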

View File

@@ -1,70 +0,0 @@
from datetime import datetime
from typing import List
from pydantic import BaseModel
from onyx.db.models import UserFile
from onyx.db.models import UserFolder
class UserFileSnapshot(BaseModel):
id: int
name: str
document_id: str
folder_id: int | None = None
user_id: int | None
file_id: str
created_at: datetime
assistant_ids: List[int] = [] # List of assistant IDs
token_count: int | None
indexed: bool
@classmethod
def from_model(cls, model: UserFile) -> "UserFileSnapshot":
return cls(
id=model.id,
name=model.name,
folder_id=model.folder_id,
document_id=model.document_id,
user_id=model.user_id,
file_id=model.file_id,
created_at=model.created_at,
assistant_ids=[assistant.id for assistant in model.assistants],
token_count=model.token_count,
indexed=model.cc_pair.last_successful_index_time is not None
if model.cc_pair
else False,
)
class UserFolderSnapshot(BaseModel):
id: int
name: str
description: str
files: List[UserFileSnapshot]
created_at: datetime
user_id: int | None
assistant_ids: List[int] = [] # List of assistant IDs
token_count: int | None
@classmethod
def from_model(cls, model: UserFolder) -> "UserFolderSnapshot":
return cls(
id=model.id,
name=model.name,
description=model.description,
files=[UserFileSnapshot.from_model(file) for file in model.files],
created_at=model.created_at,
user_id=model.user_id,
assistant_ids=[assistant.id for assistant in model.assistants],
token_count=sum(file.token_count or 0 for file in model.files) or None,
)
class MessageResponse(BaseModel):
message: str
class FileSystemResponse(BaseModel):
folders: list[UserFolderSnapshot]
files: list[UserFileSnapshot]
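One subtlety in UserFolderSnapshot.from_model: "sum(...) or None" maps a zero total to None, so an empty folder and a folder whose files all have a token_count of 0 (or None) are indistinguishable in the snapshot. A quick illustration:

assert (sum([]) or None) is None      # empty folder
assert (sum([0, 0]) or None) is None  # files with zero counted tokens
assert (sum([5, 0]) or None) == 5     # any nonzero total survives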

View File

@@ -65,7 +65,7 @@ logger = setup_logger()
def setup_onyx(
db_session: Session, tenant_id: str, cohere_enabled: bool = False
db_session: Session, tenant_id: str | None, cohere_enabled: bool = False
) -> None:
"""
Setup Onyx for a particular tenant. In the Single Tenant case, it will set it up for the default schema

View File

@@ -138,7 +138,6 @@ def construct_tools(
user: User | None,
llm: LLM,
fast_llm: LLM,
use_file_search: bool,
search_tool_config: SearchToolConfig | None = None,
internet_search_tool_config: InternetSearchToolConfig | None = None,
image_generation_tool_config: ImageGenerationToolConfig | None = None,
@@ -252,33 +251,6 @@ def construct_tools(
for tool_list in tool_dict.values():
tools.extend(tool_list)
if use_file_search:
search_tool_config = SearchToolConfig()
search_tool = SearchTool(
db_session=db_session,
user=user,
persona=persona,
retrieval_options=search_tool_config.retrieval_options,
prompt_config=prompt_config,
llm=llm,
fast_llm=fast_llm,
pruning_config=search_tool_config.document_pruning_config,
answer_style_config=search_tool_config.answer_style_config,
selected_sections=search_tool_config.selected_sections,
chunks_above=search_tool_config.chunks_above,
chunks_below=search_tool_config.chunks_below,
full_doc=search_tool_config.full_doc,
evaluation_type=(
LLMEvaluationType.BASIC
if persona.llm_relevance_filter
else LLMEvaluationType.SKIP
),
rerank_settings=search_tool_config.rerank_settings,
bypass_acl=search_tool_config.bypass_acl,
)
tool_dict[1] = [search_tool]
# factor in tool definition size when pruning
if search_tool_config:
search_tool_config.document_pruning_config.tool_num_tokens = (

View File

@@ -64,7 +64,7 @@ logger = setup_logger()
CUSTOM_TOOL_RESPONSE_ID = "custom_tool_response"
class CustomToolUserFileSnapshot(BaseModel):
class CustomToolFileResponse(BaseModel):
file_ids: List[str] # References to saved images or CSVs
@@ -131,7 +131,7 @@ class CustomTool(BaseTool):
response = cast(CustomToolCallSummary, args[0].response)
if response.response_type == "image" or response.response_type == "csv":
image_response = cast(CustomToolUserFileSnapshot, response.tool_result)
image_response = cast(CustomToolFileResponse, response.tool_result)
return json.dumps({"file_ids": image_response.file_ids})
# For JSON or other responses, return as-is
@@ -267,14 +267,14 @@ class CustomTool(BaseTool):
file_ids = self._save_and_get_file_references(
response.content, content_type
)
tool_result = CustomToolUserFileSnapshot(file_ids=file_ids)
tool_result = CustomToolFileResponse(file_ids=file_ids)
response_type = "csv"
elif "image/" in content_type:
file_ids = self._save_and_get_file_references(
response.content, content_type
)
tool_result = CustomToolUserFileSnapshot(file_ids=file_ids)
tool_result = CustomToolFileResponse(file_ids=file_ids)
response_type = "image"
else:
@@ -358,7 +358,7 @@ class CustomTool(BaseTool):
def final_result(self, *args: ToolResponse) -> JSON_ro:
response = cast(CustomToolCallSummary, args[0].response)
if isinstance(response.tool_result, CustomToolUserFileSnapshot):
if isinstance(response.tool_result, CustomToolFileResponse):
return response.tool_result.model_dump()
return response.tool_result

View File

@@ -260,7 +260,7 @@ def get_documents_for_tenant_connector(
def search_for_document(
index_name: str,
document_id: str | None = None,
tenant_id: str = POSTGRES_DEFAULT_SCHEMA,
tenant_id: str | None = None,
max_hits: int | None = 10,
) -> List[Dict[str, Any]]:
yql_query = f"select * from sources {index_name}"
@@ -444,15 +444,12 @@ def get_document_acls(
response = vespa_client.get(document_url)
if response.status_code == 200:
fields = response.json().get("fields", {})
document_id = fields.get("document_id") or fields.get(
"documentid", "Unknown"
)
acls = fields.get("access_control_list", {})
title = fields.get("title", "")
source_type = fields.get("source_type", "")
doc_sets = fields.get("document_sets", [])
user_file = fields.get("user_file", None)
source_links_raw = fields.get("source_links", "{}")
try:
source_links = json.loads(source_links_raw)
@@ -465,8 +462,6 @@ def get_document_acls(
print(f"Source Links: {source_links}")
print(f"Title: {title}")
print(f"Source Type: {source_type}")
print(f"Document Sets: {doc_sets}")
print(f"User File: {user_file}")
if MULTI_TENANT:
print(f"Tenant ID: {fields.get('tenant_id', 'N/A')}")
print("-" * 80)
@@ -512,9 +507,9 @@ def get_number_of_chunks_we_think_exist(
class VespaDebugging:
# Class for managing Vespa debugging actions.
def __init__(self, tenant_id: str = POSTGRES_DEFAULT_SCHEMA):
def __init__(self, tenant_id: str | None = None):
CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)
self.tenant_id = tenant_id
self.tenant_id = POSTGRES_DEFAULT_SCHEMA if not tenant_id else tenant_id
self.index_name = get_index_name(self.tenant_id)
def sample_document_counts(self) -> None:
@@ -608,7 +603,7 @@ class VespaDebugging:
delete_documents_for_tenant(self.index_name, self.tenant_id, count=count)
def search_for_document(
self, document_id: str | None = None, tenant_id: str = POSTGRES_DEFAULT_SCHEMA
self, document_id: str | None = None, tenant_id: str | None = None
) -> List[Dict[str, Any]]:
return search_for_document(self.index_name, document_id, tenant_id)

View File

@@ -8,7 +8,6 @@ from sqlalchemy.orm import Session
from onyx.db.document import delete_documents_complete__no_commit
from onyx.db.enums import ConnectorCredentialPairStatus
from onyx.db.search_settings import get_active_search_settings
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
# Modify sys.path
current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -75,7 +74,7 @@ def _unsafe_deletion(
for document in documents:
document_index.delete_single(
doc_id=document.id,
tenant_id=POSTGRES_DEFAULT_SCHEMA,
tenant_id=None,
chunk_count=document.chunk_count,
)

View File

@@ -6,7 +6,6 @@ from sqlalchemy import text
from sqlalchemy.orm import Session
from onyx.document_index.document_index_utils import get_multipass_config
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
# makes it so `PYTHONPATH=.` is not required when running this script
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -97,9 +96,7 @@ def main() -> None:
try:
print(f"Deleting document {doc_id} in Vespa")
chunks_deleted = vespa_index.delete_single(
doc_id,
tenant_id=POSTGRES_DEFAULT_SCHEMA,
chunk_count=document.chunk_count,
doc_id, tenant_id=None, chunk_count=document.chunk_count
)
if chunks_deleted > 0:
print(

View File

@@ -18,7 +18,5 @@ CURRENT_TENANT_ID_CONTEXTVAR: contextvars.ContextVar[
def get_current_tenant_id() -> str:
tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get()
if tenant_id is None:
if not MULTI_TENANT:
return POSTGRES_DEFAULT_SCHEMA
raise RuntimeError("Tenant ID is not set. This should never happen.")
return tenant_id
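With the single-tenant fallback removed, callers are expected to have the contextvar populated before the lookup. A minimal sketch, assuming CURRENT_TENANT_ID_CONTEXTVAR is importable from the module shown above and using a hypothetical tenant id:

token = CURRENT_TENANT_ID_CONTEXTVAR.set("tenant_abc")  # hypothetical tenant id
try:
    tenant_id = get_current_tenant_id()  # returns "tenant_abc"
finally:
    CURRENT_TENANT_ID_CONTEXTVAR.reset(token)  # restore the previous value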

View File

@@ -87,7 +87,7 @@ def test_confluence_connector_basic(
assert len(txt_doc.sections) == 1
assert txt_doc.sections[0].text == "small"
assert txt_doc.primary_owners
assert txt_doc.primary_owners[0].email == "chris@onyx.app"
assert txt_doc.primary_owners[0].email == "chris@danswer.ai"
assert (
txt_doc.sections[0].link
== "https://danswerai.atlassian.net/wiki/pages/viewpageattachments.action?pageId=52494430&preview=%2F52494430%2F52527123%2Fsmall-file.txt"

View File

@@ -3,7 +3,7 @@ services:
image: onyxdotapp/onyx-backend:${IMAGE_TAG:-latest}
build:
context: ../../backend
is dockerfile: Dockerfile
dockerfile: Dockerfile
command: >
/bin/sh -c "
alembic upgrade head &&

View File

@@ -1,20 +0,0 @@
{
"compilerOptions": {
"target": "es5",
"lib": ["dom", "dom.iterable", "esnext"],
"allowJs": true,
"skipLibCheck": true,
"strict": true,
"forceConsistentCasingInFileNames": true,
"noEmit": true,
"esModuleInterop": true,
"module": "esnext",
"moduleResolution": "node",
"resolveJsonModule": true,
"isolatedModules": true,
"jsx": "preserve",
"incremental": true
},
"include": ["next-env.d.ts", "**/*.ts", "**/*.tsx"],
"exclude": ["node_modules"]
}

web/package-lock.json (generated): 1243 lines changed

File diff suppressed because it is too large

View File

@@ -20,13 +20,11 @@
"@phosphor-icons/react": "^2.0.8",
"@radix-ui/react-accordion": "^1.2.2",
"@radix-ui/react-checkbox": "^1.1.2",
"@radix-ui/react-context-menu": "^2.2.5",
"@radix-ui/react-collapsible": "^1.1.2",
"@radix-ui/react-dialog": "^1.1.6",
"@radix-ui/react-dropdown-menu": "^2.1.6",
"@radix-ui/react-label": "^2.1.1",
"@radix-ui/react-popover": "^1.1.6",
"@radix-ui/react-progress": "^1.1.1",
"@radix-ui/react-radio-group": "^1.2.2",
"@radix-ui/react-scroll-area": "^1.2.2",
"@radix-ui/react-select": "^2.1.6",
@@ -75,8 +73,6 @@
"recharts": "^2.13.1",
"rehype-katex": "^7.0.1",
"rehype-prism-plus": "^2.0.0",
"rehype-sanitize": "^6.0.0",
"rehype-stringify": "^10.0.1",
"remark-gfm": "^4.0.0",
"remark-math": "^6.0.0",
"semver": "^7.5.4",

View File

@@ -64,10 +64,10 @@ import { debounce } from "lodash";
import { FullLLMProvider } from "../configuration/llm/interfaces";
import StarterMessagesList from "./StarterMessageList";
import { SwitchField } from "@/components/ui/switch";
import { Switch, SwitchField } from "@/components/ui/switch";
import { generateIdenticon } from "@/components/assistants/AssistantIcon";
import { BackButton } from "@/components/BackButton";
import { Checkbox } from "@/components/ui/checkbox";
import { Checkbox, CheckboxField } from "@/components/ui/checkbox";
import { AdvancedOptionsToggle } from "@/components/AdvancedOptionsToggle";
import { MinimalUserSnapshot } from "@/lib/types";
import { useUserGroups } from "@/lib/hooks";
@@ -76,26 +76,12 @@ import {
Option as DropdownOption,
} from "@/components/Dropdown";
import { SourceChip } from "@/app/chat/input/ChatInputBar";
import {
TagIcon,
UserIcon,
FileIcon,
FolderIcon,
InfoIcon,
} from "lucide-react";
import { TagIcon, UserIcon, XIcon, InfoIcon } from "lucide-react";
import { LLMSelector } from "@/components/llm/LLMSelector";
import useSWR from "swr";
import { errorHandlingFetcher } from "@/lib/fetcher";
import { ConfirmEntityModal } from "@/components/modals/ConfirmEntityModal";
import { FilePickerModal } from "@/app/chat/my-documents/components/FilePicker";
import { useDocumentsContext } from "@/app/chat/my-documents/DocumentsContext";
import {
FileResponse,
FolderResponse,
} from "@/app/chat/my-documents/DocumentsContext";
import { RadioGroup } from "@/components/ui/radio-group";
import { RadioGroupItemField } from "@/components/ui/RadioGroupItemField";
import Title from "@/components/ui/title";
import { SEARCH_TOOL_ID } from "@/app/chat/tools/constants";
function findSearchTool(tools: ToolSnapshot[]) {
@@ -161,7 +147,6 @@ export function AssistantEditor({
"#6FFFFF",
];
const [filePickerModalOpen, setFilePickerModalOpen] = useState(false);
const [showAdvancedOptions, setShowAdvancedOptions] = useState(false);
// state to persist across formik reformatting
@@ -236,16 +221,6 @@ export function AssistantEditor({
enabledToolsMap[tool.id] = personaCurrentToolIds.includes(tool.id);
});
const {
selectedFiles,
selectedFolders,
addSelectedFile,
removeSelectedFile,
addSelectedFolder,
removeSelectedFolder,
clearSelectedItems,
} = useDocumentsContext();
const [showVisibilityWarning, setShowVisibilityWarning] = useState(false);
const initialValues = {
@@ -284,9 +259,6 @@ export function AssistantEditor({
(u) => u.id !== existingPersona.owner?.id
) ?? [],
selectedGroups: existingPersona?.groups ?? [],
user_file_ids: existingPersona?.user_file_ids ?? [],
user_folder_ids: existingPersona?.user_folder_ids ?? [],
knowledge_source: "user_files",
is_default_persona: existingPersona?.is_default_persona ?? false,
};
@@ -396,24 +368,6 @@ export function AssistantEditor({
<BackButton />
</div>
)}
{filePickerModalOpen && (
<FilePickerModal
selectedFiles={selectedFiles}
selectedFolders={selectedFolders}
addSelectedFile={addSelectedFile}
removeSelectedFile={removeSelectedFile}
addSelectedFolder={addSelectedFolder}
isOpen={filePickerModalOpen}
onClose={() => {
setFilePickerModalOpen(false);
}}
onSave={() => {
setFilePickerModalOpen(false);
}}
title="Add Documents to your Assistant"
buttonContent="Add to Assistant"
/>
)}
{labelToDelete && (
<ConfirmEntityModal
@@ -480,7 +434,6 @@ export function AssistantEditor({
label_ids: Yup.array().of(Yup.number()),
selectedUsers: Yup.array().of(Yup.object()),
selectedGroups: Yup.array().of(Yup.number()),
knowledge_source: Yup.string().required(),
is_default_persona: Yup.boolean().required(),
})
.test(
@@ -569,12 +522,9 @@ export function AssistantEditor({
? new Date(values.search_start_date)
: null,
num_chunks: numChunks,
user_file_ids: selectedFiles.map((file) => file.id),
user_folder_ids: selectedFolders.map((folder) => folder.id),
};
let personaResponse;
if (isUpdate) {
personaResponse = await updatePersona(
existingPersona.id,
@@ -896,168 +846,77 @@ export function AssistantEditor({
values.enabled_tools_map[searchTool.id] &&
!(user?.role != "admin" && documentSets.length === 0) && (
<CollapsibleSection>
<div>
<Label>Knowledge Source</Label>
<RadioGroup
className="flex flex-col gap-y-4 mt-2"
value={values.knowledge_source}
onValueChange={(value: string) => {
setFieldValue("knowledge_source", value);
}}
>
<RadioGroupItemField
value="user_files"
id="user_files"
label="User Files"
sublabel="Select specific user files and folders for this Assistant to use"
/>
<RadioGroupItemField
value="team_knowledge"
id="team_knowledge"
label="Team Knowledge"
sublabel="Use team-wide document sets for this Assistant"
/>
</RadioGroup>
{values.knowledge_source === "user_files" &&
!existingPersona?.is_default_persona &&
!admin && (
<div className="mt-4">
<div className="flex justify-start gap-x-2 items-center">
<Label>User Files</Label>
<span
className="cursor-pointer text-xs text-primary hover:underline"
onClick={() => setFilePickerModalOpen(true)}
>
Attach Files and Folders
</span>
</div>
<div className="mt-2">
{ccPairs.length > 0 && (
<>
<Label small>Document Sets</Label>
<div>
<SubLabel>
Select which of your user files and folders
this Assistant should use to inform its
responses. If none are specified, the
Assistant will not have access to any
user-specific documents.
<>
Select which{" "}
{!user || user.role === "admin" ? (
<Link
href="/admin/documents/sets"
className="font-semibold underline hover:underline text-text"
target="_blank"
>
Document Sets
</Link>
) : (
"Document Sets"
)}{" "}
this Assistant should use to inform its
responses. If none are specified, the
Assistant will reference all available
documents.
</>
</SubLabel>
<div className="mt-2 mb-4">
<h4 className="text-xs font-normal mb-2">
Selected Files and Folders
</h4>
<div className="flex flex-wrap gap-2">
{selectedFiles.map((file: FileResponse) => (
<SourceChip
key={file.id}
onRemove={() => {
removeSelectedFile(file);
setFieldValue(
"selectedFiles",
values.selectedFiles.filter(
(f: FileResponse) =>
f.id !== file.id
)
);
}}
title={file.name}
icon={<FileIcon size={12} />}
/>
))}
{selectedFolders.map(
(folder: FolderResponse) => (
<SourceChip
key={folder.id}
onRemove={() => {
removeSelectedFolder(folder);
setFieldValue(
"selectedFolders",
values.selectedFolders.filter(
(f: FolderResponse) =>
f.id !== folder.id
)
);
}}
title={folder.name}
icon={<FolderIcon size={12} />}
/>
)
)}
</div>
</div>
</div>
)}
{values.knowledge_source === "team_knowledge" &&
ccPairs.length > 0 && (
<div className="mt-4">
<Label>Team Knowledge</Label>
<div>
<SubLabel>
<>
Select which{" "}
{!user || user.role === "admin" ? (
<Link
href="/admin/documents/sets"
className="font-semibold underline hover:underline text-text"
target="_blank"
>
Team Document Sets
</Link>
) : (
"Team Document Sets"
)}{" "}
this Assistant should use to inform its
responses. If none are specified, the
Assistant will reference all available
documents.
</>
</SubLabel>
</div>
{documentSets.length > 0 ? (
<FieldArray
name="document_set_ids"
render={(arrayHelpers: ArrayHelpers) => (
<div>
<div className="mb-3 mt-2 flex gap-2 flex-wrap text-sm">
{documentSets.map((documentSet) => (
<DocumentSetSelectable
key={documentSet.id}
documentSet={documentSet}
isSelected={values.document_set_ids.includes(
documentSet.id
)}
onSelect={() => {
const index =
values.document_set_ids.indexOf(
documentSet.id
);
if (index !== -1) {
arrayHelpers.remove(index);
} else {
arrayHelpers.push(
documentSet.id
);
}
}}
/>
))}
</div>
{documentSets.length > 0 ? (
<FieldArray
name="document_set_ids"
render={(arrayHelpers: ArrayHelpers) => (
<div>
<div className="mb-3 mt-2 flex gap-2 flex-wrap text-sm">
{documentSets.map((documentSet) => (
<DocumentSetSelectable
key={documentSet.id}
documentSet={documentSet}
isSelected={values.document_set_ids.includes(
documentSet.id
)}
onSelect={() => {
const index =
values.document_set_ids.indexOf(
documentSet.id
);
if (index !== -1) {
arrayHelpers.remove(index);
} else {
arrayHelpers.push(
documentSet.id
);
}
}}
/>
))}
</div>
)}
/>
) : (
<p className="text-sm">
<Link
href="/admin/documents/sets/new"
className="text-primary hover:underline"
>
+ Create Document Set
</Link>
</p>
)}
</div>
)}
</div>
)}
/>
) : (
<p className="text-sm">
<Link
href="/admin/documents/sets/new"
className="text-primary hover:underline"
>
+ Create Document Set
</Link>
</p>
)}
</>
)}
</div>
</CollapsibleSection>
)}

View File

@@ -1,106 +0,0 @@
import {
FileResponse,
FolderResponse,
} from "@/app/chat/my-documents/DocumentsContext";
export interface AssistantFileChanges {
filesToShare: number[];
filesToUnshare: number[];
foldersToShare: number[];
foldersToUnshare: number[];
}
export function calculateFileChanges(
existingFileIds: number[],
existingFolderIds: number[],
selectedFiles: FileResponse[],
selectedFolders: FolderResponse[]
): AssistantFileChanges {
const selectedFileIds = selectedFiles.map((file) => file.id);
const selectedFolderIds = selectedFolders.map((folder) => folder.id);
return {
filesToShare: selectedFileIds.filter((id) => !existingFileIds.includes(id)),
filesToUnshare: existingFileIds.filter(
(id) => !selectedFileIds.includes(id)
),
foldersToShare: selectedFolderIds.filter(
(id) => !existingFolderIds.includes(id)
),
foldersToUnshare: existingFolderIds.filter(
(id) => !selectedFolderIds.includes(id)
),
};
}
export async function shareFiles(
assistantId: number,
fileIds: number[]
): Promise<void> {
for (const fileId of fileIds) {
await fetch(`/api/user/file/${fileId}/share`, {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({ assistant_id: assistantId }),
});
}
}
export async function unshareFiles(
assistantId: number,
fileIds: number[]
): Promise<void> {
for (const fileId of fileIds) {
await fetch(`/api/user/file/${fileId}/unshare`, {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({ assistant_id: assistantId }),
});
}
}
export async function shareFolders(
assistantId: number,
folderIds: number[]
): Promise<void> {
for (const folderId of folderIds) {
await fetch(`/api/user/folder/${folderId}/share`, {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({ assistant_id: assistantId }),
});
}
}
export async function unshareFolders(
assistantId: number,
folderIds: number[]
): Promise<void> {
for (const folderId of folderIds) {
await fetch(`/api/user/folder/${folderId}/unshare`, {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({ assistant_id: assistantId }),
});
}
}
export async function updateAssistantFiles(
assistantId: number,
changes: AssistantFileChanges
): Promise<void> {
await Promise.all([
shareFiles(assistantId, changes.filesToShare),
unshareFiles(assistantId, changes.filesToUnshare),
shareFolders(assistantId, changes.foldersToShare),
unshareFolders(assistantId, changes.foldersToUnshare),
]);
}
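The delta computation above is plain set difference: share what is newly selected, unshare what was deselected, and leave everything else untouched so only the changed IDs generate API calls. An equivalent sketch in Python (names are illustrative):

def calculate_file_changes(
    existing_ids: set[int], selected_ids: set[int]
) -> tuple[set[int], set[int]]:
    # returns (to_share, to_unshare)
    return selected_ids - existing_ids, existing_ids - selected_ids

to_share, to_unshare = calculate_file_changes({1, 2}, {2, 3})
assert to_share == {3} and to_unshare == {1}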

Some files were not shown because too many files have changed in this diff.