k

quick fix - validated locally
k
2026-03-15 12:42:39 +00:00 · 2025-03-06 17:26:09 -08:00 · 2025-03-06 17:25:44 -08:00 · 2025-03-06 16:43:51 -08:00 · 2025-03-06 16:43:07 -08:00 · 2025-03-06 16:39:39 -08:00
46 changed files with 1086 additions and 181 deletions
--- a/backend/alembic/versions/3934b1bc7b62_update_github_connector_repo_name_to_.py
+++ b/backend/alembic/versions/3934b1bc7b62_update_github_connector_repo_name_to_.py
@@ -0,0 +1,125 @@
+"""Update GitHub connector repo_name to repositories
+
+Revision ID: 3934b1bc7b62
+Revises: b7c2b63c4a03
+Create Date: 2025-03-05 10:50:30.516962
+
+"""
+from alembic import op
+import sqlalchemy as sa
+import json
+import logging
+
+# revision identifiers, used by Alembic.
+revision = "3934b1bc7b62"
+down_revision = "b7c2b63c4a03"
+branch_labels = None
+depends_on = None
+
+logger = logging.getLogger("alembic.runtime.migration")
+
+
+def upgrade() -> None:
+    # Get all GitHub connectors
+    conn = op.get_bind()
+
+    # First get all GitHub connectors
+    github_connectors = conn.execute(
+        sa.text(
+            """
+            SELECT id, connector_specific_config
+            FROM connector
+            WHERE source = 'GITHUB'
+            """
+        )
+    ).fetchall()
+
+    # Update each connector's config
+    updated_count = 0
+    for connector_id, config in github_connectors:
+        try:
+            if not config:
+                logger.warning(f"Connector {connector_id} has no config, skipping")
+                continue
+
+            # Parse the config if it's a string
+            if isinstance(config, str):
+                config = json.loads(config)
+
+            if "repo_name" not in config:
+                continue
+
+            # Create new config with repositories instead of repo_name
+            new_config = dict(config)
+            repo_name_value = new_config.pop("repo_name")
+            new_config["repositories"] = repo_name_value
+
+            # Update the connector with the new config
+            conn.execute(
+                sa.text(
+                    """
+                    UPDATE connector
+                    SET connector_specific_config = :new_config
+                    WHERE id = :connector_id
+                    """
+                ),
+                {"connector_id": connector_id, "new_config": json.dumps(new_config)},
+            )
+            updated_count += 1
+        except Exception as e:
+            logger.error(f"Error updating connector {connector_id}: {str(e)}")
+
+
+def downgrade() -> None:
+    # Get all GitHub connectors
+    conn = op.get_bind()
+
+    logger.debug(
+        "Starting rollback of GitHub connectors from repositories to repo_name"
+    )
+
+    github_connectors = conn.execute(
+        sa.text(
+            """
+            SELECT id, connector_specific_config
+            FROM connector
+            WHERE source = 'GITHUB'
+            """
+        )
+    ).fetchall()
+
+    logger.debug(f"Found {len(github_connectors)} GitHub connectors to rollback")
+
+    # Revert each GitHub connector to use repo_name instead of repositories
+    reverted_count = 0
+    for connector_id, config in github_connectors:
+        try:
+            if not config:
+                continue
+
+            # Parse the config if it's a string
+            if isinstance(config, str):
+                config = json.loads(config)
+
+            if "repositories" not in config:
+                continue
+
+            # Create new config with repo_name instead of repositories
+            new_config = dict(config)
+            repositories_value = new_config.pop("repositories")
+            new_config["repo_name"] = repositories_value
+
+            # Update the connector with the new config
+            conn.execute(
+                sa.text(
+                    """
+                    UPDATE connector
+                    SET connector_specific_config = :new_config
+                    WHERE id = :connector_id
+                    """
+                ),
+                {"new_config": json.dumps(new_config), "connector_id": connector_id},
+            )
+            reverted_count += 1
+        except Exception as e:
+            logger.error(f"Error reverting connector {connector_id}: {str(e)}")
--- a/backend/alembic_tenants/versions/3b45e0018bf1_add_new_available_tenant_table.py
+++ b/backend/alembic_tenants/versions/3b45e0018bf1_add_new_available_tenant_table.py
@@ -0,0 +1,33 @@
+"""add new available tenant table
+
+Revision ID: 3b45e0018bf1
+Revises: 34e3630c7f32
+Create Date: 2025-03-06 09:55:18.229910
+
+"""
+import sqlalchemy as sa
+
+from alembic import op
+
+
+# revision identifiers, used by Alembic.
+revision = "3b45e0018bf1"
+down_revision = "34e3630c7f32"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # Create new_available_tenant table
+    op.create_table(
+        "available_tenant",
+        sa.Column("tenant_id", sa.String(), nullable=False),
+        sa.Column("alembic_version", sa.String(), nullable=False),
+        sa.Column("date_created", sa.DateTime(), nullable=False),
+        sa.PrimaryKeyConstraint("tenant_id"),
+    )
+
+
+def downgrade() -> None:
+    # Drop new_available_tenant table
+    op.drop_table("available_tenant")
--- a/backend/ee/onyx/db/query_history.py
+++ b/backend/ee/onyx/db/query_history.py
@@ -134,7 +134,9 @@ def fetch_chat_sessions_eagerly_by_time(
    limit: int | None = 500,
    initial_time: datetime | None = None,
 ) -> list[ChatSession]:
-    time_order: UnaryExpression = desc(ChatSession.time_created)
+    """Sorted by oldest to newest, then by message id"""
+
+    asc_time_order: UnaryExpression = asc(ChatSession.time_created)
    message_order: UnaryExpression = asc(ChatMessage.id)

    filters: list[ColumnElement | BinaryExpression] = [
@@ -147,8 +149,7 @@ def fetch_chat_sessions_eagerly_by_time(
    subquery = (
        db_session.query(ChatSession.id, ChatSession.time_created)
        .filter(*filters)
-        .order_by(ChatSession.id, time_order)
-        .distinct(ChatSession.id)
+        .order_by(asc_time_order)
        .limit(limit)
        .subquery()
    )
@@ -164,7 +165,7 @@ def fetch_chat_sessions_eagerly_by_time(
                ChatMessage.chat_message_feedbacks
            ),
        )
-        .order_by(time_order, message_order)
+        .order_by(asc_time_order, message_order)
    )

    chat_sessions = query.all()
--- a/backend/ee/onyx/db/usage_export.py
+++ b/backend/ee/onyx/db/usage_export.py
@@ -16,13 +16,18 @@ from onyx.db.models import UsageReport
 from onyx.file_store.file_store import get_default_file_store


-# Gets skeletons of all message
+# Gets skeletons of all messages in the given range
 def get_empty_chat_messages_entries__paginated(
    db_session: Session,
    period: tuple[datetime, datetime],
    limit: int | None = 500,
    initial_time: datetime | None = None,
 ) -> tuple[Optional[datetime], list[ChatMessageSkeleton]]:
+    """Returns a tuple where:
+    first element is the most recent timestamp out of the sessions iterated
+    - this timestamp can be used to paginate forward in time
+    second element is a list of messages belonging to all the sessions iterated
+    """
    chat_sessions = fetch_chat_sessions_eagerly_by_time(
        start=period[0],
        end=period[1],
@@ -52,18 +57,17 @@ def get_empty_chat_messages_entries__paginated(
    if len(chat_sessions) == 0:
        return None, []

-    return chat_sessions[0].time_created, message_skeletons
+    return chat_sessions[-1].time_created, message_skeletons


 def get_all_empty_chat_message_entries(
    db_session: Session,
    period: tuple[datetime, datetime],
 ) -> Generator[list[ChatMessageSkeleton], None, None]:
+    """period is the range of time over which to fetch messages."""
    initial_time: Optional[datetime] = period[0]
-    ind = 0
    while True:
-        ind += 1
-
+        # iterate from oldest to newest
        time_created, message_skeletons = get_empty_chat_messages_entries__paginated(
            db_session,
            period,
--- a/backend/ee/onyx/main.py
+++ b/backend/ee/onyx/main.py
@@ -15,7 +15,7 @@ from ee.onyx.server.enterprise_settings.api import (
 )
 from ee.onyx.server.manage.standard_answer import router as standard_answer_router
 from ee.onyx.server.middleware.tenant_tracking import add_tenant_id_middleware
-from ee.onyx.server.oauth.api import router as oauth_router
+from ee.onyx.server.oauth.api import router as ee_oauth_router
 from ee.onyx.server.query_and_chat.chat_backend import (
    router as chat_router,
 )
@@ -128,7 +128,7 @@ def get_application() -> FastAPI:
    include_router_with_global_prefix_prepended(application, query_router)
    include_router_with_global_prefix_prepended(application, chat_router)
    include_router_with_global_prefix_prepended(application, standard_answer_router)
-    include_router_with_global_prefix_prepended(application, oauth_router)
+    include_router_with_global_prefix_prepended(application, ee_oauth_router)

    # Enterprise-only global settings
    include_router_with_global_prefix_prepended(
--- a/backend/ee/onyx/server/oauth/confluence_cloud.py
+++ b/backend/ee/onyx/server/oauth/confluence_cloud.py
@@ -80,6 +80,7 @@ class ConfluenceCloudOAuth:
        "search:confluence%20"
        # granular scope
        "read:attachment:confluence%20"  # possibly unneeded unless calling v2 attachments api
+        "read:content-details:confluence%20"  # for permission sync
        "offline_access"
    )

--- a/backend/ee/onyx/server/tenants/product_gating.py
+++ b/backend/ee/onyx/server/tenants/product_gating.py
@@ -48,4 +48,5 @@ def store_product_gating(tenant_id: str, application_status: ApplicationStatus)

 def get_gated_tenants() -> set[str]:
    redis_client = get_redis_replica_client(tenant_id=ONYX_CLOUD_TENANT_ID)
-    return cast(set[str], redis_client.smembers(GATED_TENANTS_KEY))
+    gated_tenants_bytes = cast(set[bytes], redis_client.smembers(GATED_TENANTS_KEY))
+    return {tenant_id.decode("utf-8") for tenant_id in gated_tenants_bytes}
--- a/backend/ee/onyx/server/tenants/provisioning.py
+++ b/backend/ee/onyx/server/tenants/provisioning.py
@@ -31,6 +31,7 @@ from onyx.db.engine import get_sqlalchemy_engine
 from onyx.db.llm import update_default_provider
 from onyx.db.llm import upsert_cloud_embedding_provider
 from onyx.db.llm import upsert_llm_provider
+from onyx.db.models import AvailableTenant
 from onyx.db.models import IndexModelStatus
 from onyx.db.models import SearchSettings
 from onyx.db.models import UserTenantMapping
@@ -55,19 +56,39 @@ logger = logging.getLogger(__name__)
 async def get_or_provision_tenant(
    email: str, referral_source: str | None = None, request: Request | None = None
 ) -> str:
-    """Get existing tenant ID for an email or create a new tenant if none exists."""
+    """
+    Get existing tenant ID for an email or create a new tenant if none exists.
+    This function should only be called after we have verified we want this user's tenant to exist.
+    It returns the tenant ID associated with the email, creating a new tenant if necessary.
+    """
    if not MULTI_TENANT:
        return POSTGRES_DEFAULT_SCHEMA

    if referral_source and request:
        await submit_to_hubspot(email, referral_source, request)

+    tenant_id: str | None = None
    try:
+        # First, check if the user already has a tenant
        tenant_id = get_tenant_id_for_email(email)
+        return tenant_id
+
    except exceptions.UserNotExists:
-        # If tenant does not exist and in Multi tenant mode, provision a new tenant
+        # User doesn't exist, so we need to create a new tenant or assign an existing one
        try:
-            tenant_id = await create_tenant(email, referral_source)
+            # Try to get a pre-provisioned tenant
+            tenant_id = await get_available_tenant()
+
+            if tenant_id:
+                # If we have a pre-provisioned tenant, assign it to the user
+                await assign_tenant_to_user(tenant_id, email, referral_source)
+                logger.info(
+                    f"Assigned pre-provisioned tenant {tenant_id} to user {email}"
+                )
+            else:
+                # If no pre-provisioned tenant is available, create a new one on-demand
+                tenant_id = await create_tenant(email, referral_source)
+
        except Exception as e:
            logger.error(f"Tenant provisioning failed: {e}")
            raise HTTPException(status_code=500, detail="Failed to provision tenant.")
@@ -81,13 +102,19 @@ async def get_or_provision_tenant(


 async def create_tenant(email: str, referral_source: str | None = None) -> str:
+    """
+    Create a new tenant on-demand when no pre-provisioned tenants are available.
+    This is the fallback method when we can't use a pre-provisioned tenant.
+    """
    tenant_id = TENANT_ID_PREFIX + str(uuid.uuid4())
    try:
        # Provision tenant on data plane
        await provision_tenant(tenant_id, email)
-        # Notify control plane
-        if not DEV_MODE:
+
+        # Notify control plane if not already done in provision_tenant
+        if not DEV_MODE and referral_source:
            await notify_control_plane(tenant_id, email, referral_source)
+
    except Exception as e:
        logger.error(f"Tenant provisioning failed: {e}")
        await rollback_tenant_provisioning(tenant_id)
@@ -105,54 +132,25 @@ async def provision_tenant(tenant_id: str, email: str) -> None:
        )

    logger.debug(f"Provisioning tenant {tenant_id} for user {email}")
-    token = None

    try:
+        # Create the schema for the tenant
        if not create_schema_if_not_exists(tenant_id):
            logger.debug(f"Created schema for tenant {tenant_id}")
        else:
            logger.debug(f"Schema already exists for tenant {tenant_id}")

-        token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)
+        # Set up the tenant with all necessary configurations
+        await setup_tenant(tenant_id)

-        # Await the Alembic migrations
-        await asyncio.to_thread(run_alembic_migrations, tenant_id)
-
-        with get_session_with_tenant(tenant_id=tenant_id) as db_session:
-            configure_default_api_keys(db_session)
-
-            current_search_settings = (
-                db_session.query(SearchSettings)
-                .filter_by(status=IndexModelStatus.FUTURE)
-                .first()
-            )
-            cohere_enabled = (
-                current_search_settings is not None
-                and current_search_settings.provider_type == EmbeddingProvider.COHERE
-            )
-            setup_onyx(db_session, tenant_id, cohere_enabled=cohere_enabled)
-
-        add_users_to_tenant([email], tenant_id)
-
-        with get_session_with_tenant(tenant_id=tenant_id) as db_session:
-            create_milestone_and_report(
-                user=None,
-                distinct_id=tenant_id,
-                event_type=MilestoneRecordType.TENANT_CREATED,
-                properties={
-                    "email": email,
-                },
-                db_session=db_session,
-            )
+        # Assign the tenant to the user
+        await assign_tenant_to_user(tenant_id, email)

    except Exception as e:
        logger.exception(f"Failed to create tenant {tenant_id}")
        raise HTTPException(
            status_code=500, detail=f"Failed to create tenant: {str(e)}"
        )
-    finally:
-        if token is not None:
-            CURRENT_TENANT_ID_CONTEXTVAR.reset(token)


 async def notify_control_plane(
@@ -349,3 +347,97 @@ async def delete_user_from_control_plane(tenant_id: str, email: str) -> None:
                raise Exception(
                    f"Failed to delete tenant on control plane: {error_text}"
                )
+
+
+async def get_available_tenant() -> str | None:
+    """
+    Get an available pre-provisioned tenant from the NewAvailableTenant table.
+    Returns the tenant_id if one is available, None otherwise.
+    """
+    if not MULTI_TENANT:
+        return None
+
+    try:
+        with Session(get_sqlalchemy_engine()) as db_session:
+            # Get the oldest available tenant
+            available_tenant = (
+                db_session.query(AvailableTenant)
+                .order_by(AvailableTenant.date_created)
+                .first()
+            )
+
+            if available_tenant:
+                tenant_id = available_tenant.tenant_id
+                # Remove the tenant from the available tenants table
+                db_session.delete(available_tenant)
+                db_session.commit()
+                logger.info(f"Using pre-provisioned tenant {tenant_id}")
+                return tenant_id
+
+    except Exception as e:
+        logger.error(f"Error getting available tenant: {e}")
+
+    return None
+
+
+async def setup_tenant(tenant_id: str) -> None:
+    """
+    Set up a tenant with all necessary configurations.
+    This is a centralized function that handles all tenant setup logic.
+    """
+    token = None
+    try:
+        token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)
+
+        # Run Alembic migrations
+        await asyncio.to_thread(run_alembic_migrations, tenant_id)
+
+        # Configure the tenant with default settings
+        with get_session_with_tenant(tenant_id=tenant_id) as db_session:
+            # Configure default API keys
+            configure_default_api_keys(db_session)
+
+            # Set up Onyx with appropriate settings
+            current_search_settings = (
+                db_session.query(SearchSettings)
+                .filter_by(status=IndexModelStatus.FUTURE)
+                .first()
+            )
+            cohere_enabled = (
+                current_search_settings is not None
+                and current_search_settings.provider_type == EmbeddingProvider.COHERE
+            )
+            setup_onyx(db_session, tenant_id, cohere_enabled=cohere_enabled)
+
+    except Exception as e:
+        logger.exception(f"Failed to set up tenant {tenant_id}")
+        raise e
+    finally:
+        if token is not None:
+            CURRENT_TENANT_ID_CONTEXTVAR.reset(token)
+
+
+async def assign_tenant_to_user(
+    tenant_id: str, email: str, referral_source: str | None = None
+) -> None:
+    """
+    Assign a tenant to a user and perform necessary operations.
+    """
+    # Add the user to the tenant
+    add_users_to_tenant([email], tenant_id)
+
+    # Create milestone record
+    with get_session_with_tenant(tenant_id=tenant_id) as db_session:
+        create_milestone_and_report(
+            user=None,
+            distinct_id=tenant_id,
+            event_type=MilestoneRecordType.TENANT_CREATED,
+            properties={
+                "email": email,
+            },
+            db_session=db_session,
+        )
+
+    # Notify control plane
+    if not DEV_MODE:
+        await notify_control_plane(tenant_id, email, referral_source)
--- a/backend/ee/onyx/server/tenants/schema_management.py
+++ b/backend/ee/onyx/server/tenants/schema_management.py
@@ -74,3 +74,21 @@ def drop_schema(tenant_id: str) -> None:
            text("DROP SCHEMA IF EXISTS %(schema_name)s CASCADE"),
            {"schema_name": tenant_id},
        )
+
+
+def get_current_alembic_version(tenant_id: str) -> str:
+    """Get the current Alembic version for a tenant."""
+    from alembic.runtime.migration import MigrationContext
+    from sqlalchemy import text
+
+    engine = get_sqlalchemy_engine()
+
+    # Set the search path to the tenant's schema
+    with engine.connect() as connection:
+        connection.execute(text(f'SET search_path TO "{tenant_id}"'))
+
+        # Get the current version from the alembic_version table
+        context = MigrationContext.configure(connection)
+        current_rev = context.get_current_revision()
+
+    return current_rev or "head"
--- a/backend/onyx/auth/users.py
+++ b/backend/onyx/auth/users.py
@@ -587,14 +587,20 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
    ) -> Optional[User]:
        email = credentials.username

-        # Get tenant_id from mapping table
-        tenant_id = await fetch_ee_implementation_or_noop(
-            "onyx.server.tenants.provisioning",
-            "get_or_provision_tenant",
-            async_return_default_schema,
-        )(
-            email=email,
-        )
+        tenant_id: str | None = None
+        try:
+            tenant_id = fetch_ee_implementation_or_noop(
+                "onyx.server.tenants.provisioning",
+                "get_tenant_id_for_email",
+                None,
+            )(
+                email=email,
+            )
+        except Exception as e:
+            logger.warning(
+                f"User attempted to login with invalid credentials: {str(e)}"
+            )
+
        if not tenant_id:
            # User not found in mapping
            self.password_helper.hash(credentials.password)
--- a/backend/onyx/background/celery/apps/light.py
+++ b/backend/onyx/background/celery/apps/light.py
@@ -111,5 +111,6 @@ celery_app.autodiscover_tasks(
        "onyx.background.celery.tasks.vespa",
        "onyx.background.celery.tasks.connector_deletion",
        "onyx.background.celery.tasks.doc_permission_syncing",
+        "onyx.background.celery.tasks.tenant_provisioning",
    ]
 )
--- a/backend/onyx/background/celery/apps/monitoring.py
+++ b/backend/onyx/background/celery/apps/monitoring.py
@@ -92,5 +92,6 @@ def on_setup_logging(
 celery_app.autodiscover_tasks(
    [
        "onyx.background.celery.tasks.monitoring",
+        "onyx.background.celery.tasks.tenant_provisioning",
    ]
 )
--- a/backend/onyx/background/celery/tasks/beat_schedule.py
+++ b/backend/onyx/background/celery/tasks/beat_schedule.py
@@ -20,7 +20,7 @@ BEAT_EXPIRES_DEFAULT = 15 * 60  # 15 minutes (in seconds)
 # hack to slow down task dispatch in the cloud until
 # we have a better implementation (backpressure, etc)
 # Note that DynamicTenantScheduler can adjust the runtime value for this via Redis
-CLOUD_BEAT_MULTIPLIER_DEFAULT = 8.0
+CLOUD_BEAT_MULTIPLIER_DEFAULT = 8

 # tasks that run in either self-hosted on cloud
 beat_task_templates: list[dict] = []
@@ -167,6 +167,16 @@ beat_cloud_tasks: list[dict] = [
            "expires": BEAT_EXPIRES_DEFAULT,
        },
    },
+    {
+        "name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_check-available-tenants",
+        "task": OnyxCeleryTask.CHECK_AVAILABLE_TENANTS,
+        "schedule": timedelta(minutes=10),
+        "options": {
+            "queue": OnyxCeleryQueues.MONITORING,
+            "priority": OnyxCeleryPriority.HIGH,
+            "expires": BEAT_EXPIRES_DEFAULT,
+        },
+    },
 ]

 # tasks that only run self hosted
--- a/backend/onyx/background/celery/tasks/periodic/init.py
+++ b/backend/onyx/background/celery/tasks/periodic/init.py
--- a/backend/onyx/background/celery/tasks/tenant_provisioning/tasks.py
+++ b/backend/onyx/background/celery/tasks/tenant_provisioning/tasks.py
@@ -0,0 +1,177 @@
+"""
+Periodic tasks for tenant pre-provisioning.
+"""
+import asyncio
+import datetime
+import uuid
+
+from celery import shared_task
+from celery import Task
+from redis.lock import Lock as RedisLock
+from sqlalchemy.orm import Session
+
+from ee.onyx.server.tenants.provisioning import setup_tenant
+from ee.onyx.server.tenants.schema_management import create_schema_if_not_exists
+from ee.onyx.server.tenants.schema_management import get_current_alembic_version
+from onyx.background.celery.apps.app_base import task_logger
+from onyx.configs.app_configs import JOB_TIMEOUT
+from onyx.configs.app_configs import TARGET_AVAILABLE_TENANTS
+from onyx.configs.constants import OnyxCeleryPriority
+from onyx.configs.constants import OnyxCeleryQueues
+from onyx.configs.constants import OnyxCeleryTask
+from onyx.configs.constants import OnyxRedisLocks
+from onyx.db.engine import get_sqlalchemy_engine
+from onyx.db.models import AvailableTenant
+from onyx.redis.redis_pool import get_redis_client
+from shared_configs.configs import MULTI_TENANT
+from shared_configs.configs import TENANT_ID_PREFIX
+
+# Default number of pre-provisioned tenants to maintain
+DEFAULT_TARGET_AVAILABLE_TENANTS = 5
+
+# Soft time limit for tenant pre-provisioning tasks (in seconds)
+_TENANT_PROVISIONING_SOFT_TIME_LIMIT = 60 * 5  # 5 minutes
+# Hard time limit for tenant pre-provisioning tasks (in seconds)
+_TENANT_PROVISIONING_TIME_LIMIT = 60 * 10  # 10 minutes
+
+
+@shared_task(
+    name=OnyxCeleryTask.CHECK_AVAILABLE_TENANTS,
+    queue=OnyxCeleryQueues.MONITORING,
+    ignore_result=True,
+    soft_time_limit=JOB_TIMEOUT,
+    trail=False,
+    bind=True,
+)
+def check_available_tenants(self: Task) -> None:
+    """
+    Check if we have enough pre-provisioned tenants available.
+    If not, trigger the pre-provisioning of new tenants.
+    """
+    task_logger.info("STARTING CHECK_AVAILABLE_TENANTS")
+    if not MULTI_TENANT:
+        task_logger.info(
+            "Multi-tenancy is not enabled, skipping tenant pre-provisioning"
+        )
+        return
+
+    r = get_redis_client()
+    lock_check: RedisLock = r.lock(
+        OnyxRedisLocks.CHECK_AVAILABLE_TENANTS_LOCK,
+        timeout=_TENANT_PROVISIONING_SOFT_TIME_LIMIT,
+    )
+
+    # These tasks should never overlap
+    if not lock_check.acquire(blocking=False):
+        task_logger.info(
+            "Skipping check_available_tenants task because it is already running"
+        )
+        return
+
+    try:
+        # Get the current count of available tenants
+        with Session(get_sqlalchemy_engine()) as db_session:
+            available_tenants_count = db_session.query(AvailableTenant).count()
+
+        # Get the target number of available tenants
+        target_available_tenants = getattr(
+            TARGET_AVAILABLE_TENANTS, "value", DEFAULT_TARGET_AVAILABLE_TENANTS
+        )
+
+        # Calculate how many new tenants we need to provision
+        tenants_to_provision = max(
+            0, target_available_tenants - available_tenants_count
+        )
+
+        task_logger.info(
+            f"Available tenants: {available_tenants_count}, "
+            f"Target: {target_available_tenants}, "
+            f"To provision: {tenants_to_provision}"
+        )
+
+        # Trigger pre-provisioning tasks for each tenant needed
+        for _ in range(tenants_to_provision):
+            pre_provision_tenant.apply_async(
+                priority=OnyxCeleryPriority.LOW,
+            )
+
+    except Exception as e:
+        task_logger.exception(f"Error in check_available_tenants task: {e}")
+
+    finally:
+        lock_check.release()
+
+
+@shared_task(
+    name=OnyxCeleryTask.PRE_PROVISION_TENANT,
+    ignore_result=True,
+    soft_time_limit=_TENANT_PROVISIONING_SOFT_TIME_LIMIT,
+    time_limit=_TENANT_PROVISIONING_TIME_LIMIT,
+    queue=OnyxCeleryQueues.MONITORING,
+    bind=True,
+)
+def pre_provision_tenant(self: Task) -> None:
+    """
+    Pre-provision a new tenant and store it in the NewAvailableTenant table.
+    This function fully sets up the tenant with all necessary configurations,
+    so it's ready to be assigned to a user immediately.
+    """
+    task_logger.info("STARTING PRE_PROVISION_TENANT")
+    if not MULTI_TENANT:
+        task_logger.info(
+            "Multi-tenancy is not enabled, skipping tenant pre-provisioning"
+        )
+        return
+    r = get_redis_client()
+    lock_provision: RedisLock = r.lock(
+        OnyxRedisLocks.PRE_PROVISION_TENANT_LOCK,
+        timeout=_TENANT_PROVISIONING_SOFT_TIME_LIMIT,
+    )
+
+    # Allow multiple pre-provisioning tasks to run, but ensure they don't overlap
+    if not lock_provision.acquire(blocking=False):
+        task_logger.info(
+            "Skipping pre_provision_tenant task because it is already running"
+        )
+        return
+
+    try:
+        # Generate a new tenant ID
+        tenant_id = TENANT_ID_PREFIX + str(uuid.uuid4())
+        task_logger.info(f"Starting pre-provisioning for tenant {tenant_id}")
+
+        # Create the schema for the new tenant
+        schema_created = create_schema_if_not_exists(tenant_id)
+        if schema_created:
+            task_logger.info(f"Created schema for tenant '{tenant_id}'")
+        else:
+            task_logger.info(f"Schema already exists for tenant '{tenant_id}'")
+
+        # Set up the tenant with all necessary configurations
+        task_logger.info(f"Setting up tenant configuration for '{tenant_id}'")
+        asyncio.run(setup_tenant(tenant_id))
+        task_logger.info(f"Tenant configuration completed for '{tenant_id}'")
+
+        # Get the current Alembic version
+        alembic_version = get_current_alembic_version(tenant_id)
+        task_logger.info(
+            f"Tenant '{tenant_id}' using Alembic version: {alembic_version}"
+        )
+
+        # Store the pre-provisioned tenant in the database
+        task_logger.info(f"Storing pre-provisioned tenant '{tenant_id}' in database")
+        with Session(get_sqlalchemy_engine()) as db_session:
+            new_tenant = AvailableTenant(
+                tenant_id=tenant_id,
+                alembic_version=alembic_version,
+                date_created=datetime.datetime.now(),
+            )
+            db_session.add(new_tenant)
+            db_session.commit()
+
+        task_logger.info(f"Successfully pre-provisioned tenant {tenant_id}")
+
+    except Exception as e:
+        task_logger.exception(f"Error in pre_provision_tenant task: {e}")
+    finally:
+        lock_provision.release()
--- a/backend/onyx/configs/app_configs.py
+++ b/backend/onyx/configs/app_configs.py
@@ -642,14 +642,7 @@ MOCK_LLM_RESPONSE = (
 )


-# Image processing configurations
-ENABLE_IMAGE_EXTRACTION = (
-    os.environ.get("ENABLE_IMAGE_EXTRACTION", "true").lower() == "true"
-)
-ENABLE_INDEXING_TIME_IMAGE_ANALYSIS = not (
-    os.environ.get("DISABLE_INDEXING_TIME_IMAGE_ANALYSIS", "false").lower() == "true"
-)
-ENABLE_SEARCH_TIME_IMAGE_ANALYSIS = not (
-    os.environ.get("DISABLE_SEARCH_TIME_IMAGE_ANALYSIS", "false").lower() == "true"
-)
-IMAGE_ANALYSIS_MAX_SIZE_MB = int(os.environ.get("IMAGE_ANALYSIS_MAX_SIZE_MB", "20"))
+DEFAULT_IMAGE_ANALYSIS_MAX_SIZE_MB = 20
+
+# Number of pre-provisioned tenants to maintain
+TARGET_AVAILABLE_TENANTS = int(os.environ.get("TARGET_AVAILABLE_TENANTS", "5"))
--- a/backend/onyx/configs/constants.py
+++ b/backend/onyx/configs/constants.py
@@ -321,6 +321,8 @@ class OnyxRedisLocks:
        "da_lock:check_connector_external_group_sync_beat"
    )
    MONITOR_BACKGROUND_PROCESSES_LOCK = "da_lock:monitor_background_processes"
+    CHECK_AVAILABLE_TENANTS_LOCK = "da_lock:check_available_tenants"
+    PRE_PROVISION_TENANT_LOCK = "da_lock:pre_provision_tenant"

    CONNECTOR_DOC_PERMISSIONS_SYNC_LOCK_PREFIX = (
        "da_lock:connector_doc_permissions_sync"
@@ -383,6 +385,7 @@ class OnyxCeleryTask:
    CLOUD_MONITOR_CELERY_QUEUES = (
        f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_monitor_celery_queues"
    )
+    CHECK_AVAILABLE_TENANTS = f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_check_available_tenants"

    CHECK_FOR_CONNECTOR_DELETION = "check_for_connector_deletion_task"
    CHECK_FOR_VESPA_SYNC_TASK = "check_for_vespa_sync_task"
@@ -399,6 +402,9 @@ class OnyxCeleryTask:
    MONITOR_BACKGROUND_PROCESSES = "monitor_background_processes"
    MONITOR_CELERY_QUEUES = "monitor_celery_queues"

+    # Tenant pre-provisioning
+    PRE_PROVISION_TENANT = "pre_provision_tenant"
+
    KOMBU_MESSAGE_CLEANUP_TASK = "kombu_message_cleanup_task"
    CONNECTOR_PERMISSION_SYNC_GENERATOR_TASK = (
        "connector_permission_sync_generator_task"
--- a/backend/onyx/configs/llm_configs.py
+++ b/backend/onyx/configs/llm_configs.py
@@ -0,0 +1,38 @@
+from onyx.configs.app_configs import DEFAULT_IMAGE_ANALYSIS_MAX_SIZE_MB
+from onyx.server.settings.store import load_settings
+
+
+def get_image_extraction_and_analysis_enabled() -> bool:
+    """Get image extraction and analysis enabled setting from workspace settings or fallback to False"""
+    try:
+        settings = load_settings()
+        if settings.image_extraction_and_analysis_enabled is not None:
+            return settings.image_extraction_and_analysis_enabled
+    except Exception:
+        pass
+
+    return False
+
+
+def get_search_time_image_analysis_enabled() -> bool:
+    """Get search time image analysis enabled setting from workspace settings or fallback to False"""
+    try:
+        settings = load_settings()
+        if settings.search_time_image_analysis_enabled is not None:
+            return settings.search_time_image_analysis_enabled
+    except Exception:
+        pass
+
+    return False
+
+
+def get_image_analysis_max_size_mb() -> int:
+    """Get image analysis max size MB setting from workspace settings or fallback to environment variable"""
+    try:
+        settings = load_settings()
+        if settings.image_analysis_max_size_mb is not None:
+            return settings.image_analysis_max_size_mb
+    except Exception:
+        pass
+
+    return DEFAULT_IMAGE_ANALYSIS_MAX_SIZE_MB
--- a/backend/onyx/connectors/confluence/connector.py
+++ b/backend/onyx/connectors/confluence/connector.py
@@ -240,7 +240,7 @@ class ConfluenceConnector(
            # Extract basic page information
            page_id = page["id"]
            page_title = page["title"]
-            page_url = f"{self.wiki_base}/wiki{page['_links']['webui']}"
+            page_url = f"{self.wiki_base}{page['_links']['webui']}"

            # Get the page content
            page_content = extract_text_from_confluence_html(
--- a/backend/onyx/connectors/confluence/onyx_confluence.py
+++ b/backend/onyx/connectors/confluence/onyx_confluence.py
@@ -144,6 +144,12 @@ class OnyxConfluence:
            self.static_credentials = credential_json
            return credential_json, False

+        if not OAUTH_CONFLUENCE_CLOUD_CLIENT_ID:
+            raise RuntimeError("OAUTH_CONFLUENCE_CLOUD_CLIENT_ID must be set!")
+
+        if not OAUTH_CONFLUENCE_CLOUD_CLIENT_SECRET:
+            raise RuntimeError("OAUTH_CONFLUENCE_CLOUD_CLIENT_SECRET must be set!")
+
        # check if we should refresh tokens. we're deciding to refresh halfway
        # to expiration
        now = datetime.now(timezone.utc)
--- a/backend/onyx/connectors/github/connector.py
+++ b/backend/onyx/connectors/github/connector.py
@@ -124,14 +124,14 @@ class GithubConnector(LoadConnector, PollConnector):
    def __init__(
        self,
        repo_owner: str,
-        repo_name: str | None = None,
+        repositories: str | None = None,
        batch_size: int = INDEX_BATCH_SIZE,
        state_filter: str = "all",
        include_prs: bool = True,
        include_issues: bool = False,
    ) -> None:
        self.repo_owner = repo_owner
-        self.repo_name = repo_name
+        self.repositories = repositories
        self.batch_size = batch_size
        self.state_filter = state_filter
        self.include_prs = include_prs
@@ -157,11 +157,42 @@ class GithubConnector(LoadConnector, PollConnector):
            )

        try:
-            return github_client.get_repo(f"{self.repo_owner}/{self.repo_name}")
+            return github_client.get_repo(f"{self.repo_owner}/{self.repositories}")
        except RateLimitExceededException:
            _sleep_after_rate_limit_exception(github_client)
            return self._get_github_repo(github_client, attempt_num + 1)

+    def _get_github_repos(
+        self, github_client: Github, attempt_num: int = 0
+    ) -> list[Repository.Repository]:
+        """Get specific repositories based on comma-separated repo_name string."""
+        if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES:
+            raise RuntimeError(
+                "Re-tried fetching repos too many times. Something is going wrong with fetching objects from Github"
+            )
+
+        try:
+            repos = []
+            # Split repo_name by comma and strip whitespace
+            repo_names = [
+                name.strip() for name in (cast(str, self.repositories)).split(",")
+            ]
+
+            for repo_name in repo_names:
+                if repo_name:  # Skip empty strings
+                    try:
+                        repo = github_client.get_repo(f"{self.repo_owner}/{repo_name}")
+                        repos.append(repo)
+                    except GithubException as e:
+                        logger.warning(
+                            f"Could not fetch repo {self.repo_owner}/{repo_name}: {e}"
+                        )
+
+            return repos
+        except RateLimitExceededException:
+            _sleep_after_rate_limit_exception(github_client)
+            return self._get_github_repos(github_client, attempt_num + 1)
+
    def _get_all_repos(
        self, github_client: Github, attempt_num: int = 0
    ) -> list[Repository.Repository]:
@@ -189,11 +220,17 @@ class GithubConnector(LoadConnector, PollConnector):
        if self.github_client is None:
            raise ConnectorMissingCredentialError("GitHub")

-        repos = (
-            [self._get_github_repo(self.github_client)]
-            if self.repo_name
-            else self._get_all_repos(self.github_client)
-        )
+        repos = []
+        if self.repositories:
+            if "," in self.repositories:
+                # Multiple repositories specified
+                repos = self._get_github_repos(self.github_client)
+            else:
+                # Single repository (backward compatibility)
+                repos = [self._get_github_repo(self.github_client)]
+        else:
+            # All repositories
+            repos = self._get_all_repos(self.github_client)

        for repo in repos:
            if self.include_prs:
@@ -268,11 +305,48 @@ class GithubConnector(LoadConnector, PollConnector):
            )

        try:
-            if self.repo_name:
-                test_repo = self.github_client.get_repo(
-                    f"{self.repo_owner}/{self.repo_name}"
-                )
-                test_repo.get_contents("")
+            if self.repositories:
+                if "," in self.repositories:
+                    # Multiple repositories specified
+                    repo_names = [name.strip() for name in self.repositories.split(",")]
+                    if not repo_names:
+                        raise ConnectorValidationError(
+                            "Invalid connector settings: No valid repository names provided."
+                        )
+
+                    # Validate at least one repository exists and is accessible
+                    valid_repos = False
+                    validation_errors = []
+
+                    for repo_name in repo_names:
+                        if not repo_name:
+                            continue
+
+                        try:
+                            test_repo = self.github_client.get_repo(
+                                f"{self.repo_owner}/{repo_name}"
+                            )
+                            test_repo.get_contents("")
+                            valid_repos = True
+                            # If at least one repo is valid, we can proceed
+                            break
+                        except GithubException as e:
+                            validation_errors.append(
+                                f"Repository '{repo_name}': {e.data.get('message', str(e))}"
+                            )
+
+                    if not valid_repos:
+                        error_msg = (
+                            "None of the specified repositories could be accessed: "
+                        )
+                        error_msg += ", ".join(validation_errors)
+                        raise ConnectorValidationError(error_msg)
+                else:
+                    # Single repository (backward compatibility)
+                    test_repo = self.github_client.get_repo(
+                        f"{self.repo_owner}/{self.repositories}"
+                    )
+                    test_repo.get_contents("")
            else:
                # Try to get organization first
                try:
@@ -298,10 +372,15 @@ class GithubConnector(LoadConnector, PollConnector):
                    "Your GitHub token does not have sufficient permissions for this repository (HTTP 403)."
                )
            elif e.status == 404:
-                if self.repo_name:
-                    raise ConnectorValidationError(
-                        f"GitHub repository not found with name: {self.repo_owner}/{self.repo_name}"
-                    )
+                if self.repositories:
+                    if "," in self.repositories:
+                        raise ConnectorValidationError(
+                            f"None of the specified GitHub repositories could be found for owner: {self.repo_owner}"
+                        )
+                    else:
+                        raise ConnectorValidationError(
+                            f"GitHub repository not found with name: {self.repo_owner}/{self.repositories}"
+                        )
                else:
                    raise ConnectorValidationError(
                        f"GitHub user or organization not found: {self.repo_owner}"
@@ -310,6 +389,7 @@ class GithubConnector(LoadConnector, PollConnector):
                raise ConnectorValidationError(
                    f"Unexpected GitHub error (status={e.status}): {e.data}"
                )
+
        except Exception as exc:
            raise Exception(
                f"Unexpected error during GitHub settings validation: {exc}"
@@ -321,7 +401,7 @@ if __name__ == "__main__":

    connector = GithubConnector(
        repo_owner=os.environ["REPO_OWNER"],
-        repo_name=os.environ["REPO_NAME"],
+        repositories=os.environ["REPOSITORIES"],
    )
    connector.load_credentials(
        {"github_access_token": os.environ["GITHUB_ACCESS_TOKEN"]}
--- a/backend/onyx/connectors/vision_enabled_connector.py
+++ b/backend/onyx/connectors/vision_enabled_connector.py
@@ -1,7 +1,7 @@
 """
 Mixin for connectors that need vision capabilities.
 """
-from onyx.configs.app_configs import ENABLE_INDEXING_TIME_IMAGE_ANALYSIS
+from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled
 from onyx.llm.factory import get_default_llm_with_vision
 from onyx.llm.interfaces import LLM
 from onyx.utils.logger import setup_logger
@@ -30,7 +30,7 @@ class VisionEnabledConnector:
        Sets self.image_analysis_llm to the LLM instance or None if disabled.
        """
        self.image_analysis_llm: LLM | None = None
-        if ENABLE_INDEXING_TIME_IMAGE_ANALYSIS:
+        if get_image_extraction_and_analysis_enabled():
            try:
                self.image_analysis_llm = get_default_llm_with_vision()
                if self.image_analysis_llm is None:
--- a/backend/onyx/context/search/postprocessing/postprocessing.py
+++ b/backend/onyx/context/search/postprocessing/postprocessing.py
@@ -10,8 +10,8 @@ from langchain_core.messages import SystemMessage

 from onyx.chat.models import SectionRelevancePiece
 from onyx.configs.app_configs import BLURB_SIZE
-from onyx.configs.app_configs import ENABLE_SEARCH_TIME_IMAGE_ANALYSIS
 from onyx.configs.constants import RETURN_SEPARATOR
+from onyx.configs.llm_configs import get_search_time_image_analysis_enabled
 from onyx.configs.model_configs import CROSS_ENCODER_RANGE_MAX
 from onyx.configs.model_configs import CROSS_ENCODER_RANGE_MIN
 from onyx.context.search.enums import LLMEvaluationType
@@ -413,7 +413,7 @@ def search_postprocessing(
        # NOTE: if we don't rerank, we can return the chunks immediately
        # since we know this is the final order.
        # This way the user experience isn't delayed by the LLM step
-        if ENABLE_SEARCH_TIME_IMAGE_ANALYSIS:
+        if get_search_time_image_analysis_enabled():
            update_image_sections_with_query(
                retrieved_sections, search_query.query, llm
            )
@@ -456,7 +456,7 @@ def search_postprocessing(
            _log_top_section_links(search_query.search_type.value, reranked_sections)

            # Add the image processing step here
-            if ENABLE_SEARCH_TIME_IMAGE_ANALYSIS:
+            if get_search_time_image_analysis_enabled():
                update_image_sections_with_query(
                    reranked_sections, search_query.query, llm
                )
--- a/backend/onyx/db/models.py
+++ b/backend/onyx/db/models.py
@@ -2310,6 +2310,17 @@ class UserTenantMapping(Base):
        return value.lower() if value else value


+class AvailableTenant(Base):
+    __tablename__ = "available_tenant"
+    """
+    These entries will only exist ephemerally and are meant to be picked up by new users on registration.
+    """
+
+    tenant_id: Mapped[str] = mapped_column(String, primary_key=True, nullable=False)
+    alembic_version: Mapped[str] = mapped_column(String, nullable=False)
+    date_created: Mapped[datetime.datetime] = mapped_column(DateTime, nullable=False)
+
+
 # This is a mapping from tenant IDs to anonymous user paths
 class TenantAnonymousUserPath(Base):
    __tablename__ = "tenant_anonymous_user_path"
--- a/backend/onyx/db/seeding/chat_history_seeding.py
+++ b/backend/onyx/db/seeding/chat_history_seeding.py
@@ -0,0 +1,53 @@
+import random
+from datetime import datetime
+from datetime import timedelta
+
+from onyx.configs.constants import MessageType
+from onyx.db.chat import create_chat_session
+from onyx.db.chat import create_new_chat_message
+from onyx.db.chat import get_or_create_root_message
+from onyx.db.engine import get_session_with_current_tenant
+from onyx.db.models import ChatSession
+
+
+def seed_chat_history(num_sessions: int, num_messages: int, days: int) -> None:
+    """Utility function to seed chat history for testing.
+
+    num_sessions: the number of sessions to seed
+    num_messages: the number of messages to seed per sessions
+    days: the number of days looking backwards from the current time over which to randomize
+    the times.
+    """
+    with get_session_with_current_tenant() as db_session:
+        for y in range(0, num_sessions):
+            create_chat_session(db_session, f"pytest_session_{y}", None, None)
+
+        # randomize all session times
+        rows = db_session.query(ChatSession).all()
+        for row in rows:
+            row.time_created = datetime.utcnow() - timedelta(
+                days=random.randint(0, days)
+            )
+            row.time_updated = row.time_created + timedelta(
+                minutes=random.randint(0, 10)
+            )
+
+            root_message = get_or_create_root_message(row.id, db_session)
+
+            for x in range(0, num_messages):
+                chat_message = create_new_chat_message(
+                    row.id,
+                    root_message,
+                    f"pytest_message_{x}",
+                    None,
+                    0,
+                    MessageType.USER,
+                    db_session,
+                )
+
+                chat_message.time_sent = row.time_created + timedelta(
+                    minutes=random.randint(0, 10)
+                )
+            db_session.commit()
+
+        db_session.commit()
--- a/backend/onyx/indexing/indexing_pipeline.py
+++ b/backend/onyx/indexing/indexing_pipeline.py
@@ -464,12 +464,29 @@ def index_doc_batch(
            ),
        )

-        successful_doc_ids = {record.document_id for record in insertion_records}
-        if successful_doc_ids != set(updatable_ids):
+        all_returned_doc_ids = (
+            {record.document_id for record in insertion_records}
+            .union(
+                {
+                    record.failed_document.document_id
+                    for record in vector_db_write_failures
+                    if record.failed_document
+                }
+            )
+            .union(
+                {
+                    record.failed_document.document_id
+                    for record in embedding_failures
+                    if record.failed_document
+                }
+            )
+        )
+        if all_returned_doc_ids != set(updatable_ids):
            raise RuntimeError(
                f"Some documents were not successfully indexed. "
                f"Updatable IDs: {updatable_ids}, "
-                f"Successful IDs: {successful_doc_ids}"
+                f"Returned IDs: {all_returned_doc_ids}. "
+                "This should never happen."
            )

        last_modified_ids = []
--- a/backend/onyx/main.py
+++ b/backend/onyx/main.py
@@ -51,6 +51,7 @@ from onyx.server.documents.cc_pair import router as cc_pair_router
 from onyx.server.documents.connector import router as connector_router
 from onyx.server.documents.credential import router as credential_router
 from onyx.server.documents.document import router as document_router
+from onyx.server.documents.standard_oauth import router as standard_oauth_router
 from onyx.server.features.document_set.api import router as document_set_router
 from onyx.server.features.folder.api import router as folder_router
 from onyx.server.features.input_prompt.api import (
@@ -322,6 +323,7 @@ def get_application() -> FastAPI:
    )
    include_router_with_global_prefix_prepended(application, long_term_logs_router)
    include_router_with_global_prefix_prepended(application, api_key_router)
+    include_router_with_global_prefix_prepended(application, standard_oauth_router)

    if AUTH_TYPE == AuthType.DISABLED:
        # Server logs this during auth setup verification step
--- a/backend/onyx/server/settings/models.py
+++ b/backend/onyx/server/settings/models.py
@@ -53,6 +53,11 @@ class Settings(BaseModel):
    auto_scroll: bool | None = False
    query_history_type: QueryHistoryType | None = None

+    # Image processing settings
+    image_extraction_and_analysis_enabled: bool | None = False
+    search_time_image_analysis_enabled: bool | None = False
+    image_analysis_max_size_mb: int | None = 20
+

 class UserSettings(Settings):
    notifications: list[Notification]
--- a/backend/onyx/server/settings/store.py
+++ b/backend/onyx/server/settings/store.py
@@ -47,6 +47,7 @@ def load_settings() -> Settings:

    settings.anonymous_user_enabled = anonymous_user_enabled
    settings.query_history_type = ONYX_QUERY_HISTORY_TYPE
+
    return settings


--- a/backend/scripts/chat_history_seeding.py
+++ b/backend/scripts/chat_history_seeding.py
@@ -0,0 +1,45 @@
+import argparse
+import logging
+from logging import getLogger
+
+from onyx.db.seeding.chat_history_seeding import seed_chat_history
+
+# Configure the logger
+logging.basicConfig(
+    level=logging.INFO,  # Set the log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",  # Log format
+    handlers=[logging.StreamHandler()],  # Output logs to console
+)
+
+logger = getLogger(__name__)
+
+
+def go_main(num_sessions: int, num_messages: int, num_days: int) -> None:
+    seed_chat_history(num_sessions, num_messages, num_days)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Seed chat history")
+    parser.add_argument(
+        "--sessions",
+        type=int,
+        default=2048,
+        help="Number of chat sessions to seed",
+    )
+
+    parser.add_argument(
+        "--messages",
+        type=int,
+        default=4,
+        help="Number of chat messages to seed per session",
+    )
+
+    parser.add_argument(
+        "--days",
+        type=int,
+        default=90,
+        help="Number of days looking backwards over which to seed the timestamps with",
+    )
+
+    args = parser.parse_args()
+    go_main(args.sessions, args.messages, args.days)
--- a/backend/tests/daily/connectors/confluence/test_confluence_basic.py
+++ b/backend/tests/daily/connectors/confluence/test_confluence_basic.py
@@ -45,7 +45,7 @@ def test_confluence_connector_basic(
    with pytest.raises(StopIteration):
        next(doc_batch_generator)

-    assert len(doc_batch) == 3
+    assert len(doc_batch) == 2

    page_within_a_page_doc: Document | None = None
    page_doc: Document | None = None
--- a/backend/tests/integration/tests/query_history/test_usage_reports.py
+++ b/backend/tests/integration/tests/query_history/test_usage_reports.py
@@ -0,0 +1,46 @@
+from datetime import datetime
+from datetime import timedelta
+from datetime import timezone
+
+from ee.onyx.db.usage_export import get_all_empty_chat_message_entries
+from onyx.db.engine import get_session_with_current_tenant
+from onyx.db.seeding.chat_history_seeding import seed_chat_history
+
+
+def test_usage_reports(reset: None) -> None:
+    EXPECTED_SESSIONS = 2048
+    MESSAGES_PER_SESSION = 4
+    EXPECTED_MESSAGES = EXPECTED_SESSIONS * MESSAGES_PER_SESSION
+
+    seed_chat_history(EXPECTED_SESSIONS, MESSAGES_PER_SESSION, 90)
+
+    with get_session_with_current_tenant() as db_session:
+        # count of all entries should be exact
+        period = (
+            datetime.fromtimestamp(0, tz=timezone.utc),
+            datetime.now(tz=timezone.utc),
+        )
+
+        count = 0
+        for entry_batch in get_all_empty_chat_message_entries(db_session, period):
+            for entry in entry_batch:
+                count += 1
+
+        assert count == EXPECTED_MESSAGES
+
+        # count in a one month time range should be within a certain range statistically
+        # this can be improved if we seed the chat history data deterministically
+        period = (
+            datetime.now(tz=timezone.utc) - timedelta(days=30),
+            datetime.now(tz=timezone.utc),
+        )
+
+        count = 0
+        for entry_batch in get_all_empty_chat_message_entries(db_session, period):
+            for entry in entry_batch:
+                count += 1
+
+        lower = EXPECTED_MESSAGES // 3 - (EXPECTED_MESSAGES // (3 * 3))
+        upper = EXPECTED_MESSAGES // 3 + (EXPECTED_MESSAGES // (3 * 3))
+        assert count > lower
+        assert count < upper
--- a/deployment/README.md
+++ b/deployment/README.md
@@ -80,3 +80,13 @@ prod cluster**
   - `kubectl delete -f .`
   - To not delete the persistent volumes (Document indexes and Users), specify the specific `.yaml` files instead of
     `.` without specifying delete on persistent-volumes.yaml.
+
+### Using Helm to deploy to an existing cluster
+
+Onyx has a helm chart that is convenient to install all services to an existing Kubernetes cluster. To install:
+
+* Currently the helm chart is not published so to install, clone the repo.
+* Configure access to the cluster via kubectl. Ensure the kubectl context is set to the cluster that you want to use
+* The default secrets, environment variables and other service level configuration are stored in `deployment/helm/charts/onyx/values.yml`. You may create another `override.yml`
+* `cd deployment/helm/charts/onyx` and run `helm install onyx -n onyx -f override.yaml .`. This will install onyx on the cluster under the `onyx` namespace.
+* Check the status of the deploy using `kubectl get pods -n onyx`
--- a/deployment/helm/charts/onyx/templates/ingress-api.yaml
+++ b/deployment/helm/charts/onyx/templates/ingress-api.yaml
@@ -0,0 +1,27 @@
+{{- if .Values.ingress.enabled -}}
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: {{ include "onyx-stack.fullname" . }}-ingress-api
+  annotations:
+    kubernetes.io/ingress.class: nginx
+    nginx.ingress.kubernetes.io/rewrite-target: /$2
+    nginx.ingress.kubernetes.io/use-regex: "true"
+    cert-manager.io/cluster-issuer: {{ include "onyx-stack.fullname" . }}-letsencrypt
+spec:
+  rules:
+    - host: {{ .Values.ingress.api.host }}
+      http:
+        paths:
+          - path: /api(/|$)(.*)
+            pathType: Prefix
+            backend:
+              service:
+                name: {{ include "onyx-stack.fullname" . }}-api-service
+                port:
+                  number: {{ .Values.api.service.servicePort }}
+  tls:
+    - hosts:
+        - {{ .Values.ingress.api.host }}
+      secretName: {{ include "onyx-stack.fullname" . }}-ingress-api-tls
+{{- end }}
--- a/deployment/helm/charts/onyx/templates/ingress-webserver.yaml
+++ b/deployment/helm/charts/onyx/templates/ingress-webserver.yaml
@@ -0,0 +1,26 @@
+{{- if .Values.ingress.enabled -}}
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: {{ include "onyx-stack.fullname" . }}-ingress-webserver
+  annotations:
+    kubernetes.io/ingress.class: nginx
+    cert-manager.io/cluster-issuer: {{ include "onyx-stack.fullname" . }}-letsencrypt
+    kubernetes.io/tls-acme: "true"
+spec:
+  rules:
+    - host: {{ .Values.ingress.webserver.host }}
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: {{ include "onyx-stack.fullname" . }}-webserver
+                port:
+                  number: {{ .Values.webserver.service.servicePort }}
+  tls:
+    - hosts:
+        - {{ .Values.ingress.webserver.host }}
+      secretName: {{ include "onyx-stack.fullname" . }}-ingress-webserver-tls
+{{- end }}
--- a/deployment/helm/charts/onyx/templates/lets-encrypt.yaml
+++ b/deployment/helm/charts/onyx/templates/lets-encrypt.yaml
@@ -0,0 +1,20 @@
+{{- if .Values.letsencrypt.enabled -}}
+apiVersion: cert-manager.io/v1
+kind: ClusterIssuer
+metadata:
+  name: {{ include "onyx-stack.fullname" . }}-letsencrypt
+spec:
+  acme:
+    # The ACME server URL
+    server: https://acme-v02.api.letsencrypt.org/directory
+    # Email address used for ACME registration
+    email: {{ .Values.letsencrypt.email }}
+    # Name of a secret used to store the ACME account private key
+    privateKeySecretRef:
+      name: {{ include "onyx-stack.fullname" . }}-letsencrypt
+    # Enable the HTTP-01 challenge provider
+    solvers:
+      - http01:
+          ingress:
+            class: nginx
+{{- end }}
--- a/deployment/helm/charts/onyx/values.yaml
+++ b/deployment/helm/charts/onyx/values.yaml
@@ -376,22 +376,17 @@ redis:
    existingSecret: onyx-secrets
    existingSecretPasswordKey: redis_password

-# ingress:
-#  enabled: false
-#  className: ""
-#  annotations: {}
-#    # kubernetes.io/ingress.class: nginx
-#    # kubernetes.io/tls-acme: "true"
-#  hosts:
-#    - host: chart-example.local
-#      paths:
-#        - path: /
-#          pathType: ImplementationSpecific
-#  tls: []
-#  #  - secretName: chart-example-tls
-#  #    hosts:
-#  #      - chart-example.local
+ingress:
+  enabled: false
+  className: ""
+  api:
+    host: onyx.local
+  webserver:
+    host: onyx.local

+letsencrypt:
+  enabled: false
+  email: "abc@abc.com"

 auth:
  # existingSecret onyx-secret for storing smtp, oauth, slack, and other secrets
--- a/web/src/app/admin/settings/SettingsForm.tsx
+++ b/web/src/app/admin/settings/SettingsForm.tsx
@@ -26,7 +26,7 @@ export function Checkbox({
  onChange: (e: React.ChangeEvent<HTMLInputElement>) => void;
 }) {
  return (
-    <label className="flex text-sm cursor-pointer">
+    <label className="flex text-xs cursor-pointer">
      <input
        checked={checked}
        onChange={onChange}
@@ -34,7 +34,7 @@ export function Checkbox({
        className="mr-2 w-3.5 h-3.5 my-auto"
      />
      <div>
-        <Label>{label}</Label>
+        <Label small>{label}</Label>
        {sublabel && <SubLabel>{sublabel}</SubLabel>}
      </div>
    </label>
@@ -208,7 +208,7 @@ export function SettingsForm() {
  }

  return (
-    <div>
+    <div className="flex flex-col pb-8">
      {popup}
      <Title className="mb-4">Workspace Settings</Title>
      <Checkbox
@@ -290,23 +290,71 @@ export function SettingsForm() {
            id="chatRetentionInput"
            placeholder="Infinite Retention"
          />
-          <Button
-            onClick={handleSetChatRetention}
-            variant="submit"
-            size="sm"
-            className="mr-3"
-          >
-            Set Retention Limit
-          </Button>
-          <Button
-            onClick={handleClearChatRetention}
-            variant="default"
-            size="sm"
-          >
-            Retain All
-          </Button>
+          <div className="mr-auto flex gap-2">
+            <Button
+              onClick={handleSetChatRetention}
+              variant="submit"
+              size="sm"
+              className="mr-auto"
+            >
+              Set Retention Limit
+            </Button>
+            <Button
+              onClick={handleClearChatRetention}
+              variant="default"
+              size="sm"
+              className="mr-auto"
+            >
+              Retain All
+            </Button>
+          </div>
        </>
      )}
+
+      {/* Image Processing Settings */}
+      <Title className="mt-8 mb-4">Image Processing</Title>
+
+      <div className="flex flex-col gap-2">
+        <Checkbox
+          label="Enable Image Extraction and Analysis"
+          sublabel="Extract and analyze images from documents during indexing. This allows the system to process images and create searchable descriptions of them."
+          checked={settings.image_extraction_and_analysis_enabled ?? false}
+          onChange={(e) =>
+            handleToggleSettingsField(
+              "image_extraction_and_analysis_enabled",
+              e.target.checked
+            )
+          }
+        />
+
+        <Checkbox
+          label="Enable Search-time Image Analysis"
+          sublabel="Analyze images at search time when a user asks about images. This provides more detailed and query-specific image analysis but may increase search-time latency."
+          checked={settings.search_time_image_analysis_enabled ?? false}
+          onChange={(e) =>
+            handleToggleSettingsField(
+              "search_time_image_analysis_enabled",
+              e.target.checked
+            )
+          }
+        />
+
+        <IntegerInput
+          label="Maximum Image Size for Analysis (MB)"
+          sublabel="Images larger than this size will not be analyzed to prevent excessive resource usage."
+          value={settings.image_analysis_max_size_mb ?? null}
+          onChange={(e) => {
+            const value = e.target.value ? parseInt(e.target.value) : null;
+            if (value !== null && !isNaN(value) && value > 0) {
+              updateSettingField([
+                { fieldName: "image_analysis_max_size_mb", newValue: value },
+              ]);
+            }
+          }}
+          id="image-analysis-max-size"
+          placeholder="Enter maximum size in MB"
+        />
+      </div>
    </div>
  );
 }
--- a/web/src/app/admin/settings/interfaces.ts
+++ b/web/src/app/admin/settings/interfaces.ts
@@ -21,6 +21,11 @@ export interface Settings {
  auto_scroll: boolean;
  temperature_override_enabled: boolean;
  query_history_type: QueryHistoryType;
+
+  // Image processing settings
+  image_extraction_and_analysis_enabled?: boolean;
+  search_time_image_analysis_enabled?: boolean;
+  image_analysis_max_size_mb?: number;
 }

 export enum NotificationType {
--- a/web/src/app/auth/login/EmailPasswordForm.tsx
+++ b/web/src/app/auth/login/EmailPasswordForm.tsx
@@ -61,6 +61,7 @@ export function EmailPasswordForm({

            if (!response.ok) {
              setIsWorking(false);
+
              const errorDetail = (await response.json()).detail;
              let errorMsg = "Unknown error";
              if (typeof errorDetail === "object" && errorDetail.reason) {
@@ -96,12 +97,13 @@ export function EmailPasswordForm({
          } else {
            setIsWorking(false);
            const errorDetail = (await loginResponse.json()).detail;
-
            let errorMsg = "Unknown error";
            if (errorDetail === "LOGIN_BAD_CREDENTIALS") {
              errorMsg = "Invalid email or password";
            } else if (errorDetail === "NO_WEB_LOGIN_AND_HAS_NO_PASSWORD") {
              errorMsg = "Create an account to set a password";
+            } else if (typeof errorDetail === "string") {
+              errorMsg = errorDetail;
            }
            if (loginResponse.status === 429) {
              errorMsg = "Too many requests. Please try again later.";
--- a/web/src/app/chat/folders/FolderDropdown.tsx
+++ b/web/src/app/chat/folders/FolderDropdown.tsx
@@ -191,6 +191,7 @@ export const FolderDropdown = forwardRef<HTMLDivElement, FolderDropdownProps>(
                    onChange={(e) => setNewFolderName(e.target.value)}
                    className="text-sm font-medium bg-transparent outline-none w-full pb-1 border-b border-background-500 transition-colors duration-200"
                    onKeyDown={(e) => {
+                      e.stopPropagation();
                      if (e.key === "Enter") {
                        handleEdit();
                      }
--- a/web/src/app/chat/folders/FolderList.tsx
+++ b/web/src/app/chat/folders/FolderList.tsx
@@ -303,7 +303,6 @@ const FolderItem = ({
              key={chatSession.id}
              chatSession={chatSession}
              isSelected={chatSession.id === currentChatId}
-              skipGradient={isDragOver}
              showShareModal={showShareModal}
              showDeleteModal={showDeleteModal}
            />
--- a/web/src/app/chat/sessionSidebar/ChatSessionDisplay.tsx
+++ b/web/src/app/chat/sessionSidebar/ChatSessionDisplay.tsx
@@ -32,21 +32,17 @@ export function ChatSessionDisplay({
  chatSession,
  search,
  isSelected,
-  skipGradient,
  closeSidebar,
  showShareModal,
  showDeleteModal,
-  foldersExisting,
  isDragging,
 }: {
  chatSession: ChatSession;
  isSelected: boolean;
  search?: boolean;
-  skipGradient?: boolean;
  closeSidebar?: () => void;
  showShareModal?: (chatSession: ChatSession) => void;
  showDeleteModal?: (chatSession: ChatSession) => void;
-  foldersExisting?: boolean;
  isDragging?: boolean;
 }) {
  const router = useRouter();
@@ -238,8 +234,12 @@ export function ChatSessionDisplay({
                          e.preventDefault();
                          e.stopPropagation();
                        }}
-                        onChange={(e) => setChatName(e.target.value)}
+                        onChange={(e) => {
+                          setChatName(e.target.value);
+                        }}
                        onKeyDown={(event) => {
+                          event.stopPropagation();
+
                          if (event.key === "Enter") {
                            onRename();
                            event.preventDefault();
--- a/web/src/app/chat/sessionSidebar/PagesTab.tsx
+++ b/web/src/app/chat/sessionSidebar/PagesTab.tsx
@@ -264,7 +264,6 @@ export function PagesTab({
        >
          <ChatSessionDisplay
            chatSession={chat}
-            foldersExisting={foldersExisting}
            isSelected={currentChatId === chat.id}
            showShareModal={showShareModal}
            showDeleteModal={showDeleteModal}
--- a/web/src/components/admin/connectors/ConnectorTitle.tsx
+++ b/web/src/components/admin/connectors/ConnectorTitle.tsx
@@ -40,8 +40,12 @@ export const ConnectorTitle = ({
    const typedConnector = connector as Connector<GithubConfig>;
    additionalMetadata.set(
      "Repo",
-      typedConnector.connector_specific_config.repo_name
-        ? `${typedConnector.connector_specific_config.repo_owner}/${typedConnector.connector_specific_config.repo_name}`
+      typedConnector.connector_specific_config.repositories
+        ? `${typedConnector.connector_specific_config.repo_owner}/${
+            typedConnector.connector_specific_config.repositories.includes(",")
+              ? "multiple repos"
+              : typedConnector.connector_specific_config.repositories
+          }`
        : `${typedConnector.connector_specific_config.repo_owner}/*`
    );
  } else if (connector.source === "gitlab") {
--- a/web/src/lib/connectors/connectors.tsx
+++ b/web/src/lib/connectors/connectors.tsx
@@ -190,10 +190,12 @@ export const connectorConfigs: Record<
            fields: [
              {
                type: "text",
-                query: "Enter the repository name:",
-                label: "Repository Name",
-                name: "repo_name",
+                query: "Enter the repository name(s):",
+                label: "Repository Name(s)",
+                name: "repositories",
                optional: false,
+                description:
+                  "For multiple repositories, enter comma-separated names (e.g., repo1,repo2,repo3)",
              },
            ],
          },
@@ -1257,21 +1259,18 @@ export function createConnectorInitialValues(
    name: "",
    groups: [],
    access_type: "public",
-    ...configuration.values.reduce(
-      (acc, field) => {
-        if (field.type === "select") {
-          acc[field.name] = null;
-        } else if (field.type === "list") {
-          acc[field.name] = field.default || [];
-        } else if (field.type === "checkbox") {
-          acc[field.name] = field.default || false;
-        } else if (field.default !== undefined) {
-          acc[field.name] = field.default;
-        }
-        return acc;
-      },
-      {} as { [record: string]: any }
-    ),
+    ...configuration.values.reduce((acc, field) => {
+      if (field.type === "select") {
+        acc[field.name] = null;
+      } else if (field.type === "list") {
+        acc[field.name] = field.default || [];
+      } else if (field.type === "checkbox") {
+        acc[field.name] = field.default || false;
+      } else if (field.default !== undefined) {
+        acc[field.name] = field.default;
+      }
+      return acc;
+    }, {} as { [record: string]: any }),
  };
 }

@@ -1283,28 +1282,25 @@ export function createConnectorValidationSchema(
  return Yup.object().shape({
    access_type: Yup.string().required("Access Type is required"),
    name: Yup.string().required("Connector Name is required"),
-    ...configuration.values.reduce(
-      (acc, field) => {
-        let schema: any =
-          field.type === "select"
-            ? Yup.string()
-            : field.type === "list"
-              ? Yup.array().of(Yup.string())
-              : field.type === "checkbox"
-                ? Yup.boolean()
-                : field.type === "file"
-                  ? Yup.mixed()
-                  : Yup.string();
+    ...configuration.values.reduce((acc, field) => {
+      let schema: any =
+        field.type === "select"
+          ? Yup.string()
+          : field.type === "list"
+          ? Yup.array().of(Yup.string())
+          : field.type === "checkbox"
+          ? Yup.boolean()
+          : field.type === "file"
+          ? Yup.mixed()
+          : Yup.string();

-        if (!field.optional) {
-          schema = schema.required(`${field.label} is required`);
-        }
+      if (!field.optional) {
+        schema = schema.required(`${field.label} is required`);
+      }

-        acc[field.name] = schema;
-        return acc;
-      },
-      {} as Record<string, any>
-    ),
+      acc[field.name] = schema;
+      return acc;
+    }, {} as Record<string, any>),
    // These are advanced settings
    indexingStart: Yup.string().nullable(),
    pruneFreq: Yup.number().min(0, "Prune frequency must be non-negative"),
@@ -1358,7 +1354,7 @@ export interface WebConfig {

 export interface GithubConfig {
  repo_owner: string;
-  repo_name: string;
+  repositories: string; // Comma-separated list of repository names
  include_prs: boolean;
  include_issues: boolean;
 }
Author	SHA1	Message	Date
pablonyx	a49039565c	k	2025-03-06 17:26:09 -08:00
pablonyx	cff23c7c63	quick fix - validated locally	2025-03-06 17:25:44 -08:00
pablonyx	86cc29ae43	k	2025-03-06 16:43:51 -08:00
pablonyx	99f2c3e87f	k	2025-03-06 16:43:07 -08:00
pablonyx	05cb7bfd51	nit	2025-03-06 16:39:39 -08:00
pablonyx	ee89591060	quick destructive nit	2025-03-06 15:14:34 -08:00
pablonyx	44cf923ab7	k	2025-03-06 14:47:55 -08:00
pablonyx	3ca97648f5	quick nit	2025-03-06 14:47:55 -08:00
pablonyx	b8b20585e1	cleaner	2025-03-06 14:47:53 -08:00
pablonyx	55b0b02068	k	2025-03-06 14:47:24 -08:00
pablonyx	2b1ec9c454	k	2025-03-06 14:47:24 -08:00
pablonyx	7d6db8d500	Comma separated list for Github repos (#4199 )	2025-03-06 14:46:57 -08:00
Chris Weaver	a7a374dc81	Confluence fixes (#4220 ) * Confluence fixes * Small tweak * Address greptile comments	2025-03-06 20:57:07 +00:00
rkuo-danswer	facc8cc2fa	add scope needed for permission sync (#4198 ) Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>	2025-03-06 20:03:38 +00:00
rkuo-danswer	2c0af0a0ca	Feature/helm updates (#4201 ) * add ingress for api and web * helm setup docs * add letsencrypt. close blocks * use pathType ImplementationSpecific as Prefix is deprecated * fix backend labels. configure nginx routes. update annotations * fix linting --------- Co-authored-by: Sajjad Anwar <sajjadkm@gmail.com> Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>	2025-03-06 19:48:20 +00:00
pablonyx	bfbc1cd954	k (#4172 )	2025-03-06 18:55:12 +00:00
pablonyx	626da583aa	Fix gated tenants (#4177 ) * fix * mypy .	2025-03-06 18:07:15 +00:00
pablonyx	92faca139d	Fix extra tenant mystery (#4197 ) * fix extra tenant mystery * nit	2025-03-06 18:06:49 +00:00
pablonyx	cec05c5ee9	Revert "k" This reverts commit `687122911d`.	2025-03-06 09:38:31 -08:00
Richard Kuo (Danswer)	eaf054ef06	oauth router went missing?	2025-03-05 15:50:23 -08:00
pablonyx	a7a1a24658	minor nit	2025-03-05 15:35:02 -08:00
pablonyx	687122911d	k	2025-03-05 15:27:14 -08:00
pablonyx	40953bd4fe	Workspace configs (#4202 )	2025-03-05 12:28:44 -08:00
rkuo-danswer	a7acc07e79	fix usage report pagination (#4183 ) * early work in progress * rename utility script * move actual data seeding to a shareable function * add test * make the test pass with the fix * fix comment --------- Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>	2025-03-05 19:13:51 +00:00