Compare commits

120 Commits

Author SHA1 Message Date
SubashMohan
5c4f44d258 fix: sharepoint lg files issue (#5065)
* add SharePoint file size threshold check

* Implement retry logic for SharePoint queries to handle rate limiting and server errors

* mypy fix

* add content none check

* remove unreachable code from retry logic in sharepoint connector
2025-07-24 14:26:01 +00:00
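The retry behavior this commit describes (retry SharePoint queries on rate limiting and server errors) can be sketched roughly as below. All names here are illustrative, not the connector's actual helpers:

```python
import random
import time


class TransientHTTPError(Exception):
    """Illustrative stand-in for a 429/5xx response from SharePoint."""

    def __init__(self, status_code: int) -> None:
        super().__init__(f"HTTP {status_code}")
        self.status_code = status_code


def retry_on_rate_limit(query, max_retries: int = 5, base_delay: float = 1.0):
    """Run `query()`, retrying on rate limiting (429) or server errors (5xx)."""
    for attempt in range(max_retries):
        try:
            return query()
        except TransientHTTPError as e:
            retryable = e.status_code == 429 or e.status_code >= 500
            if not retryable or attempt == max_retries - 1:
                raise
            # Jittered exponential backoff: ~1s, ~2s, ~4s, ...
            time.sleep(base_delay * (2**attempt) * (1 + random.random()))
```

The last attempt re-raises rather than sleeping, which is presumably why the follow-up commit could "remove unreachable code from retry logic".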
Evan Lohn
19652ad60e attempt fix for broken excel files (#5071) 2025-07-24 01:21:13 +00:00
Evan Lohn
70c96b6ab3 fix: remove locks from indexing callback (#5070) 2025-07-23 23:05:35 +00:00
Raunak Bhagat
65076b916f refactor: Update location of sidebar (#5067)
* Use props instead of inline type def

* Add new AppProvider

* Remove unused component file

* Move `sessionSidebar` to be inside of `components` instead of `app/chat`

* Change name of `sessionSidebar` to `sidebar`

* Remove `AppModeProvider`

* Fix bug in how the cookies were set
2025-07-23 21:59:34 +00:00
PaulHLiatrio
06bc0e51db fix: adjust template variable from .Chart.AppVersion to .Values.global.version to match versioning pattern. (#5069) 2025-07-23 14:54:32 -07:00
Devin
508b456b40 fix: explicit api_server dependency on minio in docker compose files (#5066) 2025-07-23 13:37:42 -07:00
Evan Lohn
bf1e2a2661 feat: avoid full rerun (#5063)
* fix: remove extra group sync

* second extra task

* minor improvement for non-checkpointed connectors
2025-07-23 18:01:23 +00:00
Evan Lohn
991d5e4203 fix: regen api key (#5064) 2025-07-23 03:36:51 +00:00
Evan Lohn
d21f012b04 fix: remove extra group sync (#5061)
* fix: remove extra group sync

* second extra task
2025-07-22 23:24:42 +00:00
Wenxi
86b7beab01 fix: too many internet chunks (#5060)
* minor internet search env vars

* add limit to internet search chunks

* note

* nits
2025-07-22 23:11:10 +00:00
Evan Lohn
b4eaa81d8b handle empty doc batches (#5058) 2025-07-22 22:35:59 +00:00
Evan Lohn
ff2a4c8723 fix: time discrepancy (#5056)
* fix time discrepancy

* remove log

* remove log
2025-07-22 22:19:02 +00:00
Raunak Bhagat
51027fd259 fix: Make pr-labeler run on edits too 2025-07-22 15:04:37 -07:00
Raunak Bhagat
7e3fd2b12a refactor: Update the error message that is logged when PR title fails Conventional Commits regex (#5062) 2025-07-22 14:46:22 -07:00
Chris Weaver
d2fef6f0b7 Tiny launch.json template improvement (#5055) 2025-07-22 11:15:44 -07:00
Evan Lohn
bd06147d26 feat: connector indexing decoupling (#4893)
* WIP

* renamed and moved tasks (WIP)

* minio migration

* bug fixes and finally add document batch storage

* WIP: can succeed but status is error

* WIP

* import fixes

* working v1 of decoupled

* catastrophe handling

* refactor

* remove unused db session in prep for new approach

* renaming and docstrings (untested)

* renames

* WIP with no more indexing fences

* robustness improvements

* clean up rebase

* migration and salesforce rate limits

* minor tweaks

* test fix

* connector pausing behavior

* correct checkpoint resumption logic

* cleanups in docfetching

* add heartbeat file

* update template jsonc

* deployment fixes

* fix vespa httpx pool

* error handling

* cosmetic fixes

* dumb

* logging improvements and non checkpointed connector fixes

* didnt save

* misc fixes

* fix import

* fix deletion of old files

* add in attempt prefix

* fix attempt prefix

* tiny log improvement

* minor changes

* fixed resumption behavior

* passing int tests

* fix unit test

* fixed unit tests

* trying timeout bump to see if int tests pass

* trying timeout bump to see if int tests pass

* fix autodiscovery

* helm chart fixes

* helm and logging
2025-07-22 03:33:25 +00:00
Raunak Bhagat
1f3cc9ed6e Make from_.user optional (use "Unknown User") if not found (#5051) 2025-07-21 17:50:28 -07:00
Raunak Bhagat
6086d9e51a feat: Updated KG admin page (#5044)
* Update KG admin UI

* Styling changes

* More changes

* Make edits auto-save

* Add more stylings / transitions

* Fix opacity

* Separate out modal into new component

* Revert backend changes

* Update styling

* Add convenience / styling changes to date-picker

* More styling / functional updates to kg admin-page

* Avoid reducing opacity of active-toggle

* Update backend APIs for new KG admin page

* More updates of styling for kg-admin page

* Remove nullability

* Remove console log

* Remove unused imports

* Change type of `children` variable

* Update web/src/app/admin/kg/interfaces.ts

Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>

* Update web/src/components/CollapsibleCard.tsx

Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>

* Remove null

* Update web/src/components/CollapsibleCard.tsx

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>

* Force non-null

* Fix failing test

---------

Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
2025-07-21 15:37:27 -07:00
Raunak Bhagat
e0de24f64e Remove empty tooltip (#5050) 2025-07-21 12:45:48 -07:00
Rei Meguro
08b6b1f8b3 feat: Search and Answer Quality Test Script (#4974)
* aefads

* search quality tests improvement

Co-authored-by: wenxi-onyx <wenxi@onyx.app>

* nits

* refactor: config refactor

* document context + skip genai fix

* feat: answer eval

* more error messages

* mypy ragas

* mypy

* small fixes

* feat: more metrics

* fix

* feat: grab content

* typing

* feat: lazy updates

* mypy

* all at front

* feat: answer correctness

* use api key so it works with auth enabled

* update readme

* feat: auto add path

* feat: rate limit

* fix: readme + remove rerank all

* fix: raise exception immediately

* docs: improved clarity

* feat: federated handling

* fix: mypy

* nits

---------

Co-authored-by: wenxi-onyx <wenxi@onyx.app>
2025-07-19 01:51:51 +00:00
joachim-danswer
afed1a4b37 feat: KG improvements (#5048)
* improvements

* drop views if SQL fails

* mypy fix
2025-07-18 16:15:11 -07:00
Chris Weaver
bca18cacdf fix: improve assistant fetching efficiency (#5047)
* Improve assistant fetching efficiency

* More fix

* Fix weird build stuff

* Improve
2025-07-18 14:16:10 -07:00
Chris Weaver
335db91803 fix: improve check for indexing status (#5042)
* Improve check_for_indexing + check_for_vespa_sync_task

* Remove unused

* Fix

* Simplify query

* Add more logging

* Address bot comments

* Increase # of tasks generated since we're not going cc-pair by cc-pair

* Only index 50 user files at a time
2025-07-17 23:52:51 -07:00
Chris Weaver
67c488ff1f Improve support for non-default postgres schemas (#5046) 2025-07-17 23:51:39 -07:00
Wenxi
deb7f13962 remove chat session necessity from send message simple api (#5040) 2025-07-17 23:23:46 +00:00
Raunak Bhagat
e2d3d65c60 fix: Move around group-sync tests (since they require docker services to be running) (#5041)
* Move around tests

* Add missing fixtures + change directory structure up some more

* Add env variables
2025-07-17 22:41:31 +00:00
Raunak Bhagat
b78a6834f5 fix: Have document show up before message starts streaming back (#5006)
* Have document show up before message starts streaming back

* Add docs
2025-07-17 10:17:57 -07:00
Raunak Bhagat
4abe90aa2c fix: Fix Confluence pagination (#5035)
* Re-implement pagination

* Add note

* Fix invalid integration test configs

* Fix other failing test

* Edit failing test

* Revert test

* Revert pagination size

* Add comment on yielding style

* Use fixture instead of manually initializing sql-engine

* Fix failing tests

* Move code back and copy-paste
2025-07-17 14:02:29 +00:00
Raunak Bhagat
de9568844b Add PR labeller job (#4611) 2025-07-16 18:28:18 -07:00
Evan Lohn
34268f9806 fix bug in index swap (#5036) 2025-07-16 23:09:17 +00:00
Chris Weaver
ed75678837 Add suggested helm resource limits (#5032)
* Add resource suggestions for helm

* Adjust README

* fix

* fix lint
2025-07-15 15:52:16 -07:00
Chris Weaver
3bb58a3dd3 Persona simplification r2 (#5031)
* Revert "Revert "Reduce amount of stuff we fetch on `/persona` (#4988)" (#5024)"

This reverts commit f7ed7cd3cd.

* Enhancements / fix re-render

* re-arrange

* greptile
2025-07-15 14:51:40 -07:00
Chris Weaver
4b02feef31 Add option to disable my documents (#5020)
* Add option to disable my documents

* cleanup
2025-07-14 23:16:14 -07:00
Chris Weaver
6a4d49f02e More pruning logging (#5027) 2025-07-14 12:55:12 -07:00
Chris Weaver
d1736187d3 Fix full tenant sync (#5026) 2025-07-14 10:56:40 -07:00
Wenxi
0e79b96091 Feature/revised internet search (#4994)
* remove unused pruning config

* add env vars

* internet search date time toggle

* revised internet search supporting multiple providers

* env var

* simplify retries and fix mypy issues

* greptile nits

* more mypy

* please mypy

* mypy final straw

* cursor vs. mypy

* simplify fields from provider results

* type-safe prompt, enum nit, provider enums, indexing doc processing change

---------

Co-authored-by: Wenxi Onyx <wenxi-onyx@Wenxis-MacBook-Pro.local>
2025-07-14 10:24:03 -07:00
Raunak Bhagat
ae302d473d Fix imap tests 2025-07-14 09:50:33 -07:00
Raunak Bhagat
feca4fda78 feat: Add frontend for email connector (#5008)
* Add basic structure for frontend email connector

* Update names of credentials-json keys

* Fix up configurations workflow

* Edit logic on how `mail_client` is used

- imaplib.IMAP4_SSL is supposed to be treated as an ephemeral object

* Edit helper name and add docs

* Fix invalid mailbox selection error

* Implement greptile suggestions

* Make recipients optional and add sender to primary-owners

* Add sender to external-access too; perform dedupe-ing of emails

* Simplify logic
2025-07-14 09:43:36 -07:00
Chris Weaver
f7ed7cd3cd Revert "Reduce amount of stuff we fetch on /persona (#4988)" (#5024)
This reverts commit adf48de652.
2025-07-14 09:20:50 -07:00
Chris Weaver
8377ab3ef2 Send over less data for document sets (#5018)
* Send over less data for document sets

* Fix type errors

* Fix tests

* Fixes

* Don't change packages
2025-07-13 22:47:05 +00:00
Chris Weaver
95c23bf870 Add full sync endpoint (#5019)
* Add full sync endpoint

* Update backend/ee/onyx/server/tenants/billing_api.py

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>

* Update backend/ee/onyx/server/tenants/billing_api.py

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>

* fix

---------

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
2025-07-13 13:59:19 -07:00
Chris Weaver
e49fb8f56d Fix pruning (#5017)
* Use better last_pruned time for never pruned connectors

* improved pruning req / refresh freq selections

* Small tweak

* Update web/src/lib/connectors/connectors.tsx

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>

---------

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
2025-07-12 14:46:00 -07:00
Chris Weaver
adf48de652 Reduce amount of stuff we fetch on /persona (#4988)
* claude stuff

* Send over less Assistant data

* more

* Fix build

* Fix mypy

* fix

* small tweak

* Address EL comments

* fix

* Fix build
2025-07-12 14:15:31 -07:00
Wenxi Onyx
bca2500438 personas no longer overwrite when same name 2025-07-12 10:57:01 -07:00
Raunak Bhagat
89f925662f feat: Add ability to specify vertex-ai model location (#4955)
* Make constant a global

* Add ability to specify vertex location

* Add period

* Add a hardcoding path to the frontend

* Add docs

* Add default value to `CustomConfigKey`

* Consume default value from custom-config-key on frontend

* Use markdown renderer instead

* Update description
2025-07-11 16:16:12 -07:00
Chris Weaver
b64c6d5d40 Skip federated connectors when document sets are specified (#5015) 2025-07-11 15:49:13 -07:00
Raunak Bhagat
36c63950a6 fix: More small IMAP backend fixes (#5014)
* Make recipients an optional header and add IMAP to recognized connectors

* Add sender to external-access; perform dedupe-ing of emails
2025-07-11 20:06:28 +00:00
Raunak Bhagat
3f31340e6f feat: Add support for Confluence Macros (#5001)
* Remove macro stylings from HTML tree

* Add params

* Handle multiple cases of `ac:structured-macro` being found.

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>

---------

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
2025-07-11 00:34:49 +00:00
Raunak Bhagat
6ac2258c2e Fixes for imap backend (#5011) 2025-07-10 23:58:28 +00:00
Weves
b4d3b43e8a Add more error handling for drive group sync 2025-07-10 18:33:43 -07:00
Rei Meguro
ca281b71e3 add missing slack scope 2025-07-10 17:37:57 -07:00
Wenxi
9bd5a1de7a check file size first and clarify processing logic (#4985)
* check file size first and clarify processing logic

* basic gdrive extraction clarity

* typo

---------

Co-authored-by: Wenxi Onyx <wenxi-onyx@Wenxis-MacBook-Pro.local>
2025-07-10 00:48:36 +00:00
Wenxi Onyx
d3c5a4fba0 add docx fallback 2025-07-09 17:46:15 -07:00
Chris Weaver
f50006ee63 Stop fetching channel info to make pages load faster (#5005) 2025-07-09 16:45:45 -07:00
Evan Lohn
e0092024af add minIO to README 2025-07-09 16:36:18 -07:00
Evan Lohn
675ef524b0 add minio to contributing instructions 2025-07-09 16:33:56 -07:00
Evan Lohn
240367c775 skip id migration based on env var 2025-07-09 13:37:54 -07:00
Chris Weaver
f0ed063860 Allow curators to create public connectors / document sets (#4972)
* Allow curators to create public connectors / document sets

* Address EL comments
2025-07-09 11:38:56 -07:00
Rei Meguro
bcf0ef0c87 feat: original query + better slack expansion 2025-07-09 09:23:03 -07:00
Rei Meguro
0c7a245a46 Revert "feat: original query + better slack expansion"
This reverts commit 583d82433a.
2025-07-09 20:15:15 +09:00
Rei Meguro
583d82433a feat: original query + better slack expansion 2025-07-09 20:11:24 +09:00
Chris Weaver
391e710b6e Slack federated search ux (#4969)
* slack_search.py

* rem

* fix: get elements

* feat: better stack message processing

* fix: mypy

* fix: url parsing

* refactor: federated search

* feat: proper chunking + source filters

* highlighting + source check

* feat: forced section insertion

* feat: multi slack api queries

* slack query expansion

* feat: max slack queries env

* limit slack search to avoid overloading the search

* Initial draft

* more

* simplify

* Improve modal

* Fix oauth flow

* Fully working version

* More niceties

* Improved cascade delete

* document set for fed connector UI

* Fix source filters + improve document set selection

* Improve callback modal

* fix: logging error + showing connectors in admin page user settings

* better log

* Fix mypy

* small rei comment

* Fix pydantic

* Improvements to modals

* feat: distributed pruning

* random fix

* greptile

* Encrypt token

* respect source filters + revert llm pruning ordering

* greptile nit

* feat: thread as context in slack search

* feat: slack doc ordering

* small improvements

* rebase

* Small improvements

* Fix web build

* try fix build

* Move to separate model file

* Use default_factory

* remove unused model

---------

Co-authored-by: Rei Meguro <36625832+Orbital-Web@users.noreply.github.com>
2025-07-08 21:35:51 -07:00
Raunak Bhagat
004e56a91b feat: IMAP connector (#4987)
* Implement fetching; still need to work on document parsing

* Add basic skeleton of parsing email bodies

* Add id field

* Add email body parsing

* Implement checkpointed imap-connector

* Add testing logic for basic iteration

* Add logic to get different header if "to" isn't present

- possible in mailing-list workflows

* Add ability to index specific mailboxes

* Add breaking when indexing has been fully exhausted

* Sanitize all mailbox names + add space between stripped strings after parsing

* Add multi-recipient parsing

* Change around semantic-identifier and title

* Add imap tests

* Add recipients and content assertions to tests

* Add envvars to github actions workflow file

* Remove encoding header

* Update logic to not immediately establish connection upon init of `ImapConnector`

* Add start and end datetime filtering + edit when connection is established / how login is done

* Remove content-type header

* Add note about guards

* Change default parameters to be `None` instead of `[]`

* Address comment on PR

* Implement more PR suggestions

* More PR suggestions

* Implement more PR suggestions

* Change up login/logout flow (PR suggestion)

* Move port number to be envvar

* Make globals variants in enum instead (PR suggestion)

* Fix more documentation related suggestions on PR

* Have the imap connector implement `CheckpointedConnectorWithPermSync` instead

* Add helper for loading all docs with permission syncing
2025-07-08 23:58:22 +00:00
Evan Lohn
103300798f Bugfix/drive doc ids3 (#4998)
* fix migration

* fix migration2

* cursor based pages

* correct vespa URL

* fix visit api index name

* use correct endpoint and query
2025-07-07 18:23:00 +00:00
Evan Lohn
8349d6f0ea Bugfix/drive doc ids (#4990)
* fixed id extraction in drive connector

* WIP migration

* full migration script

* migration works single tenant without duplicates

* tested single tenant with duplicate docs

* migrations and frontend

* tested multitenant

* fix connector tests

* make tests pass
2025-07-06 01:59:12 +00:00
Emerson Gomes
cd63bf6da9 Re-adding .epub file support (#4989)
.epub files were apparently overlooked and not allowed for upload in the frontend.
2025-07-05 07:48:55 -07:00
Rei Meguro
5f03e85195 fireflies metadata update (#4993)
* fireflies metadata

* str
2025-07-04 18:39:41 -07:00
Raunak Bhagat
cbdbfcab5e fix: Fix bug with incorrect model icon being shown (#4986)
* Fix bug with incorrect model icon being shown

* Update web/src/app/chat/input/LLMPopover.tsx

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>

* Update web/src/app/chat/input/LLMPopover.tsx

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>

* Update web/src/app/chat/input/LLMPopover.tsx

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>

* Update web/src/app/chat/input/LLMPopover.tsx

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>

* Add visibility to filtering

* Update the model names which are shown in the popup

* Fix incorrect llm updating bug

* Fix bug in which the provider name would be used instead

---------

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
2025-07-04 01:39:50 +00:00
SubashMohan
6918611287 remove check for folder assistant before uploading (#4975)
Co-authored-by: Subash <subash@onyx.app>
2025-07-03 09:25:03 -07:00
Chris Weaver
b0639add8f Fix migration (#4982) 2025-07-03 09:18:57 -07:00
Evan Lohn
7af10308d7 drive service account shared fixes (#4977)
* drive service account shared fixes

* oops

* ily greptile

* scrollable index attempt errors

* tentatively correct index errors page, needs testing

* mypy

* black

* better bounds in practice

* remove random failures

* remove console log

* CW
2025-07-02 16:56:32 -07:00
Rei Meguro
5e14f23507 mypy fix 2025-07-01 23:00:33 -07:00
Raunak Bhagat
0bf3a5c609 Add type ignore for dynamic sqlalchemy class (#4979) 2025-07-01 18:14:35 -07:00
Emerson Gomes
82724826ce Remove hardcoded image extraction flag for PDFs
PDFs currently always have their images extracted.
This will make use of the "Enable Image Extraction and Analysis" workspace configuration instead.
2025-07-01 13:57:36 -07:00
Wenxi
f9e061926a account for category prefix added by user (#4976)
Co-authored-by: Wenxi Onyx <wenxi-onyx@Wenxis-MacBook-Pro.local>
2025-07-01 10:39:46 -07:00
Chris Weaver
8afd07ff7a Small gdrive perm sync enhancement (#4973)
* Small gdrive perm sync enhancement

* Small enhancement
2025-07-01 09:33:45 -07:00
Evan Lohn
6523a38255 search speedup (#4971) 2025-07-01 01:41:27 +00:00
Yuhong Sun
264878a1c9 Onyx Metadata Header for File Connector (#4968) 2025-06-29 16:09:06 -07:00
Weves
e480946f8a Reduce frequency of heavy checks on primary for cloud 2025-06-28 17:56:34 -07:00
Evan Lohn
be25b1efbd perm sync validation framework (#4958)
* perm sync validation framework

* frontend fixes

* validate perm sync when getting runner

* attempt to fix integration tests

* added new file

* oops

* skipping salesforce test due to creds

* add todo
2025-06-28 19:57:54 +00:00
Chris Weaver
204493439b Move onyx_list_tenants.py to make sure it's in the image (#4966)
* Move onyx_list_tenants.py to make sure it's in the image

* Improve
2025-06-28 13:18:14 -07:00
Weves
106c685afb Remove CONCURRENTLY from migrations 2025-06-28 11:59:59 -07:00
Raunak Bhagat
809122fec3 fix: Fix bug in which emails would be fetched during initial indexing (#4959)
* Add new convenience method

* Fix bug in which emails would be fetched for initial indexing

* Improve tests for MS Teams connector

* Fix test_gdrive_perm_sync_with_real_data patching

* Protect against incorrect truthiness

---------

Co-authored-by: Weves <chrisweaver101@gmail.com>
2025-06-27 22:05:50 -07:00
Chris Weaver
c8741d8e9c Improve mt migration process (#4960)
* Improve MT migration process

* improve MT migrations

* Improve parallel migration

* Add additional options to env.py

* Improve script

* Remove script

* Simplify

* Address greptile comment

* Fix st migration

* fix run_alembic_migrations
2025-06-27 17:31:22 -07:00
Weves
885f01e6a7 Fix test_gdrive_perm_sync_with_real_data patching 2025-06-27 16:34:37 -07:00
Rei Meguro
3180a13cf1 source fix (#4956) 2025-06-27 13:20:42 -07:00
Rei Meguro
630ac31355 KG vespa error handling + separating relationship transfer & vespa updates (#4954)
* feat: move vespa at end in try block

* simplify query

* mypy

* added order by just in case for consistent pagination

* liveness probe

* kg_p check for both extraction and clustering

* fix: better vespa logging
2025-06-26 22:05:57 -07:00
Chris Weaver
80de62f47d Improve drive group sync (#4952)
* Improve drive group sync

* Improve group syncing approach

* Fix github action

* Improve tests

* address greptile
2025-06-26 20:14:35 -07:00
Raunak Bhagat
c75d42aa99 perf: Improve performance of MS Teams permission-syncing logic (#4953)
* Add function stubs for Teams

* Implement more boilerplate code

* Change structure of helper functions

* Implement teams perms for the initial index

* Make private functions start with underscore

* Implement slim_doc retrieval and fix up doc_sync

* Simplify how doc-sync is done

* Refactor jira doc-sync

* Make locally used function start with an underscore

* Update backend/ee/onyx/configs/app_configs.py

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>

* Add docstring to helper function

* Update tests

* Add an expected failure

* Address comment on PR

* Skip expert-info if user does not have a display-name

* Add doc comments

* Fix error in generic_doc_sync

* Move callback invocation to earlier in the loop

* Update tests to include proper list of user emails

* Update logic to grab user emails as well

* Only fetch expert-info if channel is not public

* Pull expert-info creation outside of loop

* Remove unnecessary call to `iter`

* Switch from `dataclass` to `BaseModel`

* Simplify boolean logic

* Simplify logic for determining if channel is public

* Remove unnecessary channel membership-type

* Add log-warns

* Only perform another API fetch if email is not present

* Address comments on PR

* Add message on assertion failure

* Address typo

* Make exception message more descriptive

---------

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
2025-06-27 01:41:01 +00:00
Raunak Bhagat
e1766bca55 feat: MS Teams permission syncing (#4934)
* Add function stubs for Teams

* Implement more boilerplate code

* Change structure of helper functions

* Implement teams perms for the initial index

* Make private functions start with underscore

* Implement slim_doc retrieval and fix up doc_sync

* Simplify how doc-sync is done

* Refactor jira doc-sync

* Make locally used function start with an underscore

* Update backend/ee/onyx/configs/app_configs.py

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>

* Add docstring to helper function

* Update tests

* Add an expected failure

* Address comment on PR

* Skip expert-info if user does not have a display-name

* Add doc comments

* Fix error in generic_doc_sync

* Move callback invocation to earlier in the loop

* Update tests to include proper list of user emails

* Update logic to grab user emails as well

* Only fetch expert-info if channel is not public

* Pull expert-info creation outside of loop

* Remove unnecessary call to `iter`

* Switch from `dataclass` to `BaseModel`

* Simplify boolean logic

* Simplify logic for determining if channel is public

* Remove unnecessary channel membership-type

* Add log-warns

---------

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
2025-06-26 22:36:09 +00:00
Rei Meguro
211102f5f0 kg cleanup + reintroducing deep extraction & classification (#4949)
* kg cleanup

* more cleanup

* fix: copy over _get_classification_content_from_call_chunks for content formatting

* added back deep extraction logic

* feat: making deep extraction and clustering work

* nit
2025-06-26 14:46:50 -07:00
Weves
c46cc4666f Fix query history 2 2025-06-25 21:35:53 -07:00
joachim-danswer
0b2536b82b expand definition of public 2025-06-25 20:01:09 -07:00
Rei Meguro
600a86f11d Add creator to linear (#4948)
* add creator to linear

* fix: mypy
2025-06-25 18:19:36 -07:00
Rei Meguro
4d97a03935 KG Attribute Overhaul + Processing Tests (#4933)
* feat: extract email

* title

* feat: new type definition

* working

* test and bugfix

* fix: set docid

* fix: mypy

* feat: show implied entities too

* fix import + migration

* fix: added random delay for vespa

* fix: mypy

* mypy again...

* fix: nit

* fix: mypy

* SOLUTION!

* fix

* cleanup

* fix: transfer

* nit

---------

Co-authored-by: joachim-danswer <joachim@danswer.ai>
2025-06-25 05:06:12 +00:00
Raunak Bhagat
5d7169f244 Implement JIRA permission syncing (#4899) 2025-06-24 23:59:26 +00:00
Wenxi
df9329009c curator bug fixes (#4941)
* curator bug fixes

* basic users default to my files

* fix admin param + move delete button

* fix trashcan admin only

---------

Co-authored-by: Wenxi Onyx <wenxi-onyx@Wenxis-MacBook-Pro.local>
2025-06-24 21:33:47 +00:00
Arun Philip
e74a0398dc Update Docker Compose restart policy to unless-stopped
Changed the restart policy to unless-stopped to ensure containers
automatically restart after failures or reboots but allow manual stop
without immediate restart.

This is preferable to always because it prevents containers from
restarting automatically after a manual stop, enabling controlled
shutdowns and maintenance without unintended restarts.
2025-06-24 13:27:50 -07:00
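In Docker Compose terms, the change above amounts to a one-line policy per service; a minimal fragment (service name illustrative):

```yaml
services:
  api_server:
    # Restart on crashes and host reboots, but stay down after `docker stop`.
    restart: unless-stopped
```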
SubashMohan
94c5822cb7 Add MinIO configuration to env template and update restart script for MinIO container (#4944)
Co-authored-by: Subash <subash@onyx.app>
2025-06-24 17:21:16 +00:00
joachim-danswer
dedac55098 KG extraction without vespa queries (#4940)
* no vespa in extraction

* prompt/flow improvements

* EL comments

* nit

* Updated get_session_with_current_tenant import

---------

Co-authored-by: Rei Meguro <36625832+Orbital-Web@users.noreply.github.com>
2025-06-24 15:02:50 +00:00
Chris Weaver
2bbab5cefe Handle very long file names (#4939)
* Handle very long file names

* Add logging

* Enhancements

* EL comments
2025-06-23 19:22:02 -07:00
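One common way to "handle very long file names" is to truncate to a byte budget (many filesystems cap a path component at 255 bytes) and append a short hash so truncated names remain distinct. This is an assumed strategy for illustration, not necessarily what the PR implements:

```python
import hashlib
import os

MAX_NAME_BYTES = 255  # typical per-component filesystem limit


def shorten_file_name(name: str, max_bytes: int = MAX_NAME_BYTES) -> str:
    """Truncate an over-long file name, keeping the extension and adding a hash."""
    raw = name.encode("utf-8")
    if len(raw) <= max_bytes:
        return name
    stem, ext = os.path.splitext(name)
    digest = hashlib.sha256(raw).hexdigest()[:8]
    # Byte budget for the stem: total minus extension, hash, and "_" separator.
    budget = max_bytes - len(ext.encode("utf-8")) - len(digest) - 1
    short_stem = stem.encode("utf-8")[:budget].decode("utf-8", errors="ignore")
    return f"{short_stem}_{digest}{ext}"
```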
joachim-danswer
4bef718fad fix kg db proxy (#4942) 2025-06-23 18:27:59 -07:00
Chris Weaver
e7376e9dc2 Add support for db proxy (#4932)
* Split up engine file

* Switch to schema_translate_map

* Fix mass search/replace

* Remove unused

* Fix mypy

* Fix

* Add back __init__.py

* kg fix for new session management

Adding "<tenant_id>" in front of all views.

* additional kg fix

* better handling

* improve naming

---------

Co-authored-by: joachim-danswer <joachim@danswer.ai>
2025-06-23 17:19:07 -07:00
Raunak Bhagat
8d5136fe8b Fix error in which curator sidebars were hitting kg-exposed endpoint 2025-06-23 17:07:11 -07:00
joachim-danswer
3272050975 docker dev and prod template (#4936)
* docker dev and prod template

* more dev files
2025-06-23 21:43:42 +00:00
Weves
1960714042 Fix query history 2025-06-23 14:32:14 -07:00
Weves
5bddb2632e Fix parallel tool calls 2025-06-23 09:50:44 -07:00
Raunak Bhagat
5cd055dab8 Add minor type-checking fixes (#4916) 2025-06-23 13:34:40 +00:00
Raunak Bhagat
fa32b7f21e Update ruff and remove ruff-formating from pr checks (#4914) 2025-06-23 05:34:34 -07:00
Rei Meguro
37f7227000 fix: too many vespa request fix (#4931) 2025-06-22 14:31:42 -07:00
Chris Weaver
c1f9a9d122 Hubspot connector enhancements (#4927)
* Enhance hubspot connector

* Add companies, deals, and tickets

* improve typing

* Add HUBSPOT_ACCESS_TOKEN to connector tests

* Fix prettier

* Fix mypy

* Address JR comments
2025-06-22 13:54:04 -07:00
Rei Meguro
045b7cc7e2 feat: comma separated citations (#4923)
* feat: comma separated citations

* nit

* fix

* fix: comment
2025-06-21 22:51:32 +00:00
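The idea behind comma-separated citations can be sketched as collapsing runs of adjacent markers like "[1][2][3]" into one group "[1, 2, 3]". The product's actual marker syntax may differ; this is only illustrative:

```python
import re


def merge_adjacent_citations(text: str) -> str:
    """Collapse runs of bracketed citation numbers into one comma-separated group."""

    def _merge(match: re.Match) -> str:
        nums = re.findall(r"\d+", match.group(0))
        return "[" + ", ".join(nums) + "]"

    # Two or more [N] markers in a row, possibly whitespace-separated.
    return re.sub(r"\[\d+\](?:\s*\[\d+\])+", _merge, text)
```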
joachim-danswer
970e07a93b Forcing vespa language 2025-06-21 16:12:13 -07:00
joachim-danswer
d463a3f213 KG Updates (#4925)
* updates

 - no classification if deep extraction is False
 - separate names for views in LLM generation
 - better prompts
 - any relationship type provided to LLM that relates to identified entities

* CW feedback/comment update
2025-06-21 20:16:39 +00:00
Wenxi
4ba44c5e48 Fix no subject gmail docs (#4922)
Co-authored-by: Wenxi Onyx <wenxi-onyx@Wenxis-MacBook-Pro.local>
2025-06-20 23:22:49 +00:00
Chris Weaver
6f8176092e S3 like file store (#4897)
* Move to an S3-like file store

* Add non-mocked test

* Add S3 tests

* Improve migration / add auto-running tests

* Refactor

* Fix mypy

* Small fixes

* Improve migration to handle downgrades

* fix file store tests

* Fix file store tests again

* Fix file store tests again

* Fix mypy

* Fix default values

* Add MinIO to other compose files

* Working helm w/ minio

* Fix test

* Address greptile comments

* Harden migration

* Fix README

* Fix it

* Address more greptile comments

* Fix it

* Rebase

* Handle multi-tenant case

* Fix mypy

* Fix test

* fix test

* Improve migration

* Fix test
2025-06-20 14:22:05 -07:00
Wenxi
198ec417ba fix gemini model names + add vertex claude sonnet 4 (#4920)
* fix gemini model names + add vertex claude sonnet 4

* few more models

---------

Co-authored-by: Wenxi Onyx <wenxi-onyx@Wenxis-MacBook-Pro.local>
2025-06-20 18:18:36 +00:00
Wenxi
fbdf7798cf GCS metadata processing (#4879)
* GCS metadata processing

* Unprocessable files should still be indexed to be searched by title

* Moved re-used logic to utils. Combined file metadata PR with GCS metadata changes

* Added OnyxMetadata type, adjusted timestamp naming consistency, clarified timestamp logic

* Use BaseModel

---------

Co-authored-by: Wenxi Onyx <wenxi-onyx@Wenxis-MacBook-Pro.local>
2025-06-20 16:11:38 +00:00
Weves
7bd9c856aa Really add psql to api-server 2025-06-19 18:50:17 -07:00
Rei Meguro
948c719d73 fix (#4915) 2025-06-19 23:06:34 +00:00
580 changed files with 27746 additions and 10479 deletions

@@ -0,0 +1,94 @@
name: External Dependency Unit Tests

on:
  merge_group:
  pull_request:
    branches: [main]

env:
  # AWS
  S3_AWS_ACCESS_KEY_ID: ${{ secrets.S3_AWS_ACCESS_KEY_ID }}
  S3_AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_AWS_SECRET_ACCESS_KEY }}
  # MinIO
  S3_ENDPOINT_URL: "http://localhost:9004"
  # Confluence
  CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
  CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
  CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
  CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
  CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
  CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}

jobs:
  discover-test-dirs:
    runs-on: ubuntu-latest
    outputs:
      test-dirs: ${{ steps.set-matrix.outputs.test-dirs }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Discover test directories
        id: set-matrix
        run: |
          # Find all subdirectories in backend/tests/external_dependency_unit
          dirs=$(find backend/tests/external_dependency_unit -mindepth 1 -maxdepth 1 -type d -exec basename {} \; | sort | jq -R -s -c 'split("\n")[:-1]')
          echo "test-dirs=$dirs" >> $GITHUB_OUTPUT

  external-dependency-unit-tests:
    needs: discover-test-dirs
    # See https://runs-on.com/runners/linux/
    runs-on: [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}"]
    strategy:
      fail-fast: false
      matrix:
        test-dir: ${{ fromJson(needs.discover-test-dirs.outputs.test-dirs) }}
    env:
      PYTHONPATH: ./backend
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
          cache: "pip"
          cache-dependency-path: |
            backend/requirements/default.txt
            backend/requirements/dev.txt
      - name: Install Dependencies
        run: |
          python -m pip install --upgrade pip
          pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
          pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
          playwright install chromium
          playwright install-deps chromium
      - name: Set up Standard Dependencies
        run: |
          cd deployment/docker_compose
          docker compose -f docker-compose.dev.yml -p onyx-stack up -d minio relational_db cache index
      - name: Run migrations
        run: |
          cd backend
          alembic upgrade head
      - name: Run Tests for ${{ matrix.test-dir }}
        shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
        run: |
          py.test \
            -n 8 \
            --dist loadfile \
            --durations=8 \
            -o junit_family=xunit2 \
-xv \
--ff \
backend/tests/external_dependency_unit/${{ matrix.test-dir }}


@@ -16,6 +16,9 @@ env:
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}
JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}
PLATFORM_PAIR: linux-amd64
jobs:
@@ -266,6 +269,9 @@ jobs:
-e CONFLUENCE_TEST_SPACE_URL=${CONFLUENCE_TEST_SPACE_URL} \
-e CONFLUENCE_USER_NAME=${CONFLUENCE_USER_NAME} \
-e CONFLUENCE_ACCESS_TOKEN=${CONFLUENCE_ACCESS_TOKEN} \
-e JIRA_BASE_URL=${JIRA_BASE_URL} \
-e JIRA_USER_EMAIL=${JIRA_USER_EMAIL} \
-e JIRA_API_TOKEN=${JIRA_API_TOKEN} \
-e TEST_WEB_HOSTNAME=test-runner \
-e MOCK_CONNECTOR_SERVER_HOST=mock_connector_server \
-e MOCK_CONNECTOR_SERVER_PORT=8001 \

.github/workflows/pr-labeler.yml

@@ -0,0 +1,38 @@
name: PR Labeler
on:
pull_request_target:
branches:
- main
types:
- opened
- reopened
- synchronize
- edited
permissions:
contents: read
pull-requests: write
jobs:
validate_pr_title:
runs-on: ubuntu-latest
steps:
- name: Check PR title for Conventional Commits
env:
PR_TITLE: ${{ github.event.pull_request.title }}
run: |
echo "PR Title: $PR_TITLE"
if [[ ! "$PR_TITLE" =~ ^(feat|fix|docs|test|ci|refactor|perf|chore|revert|build)(\(.+\))?:\ .+ ]]; then
echo "::error::❌ Your PR title does not follow the Conventional Commits format.
This check ensures that all pull requests use clear, consistent titles that help automate changelogs and improve project history.
Please update your PR title to follow the Conventional Commits style.
Here is a link to a blog explaining the reason why we've included the Conventional Commits style into our PR titles: https://xfuture-blog.com/working-with-conventional-commits
**Here are some examples of valid PR titles:**
- feat: add user authentication
- fix(login): handle null password error
- docs(readme): update installation instructions"
exit 1
fi
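The title check above reduces to a single pattern match. A minimal Python translation of the same regex (the helper name and sample titles are illustrative, not part of the workflow):

```python
import re

# Python translation of the bash pattern used by the PR Labeler workflow:
# ^(feat|fix|docs|test|ci|refactor|perf|chore|revert|build)(\(.+\))?:\ .+
PR_TITLE_RE = re.compile(
    r"^(feat|fix|docs|test|ci|refactor|perf|chore|revert|build)(\(.+\))?: .+"
)

def is_valid_pr_title(title: str) -> bool:
    """Return True if the title follows the Conventional Commits format."""
    return PR_TITLE_RE.match(title) is not None

# Examples mirroring the workflow's error message:
assert is_valid_pr_title("feat: add user authentication")
assert is_valid_pr_title("fix(login): handle null password error")
assert not is_valid_pr_title("update stuff")   # missing type prefix
assert not is_valid_pr_title("feat:no space")  # missing space after the colon
```

The scope in parentheses is optional, but the colon must be followed by a space and a non-empty description.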


@@ -16,6 +16,9 @@ env:
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}
JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}
PLATFORM_PAIR: linux-amd64
jobs:
integration-tests-mit:
@@ -201,6 +204,9 @@ jobs:
-e CONFLUENCE_TEST_SPACE_URL=${CONFLUENCE_TEST_SPACE_URL} \
-e CONFLUENCE_USER_NAME=${CONFLUENCE_USER_NAME} \
-e CONFLUENCE_ACCESS_TOKEN=${CONFLUENCE_ACCESS_TOKEN} \
-e JIRA_BASE_URL=${JIRA_BASE_URL} \
-e JIRA_USER_EMAIL=${JIRA_USER_EMAIL} \
-e JIRA_API_TOKEN=${JIRA_API_TOKEN} \
-e TEST_WEB_HOSTNAME=test-runner \
-e MOCK_CONNECTOR_SERVER_HOST=mock_connector_server \
-e MOCK_CONNECTOR_SERVER_PORT=8001 \


@@ -47,7 +47,7 @@ jobs:
-i /local/openapi.json \
-g python \
-o /local/onyx_openapi_client \
--package-name onyx_openapi_client
--package-name onyx_openapi_client \
- name: Run MyPy
run: |


@@ -16,12 +16,13 @@ env:
# Confluence
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
# Jira
JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}
JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}
@@ -49,6 +50,15 @@ env:
SF_PASSWORD: ${{ secrets.SF_PASSWORD }}
SF_SECURITY_TOKEN: ${{ secrets.SF_SECURITY_TOKEN }}
# Hubspot
HUBSPOT_ACCESS_TOKEN: ${{ secrets.HUBSPOT_ACCESS_TOKEN }}
# IMAP
IMAP_HOST: ${{ secrets.IMAP_HOST }}
IMAP_USERNAME: ${{ secrets.IMAP_USERNAME }}
IMAP_PASSWORD: ${{ secrets.IMAP_PASSWORD }}
IMAP_MAILBOXES: ${{ secrets.IMAP_MAILBOXES }}
# Airtable
AIRTABLE_TEST_BASE_ID: ${{ secrets.AIRTABLE_TEST_BASE_ID }}
AIRTABLE_TEST_TABLE_ID: ${{ secrets.AIRTABLE_TEST_TABLE_ID }}


@@ -45,8 +45,9 @@ PYTHONPATH=../backend
PYTHONUNBUFFERED=1
# Internet Search
# Internet Search
BING_API_KEY=<REPLACE THIS>
EXA_API_KEY=<REPLACE THIS>
# Enable the full set of Danswer Enterprise Edition features
@@ -58,3 +59,9 @@ AGENT_RETRIEVAL_STATS=False # Note: This setting will incur substantial re-ran
AGENT_RERANKING_STATS=True
AGENT_MAX_QUERY_RETRIEVAL_RESULTS=20
AGENT_RERANKING_MAX_QUERY_RETRIEVAL_RESULTS=20
# S3 File Store Configuration (MinIO for local development)
S3_ENDPOINT_URL=http://localhost:9004
S3_FILE_STORE_BUCKET_NAME=onyx-file-store-bucket
S3_AWS_ACCESS_KEY_ID=minioadmin
S3_AWS_SECRET_ACCESS_KEY=minioadmin


@@ -24,8 +24,8 @@
"Celery primary",
"Celery light",
"Celery heavy",
"Celery indexing",
"Celery user files indexing",
"Celery docfetching",
"Celery docprocessing",
"Celery beat",
"Celery monitoring"
],
@@ -46,8 +46,8 @@
"Celery primary",
"Celery light",
"Celery heavy",
"Celery indexing",
"Celery user files indexing",
"Celery docfetching",
"Celery docprocessing",
"Celery beat",
"Celery monitoring"
],
@@ -226,35 +226,66 @@
"consoleTitle": "Celery heavy Console"
},
{
"name": "Celery indexing",
"name": "Celery docfetching",
"type": "debugpy",
"request": "launch",
"module": "celery",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"ENABLE_MULTIPASS_INDEXING": "false",
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
},
"args": [
"-A",
"onyx.background.celery.versioned_apps.indexing",
"worker",
"--pool=threads",
"--concurrency=1",
"--prefetch-multiplier=1",
"--loglevel=INFO",
"--hostname=indexing@%n",
"-Q",
"connector_indexing"
"-A",
"onyx.background.celery.versioned_apps.docfetching",
"worker",
"--pool=threads",
"--concurrency=1",
"--prefetch-multiplier=1",
"--loglevel=INFO",
"--hostname=docfetching@%n",
"-Q",
"connector_doc_fetching,user_files_indexing"
],
"presentation": {
"group": "2"
"group": "2"
},
"consoleTitle": "Celery indexing Console"
},
"consoleTitle": "Celery docfetching Console",
"justMyCode": false
},
{
"name": "Celery docprocessing",
"type": "debugpy",
"request": "launch",
"module": "celery",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"ENABLE_MULTIPASS_INDEXING": "false",
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
},
"args": [
"-A",
"onyx.background.celery.versioned_apps.docprocessing",
"worker",
"--pool=threads",
"--concurrency=6",
"--prefetch-multiplier=1",
"--loglevel=INFO",
"--hostname=docprocessing@%n",
"-Q",
"docprocessing"
],
"presentation": {
"group": "2"
},
"consoleTitle": "Celery docprocessing Console",
"justMyCode": false
},
{
"name": "Celery monitoring",
"type": "debugpy",
@@ -303,35 +334,6 @@
},
"consoleTitle": "Celery beat Console"
},
{
"name": "Celery user files indexing",
"type": "debugpy",
"request": "launch",
"module": "celery",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
},
"args": [
"-A",
"onyx.background.celery.versioned_apps.indexing",
"worker",
"--pool=threads",
"--concurrency=1",
"--prefetch-multiplier=1",
"--loglevel=INFO",
"--hostname=user_files_indexing@%n",
"-Q",
"user_files_indexing"
],
"presentation": {
"group": "2"
},
"consoleTitle": "Celery user files indexing Console"
},
{
"name": "Pytest",
"consoleName": "Pytest",
@@ -426,7 +428,7 @@
},
"args": [
"--filename",
"generated/openapi.json",
"generated/openapi.json"
]
},
{


@@ -59,6 +59,7 @@ Onyx being a fully functional app, relies on some external software, specificall
- [Postgres](https://www.postgresql.org/) (Relational DB)
- [Vespa](https://vespa.ai/) (Vector DB/Search Engine)
- [Redis](https://redis.io/) (Cache)
- [MinIO](https://min.io/) (File Store)
- [Nginx](https://nginx.org/) (Not needed for development flows generally)
> **Note:**
@@ -171,10 +172,10 @@ Otherwise, you can follow the instructions below to run the application for deve
You will need Docker installed to run these containers.
First navigate to `onyx/deployment/docker_compose`, then start up Postgres/Vespa/Redis with:
First navigate to `onyx/deployment/docker_compose`, then start up Postgres/Vespa/Redis/MinIO with:
```bash
docker compose -f docker-compose.dev.yml -p onyx-stack up -d index relational_db cache
docker compose -f docker-compose.dev.yml -p onyx-stack up -d index relational_db cache minio
```
(index refers to Vespa, relational_db refers to Postgres, and cache refers to Redis)


@@ -37,8 +37,7 @@ RUN apt-get update && \
pkg-config \
gcc \
nano \
vim \
postgresql-client && \
vim && \
rm -rf /var/lib/apt/lists/* && \
apt-get clean
@@ -78,6 +77,9 @@ RUN apt-get update && \
rm -rf /var/lib/apt/lists/* && \
rm -f /usr/local/lib/python3.11/site-packages/tornado/test/test.key
# Install postgresql-client for easy manual tests
# Install it here to avoid it being cleaned up above
RUN apt-get update && apt-get install -y postgresql-client
# Pre-downloading models for setups with limited egress
RUN python -c "from tokenizers import Tokenizer; \


@@ -20,3 +20,44 @@ To run all un-applied migrations:
To undo migrations:
`alembic downgrade -X`
where X is the number of migrations you want to undo from the current state
### Multi-tenant migrations
For multi-tenant deployments, you can use additional options:
**Upgrade all tenants:**
```bash
alembic -x upgrade_all_tenants=true upgrade head
```
**Upgrade specific schemas:**
```bash
# Single schema
alembic -x schemas=tenant_12345678-1234-1234-1234-123456789012 upgrade head
# Multiple schemas (comma-separated)
alembic -x schemas=tenant_12345678-1234-1234-1234-123456789012,public,another_tenant upgrade head
```
**Upgrade tenants within an alphabetical range:**
```bash
# Upgrade tenants 100-200 when sorted alphabetically (positions 100 to 200)
alembic -x upgrade_all_tenants=true -x tenant_range_start=100 -x tenant_range_end=200 upgrade head
# Upgrade tenants starting from position 1000 alphabetically
alembic -x upgrade_all_tenants=true -x tenant_range_start=1000 upgrade head
# Upgrade first 500 tenants alphabetically
alembic -x upgrade_all_tenants=true -x tenant_range_end=500 upgrade head
```
**Continue on error (for batch operations):**
```bash
alembic -x upgrade_all_tenants=true -x continue=true upgrade head
```
The tenant range filtering works by:
1. Sorting tenant IDs alphabetically
2. Using 1-based position numbers (1st, 2nd, 3rd tenant, etc.)
3. Filtering to the specified range of positions
4. Non-tenant schemas (like 'public') are always included


@@ -1,12 +1,12 @@
from typing import Any, Literal
from onyx.db.engine import get_iam_auth_token
from onyx.db.engine.iam_auth import get_iam_auth_token
from onyx.configs.app_configs import USE_IAM_AUTH
from onyx.configs.app_configs import POSTGRES_HOST
from onyx.configs.app_configs import POSTGRES_PORT
from onyx.configs.app_configs import POSTGRES_USER
from onyx.configs.app_configs import AWS_REGION_NAME
from onyx.db.engine import build_connection_string
from onyx.db.engine import get_all_tenant_ids
from onyx.db.engine.sql_engine import build_connection_string
from onyx.db.engine.tenant_utils import get_all_tenant_ids
from sqlalchemy import event
from sqlalchemy import pool
from sqlalchemy import text
@@ -21,10 +21,14 @@ from alembic import context
from sqlalchemy.ext.asyncio import create_async_engine
from sqlalchemy.sql.schema import SchemaItem
from onyx.configs.constants import SSL_CERT_FILE
from shared_configs.configs import MULTI_TENANT, POSTGRES_DEFAULT_SCHEMA
from shared_configs.configs import (
MULTI_TENANT,
POSTGRES_DEFAULT_SCHEMA,
TENANT_ID_PREFIX,
)
from onyx.db.models import Base
from celery.backends.database.session import ResultModelBase # type: ignore
from onyx.db.engine import SqlEngine
from onyx.db.engine.sql_engine import SqlEngine
# Make sure in alembic.ini [logger_root] level=INFO is set or most logging will be
# hidden! (defaults to level=WARN)
@@ -69,15 +73,67 @@ def include_object(
return True
def get_schema_options() -> tuple[str, bool, bool, bool]:
def filter_tenants_by_range(
tenant_ids: list[str], start_range: int | None = None, end_range: int | None = None
) -> list[str]:
"""
Filter tenant IDs by alphabetical position range.
Args:
tenant_ids: List of tenant IDs to filter
start_range: Starting position in alphabetically sorted list (1-based, inclusive)
end_range: Ending position in alphabetically sorted list (1-based, inclusive)
Returns:
Filtered list of tenant IDs in their original order
"""
if start_range is None and end_range is None:
return tenant_ids
# Separate tenant IDs from non-tenant schemas
tenant_schemas = [tid for tid in tenant_ids if tid.startswith(TENANT_ID_PREFIX)]
non_tenant_schemas = [
tid for tid in tenant_ids if not tid.startswith(TENANT_ID_PREFIX)
]
# Sort tenant schemas alphabetically.
# NOTE: can cause missed schemas if a schema is created in between workers
# fetching of all tenant IDs. We accept this risk for now. Just re-running
# the migration will fix the issue.
sorted_tenant_schemas = sorted(tenant_schemas)
# Apply range filtering (0-based indexing)
start_idx = start_range if start_range is not None else 0
end_idx = end_range if end_range is not None else len(sorted_tenant_schemas)
# Ensure indices are within bounds
start_idx = max(0, start_idx)
end_idx = min(len(sorted_tenant_schemas), end_idx)
# Get the filtered tenant schemas
filtered_tenant_schemas = sorted_tenant_schemas[start_idx:end_idx]
# Combine with non-tenant schemas and preserve original order
filtered_tenants = []
for tenant_id in tenant_ids:
if tenant_id in filtered_tenant_schemas or tenant_id in non_tenant_schemas:
filtered_tenants.append(tenant_id)
return filtered_tenants
def get_schema_options() -> (
tuple[bool, bool, bool, int | None, int | None, list[str] | None]
):
x_args_raw = context.get_x_argument()
x_args = {}
for arg in x_args_raw:
for pair in arg.split(","):
if "=" in pair:
key, value = pair.split("=", 1)
x_args[key.strip()] = value.strip()
schema_name = x_args.get("schema", POSTGRES_DEFAULT_SCHEMA)
if "=" in arg:
key, value = arg.split("=", 1)
x_args[key.strip()] = value.strip()
else:
raise ValueError(f"Invalid argument: {arg}")
create_schema = x_args.get("create_schema", "true").lower() == "true"
upgrade_all_tenants = x_args.get("upgrade_all_tenants", "false").lower() == "true"
@@ -85,17 +141,81 @@ def get_schema_options() -> tuple[str, bool, bool, bool]:
# only applies to online migrations
continue_on_error = x_args.get("continue", "false").lower() == "true"
if (
MULTI_TENANT
and schema_name == POSTGRES_DEFAULT_SCHEMA
and not upgrade_all_tenants
):
# Tenant range filtering
tenant_range_start = None
tenant_range_end = None
if "tenant_range_start" in x_args:
try:
tenant_range_start = int(x_args["tenant_range_start"])
except ValueError:
raise ValueError(
f"Invalid tenant_range_start value: {x_args['tenant_range_start']}. Must be an integer."
)
if "tenant_range_end" in x_args:
try:
tenant_range_end = int(x_args["tenant_range_end"])
except ValueError:
raise ValueError(
f"Invalid tenant_range_end value: {x_args['tenant_range_end']}. Must be an integer."
)
# Validate range
if tenant_range_start is not None and tenant_range_end is not None:
if tenant_range_start > tenant_range_end:
raise ValueError(
f"tenant_range_start ({tenant_range_start}) cannot be greater than tenant_range_end ({tenant_range_end})"
)
# Specific schema names filtering (replaces both schema_name and the old tenant_ids approach)
schemas = None
if "schemas" in x_args:
schema_names_str = x_args["schemas"].strip()
if schema_names_str:
# Split by comma and strip whitespace
schemas = [
name.strip() for name in schema_names_str.split(",") if name.strip()
]
if schemas:
logger.info(f"Specific schema names specified: {schemas}")
# Validate that only one method is used at a time
range_filtering = tenant_range_start is not None or tenant_range_end is not None
specific_filtering = schemas is not None and len(schemas) > 0
if range_filtering and specific_filtering:
raise ValueError(
"Cannot run default migrations in public schema when multi-tenancy is enabled. "
"Please specify a tenant-specific schema."
"Cannot use both tenant range filtering (tenant_range_start/tenant_range_end) "
"and specific schema filtering (schemas) at the same time. "
"Please use only one filtering method."
)
return schema_name, create_schema, upgrade_all_tenants, continue_on_error
if upgrade_all_tenants and specific_filtering:
raise ValueError(
"Cannot use both upgrade_all_tenants=true and schemas at the same time. "
"Use either upgrade_all_tenants=true for all tenants, or schemas for specific schemas."
)
# If any filtering parameters are specified, we're not doing the default single schema migration
if range_filtering:
upgrade_all_tenants = True
# Validate multi-tenant requirements
if MULTI_TENANT and not upgrade_all_tenants and not specific_filtering:
raise ValueError(
"In multi-tenant mode, you must specify either upgrade_all_tenants=true "
"or provide schemas. Cannot run default migration."
)
return (
create_schema,
upgrade_all_tenants,
continue_on_error,
tenant_range_start,
tenant_range_end,
schemas,
)
def do_run_migrations(
@@ -142,12 +262,17 @@ def provide_iam_token_for_alembic(
async def run_async_migrations() -> None:
(
schema_name,
create_schema,
upgrade_all_tenants,
continue_on_error,
tenant_range_start,
tenant_range_end,
schemas,
) = get_schema_options()
if not schemas and not MULTI_TENANT:
schemas = [POSTGRES_DEFAULT_SCHEMA]
# without init_engine, subsequent engine calls fail hard intentionally
SqlEngine.init_engine(pool_size=20, max_overflow=5)
@@ -164,12 +289,50 @@ async def run_async_migrations() -> None:
) -> None:
provide_iam_token_for_alembic(dialect, conn_rec, cargs, cparams)
if upgrade_all_tenants:
if schemas:
# Use specific schema names directly without fetching all tenants
logger.info(f"Migrating specific schema names: {schemas}")
i_schema = 0
num_schemas = len(schemas)
for schema in schemas:
i_schema += 1
logger.info(
f"Migrating schema: index={i_schema} num_schemas={num_schemas} schema={schema}"
)
try:
async with engine.connect() as connection:
await connection.run_sync(
do_run_migrations,
schema_name=schema,
create_schema=create_schema,
)
except Exception as e:
logger.error(f"Error migrating schema {schema}: {e}")
if not continue_on_error:
logger.error("--continue=true is not set, raising exception!")
raise
logger.warning("--continue=true is set, continuing to next schema.")
elif upgrade_all_tenants:
tenant_schemas = get_all_tenant_ids()
filtered_tenant_schemas = filter_tenants_by_range(
tenant_schemas, tenant_range_start, tenant_range_end
)
if tenant_range_start is not None or tenant_range_end is not None:
logger.info(
f"Filtering tenants by range: start={tenant_range_start}, end={tenant_range_end}"
)
logger.info(
f"Total tenants: {len(tenant_schemas)}, Filtered tenants: {len(filtered_tenant_schemas)}"
)
i_tenant = 0
num_tenants = len(tenant_schemas)
for schema in tenant_schemas:
num_tenants = len(filtered_tenant_schemas)
for schema in filtered_tenant_schemas:
i_tenant += 1
logger.info(
f"Migrating schema: index={i_tenant} num_tenants={num_tenants} schema={schema}"
@@ -190,17 +353,13 @@ async def run_async_migrations() -> None:
logger.warning("--continue=true is set, continuing to next schema.")
else:
try:
logger.info(f"Migrating schema: {schema_name}")
async with engine.connect() as connection:
await connection.run_sync(
do_run_migrations,
schema_name=schema_name,
create_schema=create_schema,
)
except Exception as e:
logger.error(f"Error migrating schema {schema_name}: {e}")
raise
# This should not happen in the new design since we require either
# upgrade_all_tenants=true or schemas in multi-tenant mode
# and for non-multi-tenant mode, we should use schemas with the default schema
raise ValueError(
"No migration target specified. Use either upgrade_all_tenants=true for all tenants "
"or schemas for specific schemas."
)
await engine.dispose()
@@ -221,10 +380,37 @@ def run_migrations_offline() -> None:
# without init_engine, subsequent engine calls fail hard intentionally
SqlEngine.init_engine(pool_size=20, max_overflow=5)
schema_name, _, upgrade_all_tenants, continue_on_error = get_schema_options()
(
create_schema,
upgrade_all_tenants,
continue_on_error,
tenant_range_start,
tenant_range_end,
schemas,
) = get_schema_options()
url = build_connection_string()
if upgrade_all_tenants:
if schemas:
# Use specific schema names directly without fetching all tenants
logger.info(f"Migrating specific schema names: {schemas}")
for schema in schemas:
logger.info(f"Migrating schema: {schema}")
context.configure(
url=url,
target_metadata=target_metadata, # type: ignore
literal_binds=True,
include_object=include_object,
version_table_schema=schema,
include_schemas=True,
script_location=config.get_main_option("script_location"),
dialect_opts={"paramstyle": "named"},
)
with context.begin_transaction():
context.run_migrations()
elif upgrade_all_tenants:
engine = create_async_engine(url)
if USE_IAM_AUTH:
@@ -238,7 +424,19 @@ def run_migrations_offline() -> None:
tenant_schemas = get_all_tenant_ids()
engine.sync_engine.dispose()
for schema in tenant_schemas:
filtered_tenant_schemas = filter_tenants_by_range(
tenant_schemas, tenant_range_start, tenant_range_end
)
if tenant_range_start is not None or tenant_range_end is not None:
logger.info(
f"Filtering tenants by range: start={tenant_range_start}, end={tenant_range_end}"
)
logger.info(
f"Total tenants: {len(tenant_schemas)}, Filtered tenants: {len(filtered_tenant_schemas)}"
)
for schema in filtered_tenant_schemas:
logger.info(f"Migrating schema: {schema}")
context.configure(
url=url,
@@ -254,21 +452,12 @@ def run_migrations_offline() -> None:
with context.begin_transaction():
context.run_migrations()
else:
logger.info(f"Migrating schema: {schema_name}")
context.configure(
url=url,
target_metadata=target_metadata, # type: ignore
literal_binds=True,
include_object=include_object,
version_table_schema=schema_name,
include_schemas=True,
script_location=config.get_main_option("script_location"),
dialect_opts={"paramstyle": "named"},
# This should not happen in the new design
raise ValueError(
"No migration target specified. Use either upgrade_all_tenants=true for all tenants "
"or schemas for specific schemas."
)
with context.begin_transaction():
context.run_migrations()
def run_migrations_online() -> None:
logger.info("run_migrations_online starting.")


@@ -0,0 +1,72 @@
"""add federated connector tables
Revision ID: 0816326d83aa
Revises: 12635f6655b7
Create Date: 2025-06-29 14:09:45.109518
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = "0816326d83aa"
down_revision = "12635f6655b7"
branch_labels = None
depends_on = None
def upgrade() -> None:
# Create federated_connector table
op.create_table(
"federated_connector",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("source", sa.String(), nullable=False),
sa.Column("credentials", sa.LargeBinary(), nullable=False),
sa.PrimaryKeyConstraint("id"),
)
# Create federated_connector_oauth_token table
op.create_table(
"federated_connector_oauth_token",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("federated_connector_id", sa.Integer(), nullable=False),
sa.Column("user_id", postgresql.UUID(as_uuid=True), nullable=False),
sa.Column("token", sa.LargeBinary(), nullable=False),
sa.Column("expires_at", sa.DateTime(), nullable=True),
sa.ForeignKeyConstraint(
["federated_connector_id"], ["federated_connector.id"], ondelete="CASCADE"
),
sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"),
sa.PrimaryKeyConstraint("id"),
)
# Create federated_connector__document_set table
op.create_table(
"federated_connector__document_set",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("federated_connector_id", sa.Integer(), nullable=False),
sa.Column("document_set_id", sa.Integer(), nullable=False),
sa.Column("entities", postgresql.JSONB(), nullable=False),
sa.ForeignKeyConstraint(
["federated_connector_id"], ["federated_connector.id"], ondelete="CASCADE"
),
sa.ForeignKeyConstraint(
["document_set_id"], ["document_set.id"], ondelete="CASCADE"
),
sa.PrimaryKeyConstraint("id"),
sa.UniqueConstraint(
"federated_connector_id",
"document_set_id",
name="uq_federated_connector_document_set",
),
)
def downgrade() -> None:
# Drop tables in reverse order due to foreign key dependencies
op.drop_table("federated_connector__document_set")
op.drop_table("federated_connector_oauth_token")
op.drop_table("federated_connector")


@@ -0,0 +1,596 @@
"""drive-canonical-ids
Revision ID: 12635f6655b7
Revises: 58c50ef19f08
Create Date: 2025-06-20 14:44:54.241159
"""
from alembic import op
import sqlalchemy as sa
from urllib.parse import urlparse, urlunparse
from httpx import HTTPStatusError
import httpx
from onyx.document_index.factory import get_default_document_index
from onyx.db.search_settings import SearchSettings
from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
from onyx.document_index.vespa.shared_utils.utils import (
replace_invalid_doc_id_characters,
)
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
from onyx.utils.logger import setup_logger
import os
logger = setup_logger()
# revision identifiers, used by Alembic.
revision = "12635f6655b7"
down_revision = "58c50ef19f08"
branch_labels = None
depends_on = None
SKIP_CANON_DRIVE_IDS = os.environ.get("SKIP_CANON_DRIVE_IDS", "true").lower() == "true"
def active_search_settings() -> tuple[SearchSettings, SearchSettings | None]:
result = op.get_bind().execute(
sa.text(
"""
SELECT * FROM search_settings WHERE status = 'PRESENT' ORDER BY id DESC LIMIT 1
"""
)
)
search_settings_fetch = result.fetchall()
search_settings = (
SearchSettings(**search_settings_fetch[0]._asdict())
if search_settings_fetch
else None
)
result2 = op.get_bind().execute(
sa.text(
"""
SELECT * FROM search_settings WHERE status = 'FUTURE' ORDER BY id DESC LIMIT 1
"""
)
)
search_settings_future_fetch = result2.fetchall()
search_settings_future = (
SearchSettings(**search_settings_future_fetch[0]._asdict())
if search_settings_future_fetch
else None
)
if not isinstance(search_settings, SearchSettings):
raise RuntimeError(
"current search settings is of type " + str(type(search_settings))
)
if (
not isinstance(search_settings_future, SearchSettings)
and search_settings_future is not None
):
raise RuntimeError(
"future search settings is of type " + str(type(search_settings_future))
)
return search_settings, search_settings_future
def normalize_google_drive_url(url: str) -> str:
"""Remove query parameters from Google Drive URLs to create canonical document IDs.
NOTE: copied from drive doc_conversion.py
"""
parsed_url = urlparse(url)
parsed_url = parsed_url._replace(query="")
spl_path = parsed_url.path.split("/")
if spl_path and (spl_path[-1] in ["edit", "view", "preview"]):
spl_path.pop()
parsed_url = parsed_url._replace(path="/".join(spl_path))
# Remove query parameters and reconstruct URL
return urlunparse(parsed_url)
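The normalization above strips query parameters and a trailing `edit`/`view`/`preview` path segment so that URL variants collapse to one canonical document ID. A self-contained sketch of the same logic with an illustrative URL (the document ID shown is made up):

```python
from urllib.parse import urlparse, urlunparse

def normalize_drive_url(url: str) -> str:
    # Drop query parameters, then pop a trailing edit/view/preview segment.
    parsed = urlparse(url)._replace(query="")
    parts = parsed.path.split("/")
    if parts and parts[-1] in ("edit", "view", "preview"):
        parts.pop()
    return urlunparse(parsed._replace(path="/".join(parts)))

# Both variants collapse to the same canonical ID (example URL):
assert (
    normalize_drive_url("https://docs.google.com/document/d/abc123/edit?usp=sharing")
    == "https://docs.google.com/document/d/abc123"
)
```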
def get_google_drive_documents_from_database() -> list[dict]:
"""Get all Google Drive documents from the database."""
bind = op.get_bind()
result = bind.execute(
sa.text(
"""
SELECT d.id
FROM document d
JOIN document_by_connector_credential_pair dcc ON d.id = dcc.id
JOIN connector_credential_pair cc ON dcc.connector_id = cc.connector_id
AND dcc.credential_id = cc.credential_id
JOIN connector c ON cc.connector_id = c.id
WHERE c.source = 'GOOGLE_DRIVE'
"""
)
)
documents = []
for row in result:
documents.append({"document_id": row.id})
return documents
def update_document_id_in_database(
old_doc_id: str, new_doc_id: str, index_name: str
) -> None:
"""Update document IDs in all relevant database tables using copy-and-swap approach."""
bind = op.get_bind()
# print(f"Updating database tables for document {old_doc_id} -> {new_doc_id}")
# Check if new document ID already exists
result = bind.execute(
sa.text("SELECT COUNT(*) FROM document WHERE id = :new_id"),
{"new_id": new_doc_id},
)
row = result.fetchone()
if row and row[0] > 0:
# print(f"Document with ID {new_doc_id} already exists, deleting old one")
delete_document_from_db(old_doc_id, index_name)
return
# Step 1: Create a new document row with the new ID (copy all fields from old row)
# Use a conservative approach to handle columns that might not exist in all installations
try:
bind.execute(
sa.text(
"""
INSERT INTO document (id, from_ingestion_api, boost, hidden, semantic_id,
link, doc_updated_at, primary_owners, secondary_owners,
external_user_emails, external_user_group_ids, is_public,
chunk_count, last_modified, last_synced, kg_stage, kg_processing_time)
SELECT :new_id, from_ingestion_api, boost, hidden, semantic_id,
link, doc_updated_at, primary_owners, secondary_owners,
external_user_emails, external_user_group_ids, is_public,
chunk_count, last_modified, last_synced, kg_stage, kg_processing_time
FROM document
WHERE id = :old_id
"""
),
{"new_id": new_doc_id, "old_id": old_doc_id},
)
# print(f"Successfully updated database tables for document {old_doc_id} -> {new_doc_id}")
except Exception as e:
# If the full INSERT fails, try a more basic version with only core columns
logger.warning(f"Full INSERT failed, trying basic version: {e}")
bind.execute(
sa.text(
"""
INSERT INTO document (id, from_ingestion_api, boost, hidden, semantic_id,
link, doc_updated_at, primary_owners, secondary_owners)
SELECT :new_id, from_ingestion_api, boost, hidden, semantic_id,
link, doc_updated_at, primary_owners, secondary_owners
FROM document
WHERE id = :old_id
"""
),
{"new_id": new_doc_id, "old_id": old_doc_id},
)
# Step 2: Update all foreign key references to point to the new ID
# Update document_by_connector_credential_pair table
bind.execute(
sa.text(
"UPDATE document_by_connector_credential_pair SET id = :new_id WHERE id = :old_id"
),
{"new_id": new_doc_id, "old_id": old_doc_id},
)
# Update search_doc table (stores search results for chat replay)
# This is critical for agent functionality
bind.execute(
sa.text(
"UPDATE search_doc SET document_id = :new_id WHERE document_id = :old_id"
),
{"new_id": new_doc_id, "old_id": old_doc_id},
)
# Update document_retrieval_feedback table (user feedback on documents)
bind.execute(
sa.text(
"UPDATE document_retrieval_feedback SET document_id = :new_id WHERE document_id = :old_id"
),
{"new_id": new_doc_id, "old_id": old_doc_id},
)
# Update document__tag table (document-tag relationships)
bind.execute(
sa.text(
"UPDATE document__tag SET document_id = :new_id WHERE document_id = :old_id"
),
{"new_id": new_doc_id, "old_id": old_doc_id},
)
# Update user_file table (user uploaded files linked to documents)
bind.execute(
sa.text(
"UPDATE user_file SET document_id = :new_id WHERE document_id = :old_id"
),
{"new_id": new_doc_id, "old_id": old_doc_id},
)
# Update KG and chunk_stats tables (these may not exist in all installations)
try:
# Update kg_entity table
bind.execute(
sa.text(
"UPDATE kg_entity SET document_id = :new_id WHERE document_id = :old_id"
),
{"new_id": new_doc_id, "old_id": old_doc_id},
)
# Update kg_entity_extraction_staging table
bind.execute(
sa.text(
"UPDATE kg_entity_extraction_staging SET document_id = :new_id WHERE document_id = :old_id"
),
{"new_id": new_doc_id, "old_id": old_doc_id},
)
# Update kg_relationship table
bind.execute(
sa.text(
"UPDATE kg_relationship SET source_document = :new_id WHERE source_document = :old_id"
),
{"new_id": new_doc_id, "old_id": old_doc_id},
)
# Update kg_relationship_extraction_staging table
bind.execute(
sa.text(
"UPDATE kg_relationship_extraction_staging SET source_document = :new_id WHERE source_document = :old_id"
),
{"new_id": new_doc_id, "old_id": old_doc_id},
)
# Update chunk_stats table
bind.execute(
sa.text(
"UPDATE chunk_stats SET document_id = :new_id WHERE document_id = :old_id"
),
{"new_id": new_doc_id, "old_id": old_doc_id},
)
# Update chunk_stats ID field which includes document_id
bind.execute(
sa.text(
"""
UPDATE chunk_stats
SET id = REPLACE(id, :old_id, :new_id)
WHERE id LIKE :old_id_pattern
"""
),
{
"new_id": new_doc_id,
"old_id": old_doc_id,
"old_id_pattern": f"{old_doc_id}__%",
},
)
except Exception as e:
logger.warning(f"Some KG/chunk tables may not exist or failed to update: {e}")
# Step 3: Delete the old document row (this should now be safe since all FKs point to new row)
bind.execute(
sa.text("DELETE FROM document WHERE id = :old_id"), {"old_id": old_doc_id}
)
def _visit_chunks(
*,
http_client: httpx.Client,
index_name: str,
selection: str,
continuation: str | None = None,
) -> tuple[list[dict], str | None]:
"""Helper that calls the /document/v1 visit API once and returns (docs, next_token)."""
# Use the same URL as the document API, but with visit-specific params
base_url = DOCUMENT_ID_ENDPOINT.format(index_name=index_name)
params: dict[str, str] = {
"selection": selection,
"wantedDocumentCount": "1000",
}
if continuation:
params["continuation"] = continuation
resp = http_client.get(base_url, params=params, timeout=None)
resp.raise_for_status()
payload = resp.json()
return payload.get("documents", []), payload.get("continuation")
def delete_document_chunks_from_vespa(index_name: str, doc_id: str) -> None:
"""Delete all chunks for *doc_id* from Vespa using continuation-token paging (no offset)."""
total_deleted = 0
# Use exact match instead of contains - Document Selector Language doesn't support contains
selection = f'{index_name}.document_id=="{doc_id}"'
with get_vespa_http_client() as http_client:
continuation: str | None = None
while True:
docs, continuation = _visit_chunks(
http_client=http_client,
index_name=index_name,
selection=selection,
continuation=continuation,
)
if not docs:
break
for doc in docs:
vespa_full_id = doc.get("id")
if not vespa_full_id:
continue
vespa_doc_uuid = vespa_full_id.split("::")[-1]
delete_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{vespa_doc_uuid}"
try:
resp = http_client.delete(delete_url)
resp.raise_for_status()
total_deleted += 1
except Exception as e:
logger.warning(f"Failed to delete chunk {vespa_doc_uuid}: {e}")
if not continuation:
break
def update_document_id_in_vespa(
index_name: str, old_doc_id: str, new_doc_id: str
) -> None:
"""Update all chunks' document_id field from *old_doc_id* to *new_doc_id* using continuation paging."""
clean_new_doc_id = replace_invalid_doc_id_characters(new_doc_id)
# Use exact match instead of contains - Document Selector Language doesn't support contains
selection = f'{index_name}.document_id=="{old_doc_id}"'
with get_vespa_http_client() as http_client:
continuation: str | None = None
while True:
docs, continuation = _visit_chunks(
http_client=http_client,
index_name=index_name,
selection=selection,
continuation=continuation,
)
if not docs:
break
for doc in docs:
vespa_full_id = doc.get("id")
if not vespa_full_id:
continue
vespa_doc_uuid = vespa_full_id.split("::")[-1]
vespa_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{vespa_doc_uuid}"
update_request = {
"fields": {"document_id": {"assign": clean_new_doc_id}}
}
try:
resp = http_client.put(vespa_url, json=update_request)
resp.raise_for_status()
except Exception as e:
logger.error(f"Failed to update chunk {vespa_doc_uuid}: {e}")
raise
if not continuation:
break
def delete_document_from_db(current_doc_id: str, index_name: str) -> None:
# Delete all foreign key references first, then delete the document
try:
bind = op.get_bind()
# Delete from agent-related tables first (order matters due to foreign keys)
# Delete from agent__sub_query__search_doc first since it references search_doc
bind.execute(
sa.text(
"""
DELETE FROM agent__sub_query__search_doc
WHERE search_doc_id IN (
SELECT id FROM search_doc WHERE document_id = :doc_id
)
"""
),
{"doc_id": current_doc_id},
)
# Delete from chat_message__search_doc
bind.execute(
sa.text(
"""
DELETE FROM chat_message__search_doc
WHERE search_doc_id IN (
SELECT id FROM search_doc WHERE document_id = :doc_id
)
"""
),
{"doc_id": current_doc_id},
)
# Now we can safely delete from search_doc
bind.execute(
sa.text("DELETE FROM search_doc WHERE document_id = :doc_id"),
{"doc_id": current_doc_id},
)
# Delete from document_by_connector_credential_pair
bind.execute(
sa.text(
"DELETE FROM document_by_connector_credential_pair WHERE id = :doc_id"
),
{"doc_id": current_doc_id},
)
# Delete from other tables that reference this document
bind.execute(
sa.text(
"DELETE FROM document_retrieval_feedback WHERE document_id = :doc_id"
),
{"doc_id": current_doc_id},
)
bind.execute(
sa.text("DELETE FROM document__tag WHERE document_id = :doc_id"),
{"doc_id": current_doc_id},
)
bind.execute(
sa.text("DELETE FROM user_file WHERE document_id = :doc_id"),
{"doc_id": current_doc_id},
)
# Delete from KG tables if they exist
try:
bind.execute(
sa.text("DELETE FROM kg_entity WHERE document_id = :doc_id"),
{"doc_id": current_doc_id},
)
bind.execute(
sa.text(
"DELETE FROM kg_entity_extraction_staging WHERE document_id = :doc_id"
),
{"doc_id": current_doc_id},
)
bind.execute(
sa.text("DELETE FROM kg_relationship WHERE source_document = :doc_id"),
{"doc_id": current_doc_id},
)
bind.execute(
sa.text(
"DELETE FROM kg_relationship_extraction_staging WHERE source_document = :doc_id"
),
{"doc_id": current_doc_id},
)
bind.execute(
sa.text("DELETE FROM chunk_stats WHERE document_id = :doc_id"),
{"doc_id": current_doc_id},
)
bind.execute(
sa.text("DELETE FROM chunk_stats WHERE id LIKE :doc_id_pattern"),
{"doc_id_pattern": f"{current_doc_id}__%"},
)
except Exception as e:
logger.warning(
f"Some KG/chunk tables may not exist or failed to delete from: {e}"
)
# Finally delete the document itself
bind.execute(
sa.text("DELETE FROM document WHERE id = :doc_id"),
{"doc_id": current_doc_id},
)
# Delete chunks from vespa
delete_document_chunks_from_vespa(index_name, current_doc_id)
except Exception as e:
logger.error(f"Failed to delete duplicate document {current_doc_id}: {e}")
# Continue with other documents instead of failing the entire migration
def upgrade() -> None:
if SKIP_CANON_DRIVE_IDS:
return
current_search_settings, future_search_settings = active_search_settings()
document_index = get_default_document_index(
current_search_settings,
future_search_settings,
)
# Get the index name
if hasattr(document_index, "index_name"):
index_name = document_index.index_name
else:
# Default index name if we can't get it from the document_index
index_name = "danswer_index"
# Get all Google Drive documents from the database (faster and more reliable than visiting Vespa)
gdrive_documents = get_google_drive_documents_from_database()
if not gdrive_documents:
return
# Track normalized document IDs to detect duplicates
all_normalized_doc_ids = set()
updated_count = 0
for doc_info in gdrive_documents:
current_doc_id = doc_info["document_id"]
normalized_doc_id = normalize_google_drive_url(current_doc_id)
logger.info(f"Processing document {current_doc_id} -> {normalized_doc_id}")
# Check for duplicates
if normalized_doc_id in all_normalized_doc_ids:
delete_document_from_db(current_doc_id, index_name)
continue
all_normalized_doc_ids.add(normalized_doc_id)
# If the document ID already doesn't have query parameters, skip it
if current_doc_id == normalized_doc_id:
continue
try:
# Update both database and Vespa in order
# Database first to ensure consistency
update_document_id_in_database(
current_doc_id, normalized_doc_id, index_name
)
# Then rewrite the Vespa chunks' document_id to the normalized ID (exact-match selection on the old ID)
update_document_id_in_vespa(index_name, current_doc_id, normalized_doc_id)
updated_count += 1
except Exception as e:
logger.error(f"Failed to update document {current_doc_id}: {e}")
if isinstance(e, HTTPStatusError):
logger.error(f"Status: {e.response.status_code}")
logger.error(f"Response: {e.response.text}")
logger.error(f"Headers: {e.response.headers}")
logger.error(f"Request: {e.request.url}")
# Note: Rollback is complex with copy-and-swap approach since the old document is already deleted
# In case of failure, manual intervention may be required
# Continue with other documents instead of failing the entire migration
continue
logger.info(f"Migration complete. Updated {updated_count} Google Drive documents")
def downgrade() -> None:
# This is a one-way migration, so there is no downgrade.
# It wouldn't make sense to store the stripped query parameters
# and deleted duplicate documents just to allow a reversal.
pass
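The normalization driving this migration relies on `normalize_google_drive_url`, which is defined earlier in the file. As a rough sketch of what that canonicalization does (an assumption based on how it is used here; the real helper may differ), stripping the query string and fragment is enough to collapse duplicate Drive URLs onto one document ID:

```python
from urllib.parse import urlsplit, urlunsplit


def normalize_google_drive_url(doc_id: str) -> str:
    # Hypothetical sketch: drop the query string and fragment so that
    # ".../edit?usp=sharing" and ".../edit" map to the same document ID.
    parts = urlsplit(doc_id)
    return urlunsplit((parts.scheme, parts.netloc, parts.path, "", ""))
```

With that behavior, `current_doc_id == normalized_doc_id` holds exactly for IDs that carry no query parameters, which is the skip condition in the upgrade loop.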

View File

@@ -144,27 +144,34 @@ def upgrade() -> None:
def downgrade() -> None:
op.execute("TRUNCATE TABLE index_attempt")
op.add_column(
"index_attempt",
sa.Column("input_type", sa.VARCHAR(), autoincrement=False, nullable=False),
)
op.add_column(
"index_attempt",
sa.Column("source", sa.VARCHAR(), autoincrement=False, nullable=False),
)
op.add_column(
"index_attempt",
sa.Column(
"connector_specific_config",
postgresql.JSONB(astext_type=sa.Text()),
autoincrement=False,
nullable=False,
),
)
# Check which columns already exist before re-adding them
conn = op.get_bind()
inspector = sa.inspect(conn)
existing_columns = {col["name"] for col in inspector.get_columns("index_attempt")}
if "input_type" not in existing_columns:
op.add_column(
"index_attempt",
sa.Column("input_type", sa.VARCHAR(), autoincrement=False, nullable=False),
)
if "source" not in existing_columns:
op.add_column(
"index_attempt",
sa.Column("source", sa.VARCHAR(), autoincrement=False, nullable=False),
)
if "connector_specific_config" not in existing_columns:
op.add_column(
"index_attempt",
sa.Column(
"connector_specific_config",
postgresql.JSONB(astext_type=sa.Text()),
autoincrement=False,
nullable=False,
),
)
# Check if the constraint exists before dropping
constraints = inspector.get_foreign_keys("index_attempt")
if any(
@@ -183,8 +190,12 @@ def downgrade() -> None:
"fk_index_attempt_connector_id", "index_attempt", type_="foreignkey"
)
op.drop_column("index_attempt", "credential_id")
op.drop_column("index_attempt", "connector_id")
op.drop_table("connector_credential_pair")
op.drop_table("credential")
op.drop_table("connector")
if "credential_id" in existing_columns:
op.drop_column("index_attempt", "credential_id")
if "connector_id" in existing_columns:
op.drop_column("index_attempt", "connector_id")
op.execute("DROP TABLE IF EXISTS connector_credential_pair CASCADE")
op.execute("DROP TABLE IF EXISTS credential CASCADE")
op.execute("DROP TABLE IF EXISTS connector CASCADE")

View File

@@ -0,0 +1,115 @@
"""add_indexing_coordination
Revision ID: 2f95e36923e6
Revises: 0816326d83aa
Create Date: 2025-07-10 16:17:57.762182
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "2f95e36923e6"
down_revision = "0816326d83aa"
branch_labels = None
depends_on = None
def upgrade() -> None:
# Add database-based coordination fields (replacing Redis fencing)
op.add_column(
"index_attempt", sa.Column("celery_task_id", sa.String(), nullable=True)
)
op.add_column(
"index_attempt",
sa.Column(
"cancellation_requested",
sa.Boolean(),
nullable=False,
server_default="false",
),
)
# Add batch coordination fields (replacing FileStore state)
op.add_column(
"index_attempt", sa.Column("total_batches", sa.Integer(), nullable=True)
)
op.add_column(
"index_attempt",
sa.Column(
"completed_batches", sa.Integer(), nullable=False, server_default="0"
),
)
op.add_column(
"index_attempt",
sa.Column(
"total_failures_batch_level",
sa.Integer(),
nullable=False,
server_default="0",
),
)
op.add_column(
"index_attempt",
sa.Column("total_chunks", sa.Integer(), nullable=False, server_default="0"),
)
# Progress tracking for stall detection
op.add_column(
"index_attempt",
sa.Column("last_progress_time", sa.DateTime(timezone=True), nullable=True),
)
op.add_column(
"index_attempt",
sa.Column(
"last_batches_completed_count",
sa.Integer(),
nullable=False,
server_default="0",
),
)
# Heartbeat tracking for worker liveness detection
op.add_column(
"index_attempt",
sa.Column(
"heartbeat_counter", sa.Integer(), nullable=False, server_default="0"
),
)
op.add_column(
"index_attempt",
sa.Column(
"last_heartbeat_value", sa.Integer(), nullable=False, server_default="0"
),
)
op.add_column(
"index_attempt",
sa.Column("last_heartbeat_time", sa.DateTime(timezone=True), nullable=True),
)
# Add index for coordination queries
op.create_index(
"ix_index_attempt_active_coordination",
"index_attempt",
["connector_credential_pair_id", "search_settings_id", "status"],
)
def downgrade() -> None:
# Remove the new index
op.drop_index("ix_index_attempt_active_coordination", table_name="index_attempt")
# Remove the new columns
op.drop_column("index_attempt", "last_batches_completed_count")
op.drop_column("index_attempt", "last_progress_time")
op.drop_column("index_attempt", "last_heartbeat_time")
op.drop_column("index_attempt", "last_heartbeat_value")
op.drop_column("index_attempt", "heartbeat_counter")
op.drop_column("index_attempt", "total_chunks")
op.drop_column("index_attempt", "total_failures_batch_level")
op.drop_column("index_attempt", "completed_batches")
op.drop_column("index_attempt", "total_batches")
op.drop_column("index_attempt", "cancellation_requested")
op.drop_column("index_attempt", "celery_task_id")

View File

@@ -0,0 +1,136 @@
"""update_kg_trigger_functions
Revision ID: 36e9220ab794
Revises: c9e2cd766c29
Create Date: 2025-06-22 17:33:25.833733
"""
from alembic import op
from sqlalchemy.orm import Session
from sqlalchemy import text
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
# revision identifiers, used by Alembic.
revision = "36e9220ab794"
down_revision = "c9e2cd766c29"
branch_labels = None
depends_on = None
def _get_tenant_contextvar(session: Session) -> str:
"""Get the current schema for the migration"""
current_tenant = session.execute(text("SELECT current_schema()")).scalar()
if isinstance(current_tenant, str):
return current_tenant
else:
raise ValueError("Current tenant is not a string")
def upgrade() -> None:
bind = op.get_bind()
session = Session(bind=bind)
# Create kg_entity trigger to update kg_entity.name and its trigrams
tenant_id = _get_tenant_contextvar(session)
alphanum_pattern = r"[^a-z0-9]+"
truncate_length = 1000
function = "update_kg_entity_name"
op.execute(
text(
f"""
CREATE OR REPLACE FUNCTION "{tenant_id}".{function}()
RETURNS TRIGGER AS $$
DECLARE
name text;
cleaned_name text;
BEGIN
-- Set name to semantic_id if document_id is not NULL
IF NEW.document_id IS NOT NULL THEN
SELECT lower(semantic_id) INTO name
FROM "{tenant_id}".document
WHERE id = NEW.document_id;
ELSE
name = lower(NEW.name);
END IF;
-- Clean name and truncate if too long
cleaned_name = regexp_replace(
name,
'{alphanum_pattern}', '', 'g'
);
IF length(cleaned_name) > {truncate_length} THEN
cleaned_name = left(cleaned_name, {truncate_length});
END IF;
-- Set name and name trigrams
NEW.name = name;
NEW.name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name);
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
"""
)
)
trigger = f"{function}_trigger"
op.execute(f'DROP TRIGGER IF EXISTS {trigger} ON "{tenant_id}".kg_entity')
op.execute(
f"""
CREATE TRIGGER {trigger}
BEFORE INSERT OR UPDATE OF name
ON "{tenant_id}".kg_entity
FOR EACH ROW
EXECUTE FUNCTION "{tenant_id}".{function}();
"""
)
# Create document trigger to propagate semantic_id changes to kg_entity names and trigrams
function = "update_kg_entity_name_from_doc"
op.execute(
text(
f"""
CREATE OR REPLACE FUNCTION "{tenant_id}".{function}()
RETURNS TRIGGER AS $$
DECLARE
doc_name text;
cleaned_name text;
BEGIN
doc_name = lower(NEW.semantic_id);
-- Clean name and truncate if too long
cleaned_name = regexp_replace(
doc_name,
'{alphanum_pattern}', '', 'g'
);
IF length(cleaned_name) > {truncate_length} THEN
cleaned_name = left(cleaned_name, {truncate_length});
END IF;
-- Set name and name trigrams for all entities referencing this document
UPDATE "{tenant_id}".kg_entity
SET
name = doc_name,
name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name)
WHERE document_id = NEW.id;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
"""
)
)
trigger = f"{function}_trigger"
op.execute(f'DROP TRIGGER IF EXISTS {trigger} ON "{tenant_id}".document')
op.execute(
f"""
CREATE TRIGGER {trigger}
AFTER UPDATE OF semantic_id
ON "{tenant_id}".document
FOR EACH ROW
EXECUTE FUNCTION "{tenant_id}".{function}();
"""
)
def downgrade() -> None:
pass
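Both trigger functions above apply the same cleaning before computing trigrams: lowercase, strip everything outside `[a-z0-9]`, then truncate to 1000 characters. For reference, the equivalent transformation in Python (the PL/pgSQL above is authoritative; this is just a readable restatement of the `regexp_replace`/`left()` sequence):

```python
import re

ALPHANUM_PATTERN = r"[^a-z0-9]+"  # same pattern the migration interpolates
TRUNCATE_LENGTH = 1000


def clean_for_trigrams(name: str) -> str:
    # Lowercase, drop non-alphanumeric runs, then truncate, mirroring
    # the cleaning done in both trigger bodies before show_trgm().
    cleaned = re.sub(ALPHANUM_PATTERN, "", name.lower())
    return cleaned[:TRUNCATE_LENGTH]
```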

View File

@@ -21,22 +21,14 @@ depends_on = None
# an outage by creating an index without using CONCURRENTLY. This migration:
#
# 1. Creates more efficient full-text search capabilities using tsvector columns and GIN indexes
# 2. Uses CONCURRENTLY for all index creation to prevent table locking
# 3. Explicitly manages transactions with COMMIT statements to allow CONCURRENTLY to work
# (see: https://www.postgresql.org/docs/9.4/sql-createindex.html#SQL-CREATEINDEX-CONCURRENTLY)
# (see: https://github.com/sqlalchemy/alembic/issues/277)
# 4. Adds indexes to both chat_message and chat_session tables for comprehensive search
# 2. Adds indexes to both chat_message and chat_session tables for comprehensive search
# 3. Note: CONCURRENTLY was removed due to operational issues
def upgrade() -> None:
# First, drop any existing indexes to avoid conflicts
op.execute("COMMIT")
op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_chat_message_tsv;")
op.execute("COMMIT")
op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_chat_session_desc_tsv;")
op.execute("COMMIT")
op.execute("DROP INDEX IF EXISTS idx_chat_message_tsv;")
op.execute("DROP INDEX IF EXISTS idx_chat_session_desc_tsv;")
op.execute("DROP INDEX IF EXISTS idx_chat_message_message_lower;")
# Drop existing columns if they exist
@@ -52,12 +44,9 @@ def upgrade() -> None:
"""
)
# Commit the current transaction before creating concurrent indexes
op.execute("COMMIT")
op.execute(
"""
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_message_tsv
CREATE INDEX IF NOT EXISTS idx_chat_message_tsv
ON chat_message
USING GIN (message_tsv)
"""
@@ -72,12 +61,9 @@ def upgrade() -> None:
"""
)
# Commit again before creating the second concurrent index
op.execute("COMMIT")
op.execute(
"""
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_session_desc_tsv
CREATE INDEX IF NOT EXISTS idx_chat_session_desc_tsv
ON chat_session
USING GIN (description_tsv)
"""
@@ -85,12 +71,9 @@ def upgrade() -> None:
def downgrade() -> None:
# Drop the indexes first (use CONCURRENTLY for dropping too)
op.execute("COMMIT")
op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_chat_message_tsv;")
op.execute("COMMIT")
op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_chat_session_desc_tsv;")
# Drop the indexes first
op.execute("DROP INDEX IF EXISTS idx_chat_message_tsv;")
op.execute("DROP INDEX IF EXISTS idx_chat_session_desc_tsv;")
# Then drop the columns
op.execute("ALTER TABLE chat_message DROP COLUMN IF EXISTS message_tsv;")

View File

@@ -15,7 +15,7 @@ from datetime import datetime, timedelta
from onyx.configs.app_configs import DB_READONLY_USER
from onyx.configs.app_configs import DB_READONLY_PASSWORD
from shared_configs.configs import MULTI_TENANT
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
# revision identifiers, used by Alembic.
@@ -80,6 +80,7 @@ def upgrade() -> None:
)
)
op.execute("DROP TABLE IF EXISTS kg_config CASCADE")
op.create_table(
"kg_config",
sa.Column("id", sa.Integer(), primary_key=True, nullable=False, index=True),
@@ -123,6 +124,7 @@ def upgrade() -> None:
],
)
op.execute("DROP TABLE IF EXISTS kg_entity_type CASCADE")
op.create_table(
"kg_entity_type",
sa.Column("id_name", sa.String(), primary_key=True, nullable=False, index=True),
@@ -156,6 +158,7 @@ def upgrade() -> None:
),
)
op.execute("DROP TABLE IF EXISTS kg_relationship_type CASCADE")
# Create KGRelationshipType table
op.create_table(
"kg_relationship_type",
@@ -194,6 +197,7 @@ def upgrade() -> None:
),
)
op.execute("DROP TABLE IF EXISTS kg_relationship_type_extraction_staging CASCADE")
# Create KGRelationshipTypeExtractionStaging table
op.create_table(
"kg_relationship_type_extraction_staging",
@@ -227,6 +231,8 @@ def upgrade() -> None:
),
)
op.execute("DROP TABLE IF EXISTS kg_entity CASCADE")
# Create KGEntity table
op.create_table(
"kg_entity",
@@ -281,6 +287,7 @@ def upgrade() -> None:
"ix_entity_name_search", "kg_entity", ["name", "entity_type_id_name"]
)
op.execute("DROP TABLE IF EXISTS kg_entity_extraction_staging CASCADE")
# Create KGEntityExtractionStaging table
op.create_table(
"kg_entity_extraction_staging",
@@ -330,6 +337,7 @@ def upgrade() -> None:
["name", "entity_type_id_name"],
)
op.execute("DROP TABLE IF EXISTS kg_relationship CASCADE")
# Create KGRelationship table
op.create_table(
"kg_relationship",
@@ -371,6 +379,7 @@ def upgrade() -> None:
"ix_kg_relationship_nodes", "kg_relationship", ["source_node", "target_node"]
)
op.execute("DROP TABLE IF EXISTS kg_relationship_extraction_staging CASCADE")
# Create KGRelationshipExtractionStaging table
op.create_table(
"kg_relationship_extraction_staging",
@@ -414,6 +423,7 @@ def upgrade() -> None:
["source_node", "target_node"],
)
op.execute("DROP TABLE IF EXISTS kg_term CASCADE")
# Create KGTerm table
op.create_table(
"kg_term",
@@ -467,11 +477,11 @@ def upgrade() -> None:
# Create GIN index for clustering and normalization
op.execute(
"CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_kg_entity_clustering_trigrams "
f"ON kg_entity USING GIN (name {POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE}.gin_trgm_ops)"
"CREATE INDEX IF NOT EXISTS idx_kg_entity_clustering_trigrams "
f"ON kg_entity USING GIN (name {POSTGRES_DEFAULT_SCHEMA}.gin_trgm_ops)"
)
op.execute(
"CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_kg_entity_normalization_trigrams "
"CREATE INDEX IF NOT EXISTS idx_kg_entity_normalization_trigrams "
"ON kg_entity USING GIN (name_trigrams)"
)
@@ -508,7 +518,7 @@ def upgrade() -> None:
-- Set name and name trigrams
NEW.name = name;
NEW.name_trigrams = {POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE}.show_trgm(cleaned_name);
NEW.name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name);
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
@@ -553,7 +563,7 @@ def upgrade() -> None:
UPDATE kg_entity
SET
name = doc_name,
name_trigrams = {POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE}.show_trgm(cleaned_name)
name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name)
WHERE document_id = NEW.id;
RETURN NEW;
END;
@@ -625,9 +635,8 @@ def downgrade() -> None:
op.execute(f"DROP FUNCTION IF EXISTS {function}()")
# Drop index
op.execute("COMMIT") # Commit to allow CONCURRENTLY
op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_kg_entity_clustering_trigrams")
op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_kg_entity_normalization_trigrams")
op.execute("DROP INDEX IF EXISTS idx_kg_entity_clustering_trigrams")
op.execute("DROP INDEX IF EXISTS idx_kg_entity_normalization_trigrams")
# Drop tables in reverse order of creation to handle dependencies
op.drop_table("kg_term")

View File

@@ -0,0 +1,90 @@
"""add stale column to external user group tables
Revision ID: 58c50ef19f08
Revises: 7b9b952abdf6
Create Date: 2025-06-25 14:08:14.162380
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "58c50ef19f08"
down_revision = "7b9b952abdf6"
branch_labels = None
depends_on = None
def upgrade() -> None:
# Add the stale column with default value False to user__external_user_group_id
op.add_column(
"user__external_user_group_id",
sa.Column("stale", sa.Boolean(), nullable=False, server_default="false"),
)
# Create index for efficient querying of stale rows by cc_pair_id
op.create_index(
"ix_user__external_user_group_id_cc_pair_id_stale",
"user__external_user_group_id",
["cc_pair_id", "stale"],
unique=False,
)
# Create index for efficient querying of all stale rows
op.create_index(
"ix_user__external_user_group_id_stale",
"user__external_user_group_id",
["stale"],
unique=False,
)
# Add the stale column with default value False to public_external_user_group
op.add_column(
"public_external_user_group",
sa.Column("stale", sa.Boolean(), nullable=False, server_default="false"),
)
# Create index for efficient querying of stale rows by cc_pair_id
op.create_index(
"ix_public_external_user_group_cc_pair_id_stale",
"public_external_user_group",
["cc_pair_id", "stale"],
unique=False,
)
# Create index for efficient querying of all stale rows
op.create_index(
"ix_public_external_user_group_stale",
"public_external_user_group",
["stale"],
unique=False,
)
def downgrade() -> None:
# Drop the indices for public_external_user_group first
op.drop_index(
"ix_public_external_user_group_stale", table_name="public_external_user_group"
)
op.drop_index(
"ix_public_external_user_group_cc_pair_id_stale",
table_name="public_external_user_group",
)
# Drop the stale column from public_external_user_group
op.drop_column("public_external_user_group", "stale")
# Drop the indices for user__external_user_group_id
op.drop_index(
"ix_user__external_user_group_id_stale",
table_name="user__external_user_group_id",
)
op.drop_index(
"ix_user__external_user_group_id_cc_pair_id_stale",
table_name="user__external_user_group_id",
)
# Drop the stale column from user__external_user_group_id
op.drop_column("user__external_user_group_id", "stale")
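The `stale` column enables a mark-and-sweep style sync: mark a cc_pair's rows stale, upsert the freshly fetched groups as non-stale, then delete whatever is still marked. A toy sketch of the set arithmetic involved (illustrative only; the actual sync lives in the group-sync tasks, not in this migration):

```python
def plan_group_sync(
    db_group_ids: set[str], fetched_group_ids: set[str]
) -> tuple[set[str], set[str]]:
    # Rows fetched from the source get upserted with stale=false;
    # rows present only in the DB stay stale and are swept afterwards.
    to_upsert = fetched_group_ids
    to_delete = db_group_ids - fetched_group_ids
    return to_upsert, to_delete
```

The `(cc_pair_id, stale)` indexes added above make both halves of that sweep cheap to query.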

View File

@@ -0,0 +1,318 @@
"""update-entities
Revision ID: 7b9b952abdf6
Revises: 36e9220ab794
Create Date: 2025-06-23 20:24:08.139201
"""
import json
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "7b9b952abdf6"
down_revision = "36e9220ab794"
branch_labels = None
depends_on = None
def upgrade() -> None:
conn = op.get_bind()
# new entity type metadata_attribute_conversion
new_entity_type_conversion = {
"LINEAR": {
"team": {"name": "team", "keep": True, "implication_property": None},
"state": {"name": "state", "keep": True, "implication_property": None},
"priority": {
"name": "priority",
"keep": True,
"implication_property": None,
},
"estimate": {
"name": "estimate",
"keep": True,
"implication_property": None,
},
"created_at": {
"name": "created_at",
"keep": True,
"implication_property": None,
},
"started_at": {
"name": "started_at",
"keep": True,
"implication_property": None,
},
"completed_at": {
"name": "completed_at",
"keep": True,
"implication_property": None,
},
"due_date": {
"name": "due_date",
"keep": True,
"implication_property": None,
},
"creator": {
"name": "creator",
"keep": False,
"implication_property": {
"implied_entity_type": "from_email",
"implied_relationship_name": "is_creator_of",
},
},
"assignee": {
"name": "assignee",
"keep": False,
"implication_property": {
"implied_entity_type": "from_email",
"implied_relationship_name": "is_assignee_of",
},
},
},
"JIRA": {
"issuetype": {
"name": "subtype",
"keep": True,
"implication_property": None,
},
"status": {"name": "status", "keep": True, "implication_property": None},
"priority": {
"name": "priority",
"keep": True,
"implication_property": None,
},
"project_name": {
"name": "project",
"keep": True,
"implication_property": None,
},
"created": {
"name": "created_at",
"keep": True,
"implication_property": None,
},
"updated": {
"name": "updated_at",
"keep": True,
"implication_property": None,
},
"resolution_date": {
"name": "completed_at",
"keep": True,
"implication_property": None,
},
"duedate": {"name": "due_date", "keep": True, "implication_property": None},
"reporter_email": {
"name": "creator",
"keep": False,
"implication_property": {
"implied_entity_type": "from_email",
"implied_relationship_name": "is_creator_of",
},
},
"assignee_email": {
"name": "assignee",
"keep": False,
"implication_property": {
"implied_entity_type": "from_email",
"implied_relationship_name": "is_assignee_of",
},
},
"key": {"name": "key", "keep": True, "implication_property": None},
"parent": {"name": "parent", "keep": True, "implication_property": None},
},
"GITHUB_PR": {
"repo": {"name": "repository", "keep": True, "implication_property": None},
"state": {"name": "state", "keep": True, "implication_property": None},
"num_commits": {
"name": "num_commits",
"keep": True,
"implication_property": None,
},
"num_files_changed": {
"name": "num_files_changed",
"keep": True,
"implication_property": None,
},
"labels": {"name": "labels", "keep": True, "implication_property": None},
"merged": {"name": "merged", "keep": True, "implication_property": None},
"merged_at": {
"name": "merged_at",
"keep": True,
"implication_property": None,
},
"closed_at": {
"name": "closed_at",
"keep": True,
"implication_property": None,
},
"created_at": {
"name": "created_at",
"keep": True,
"implication_property": None,
},
"updated_at": {
"name": "updated_at",
"keep": True,
"implication_property": None,
},
"user": {
"name": "creator",
"keep": False,
"implication_property": {
"implied_entity_type": "from_email",
"implied_relationship_name": "is_creator_of",
},
},
"assignees": {
"name": "assignees",
"keep": False,
"implication_property": {
"implied_entity_type": "from_email",
"implied_relationship_name": "is_assignee_of",
},
},
},
"GITHUB_ISSUE": {
"repo": {"name": "repository", "keep": True, "implication_property": None},
"state": {"name": "state", "keep": True, "implication_property": None},
"labels": {"name": "labels", "keep": True, "implication_property": None},
"closed_at": {
"name": "closed_at",
"keep": True,
"implication_property": None,
},
"created_at": {
"name": "created_at",
"keep": True,
"implication_property": None,
},
"updated_at": {
"name": "updated_at",
"keep": True,
"implication_property": None,
},
"user": {
"name": "creator",
"keep": False,
"implication_property": {
"implied_entity_type": "from_email",
"implied_relationship_name": "is_creator_of",
},
},
"assignees": {
"name": "assignees",
"keep": False,
"implication_property": {
"implied_entity_type": "from_email",
"implied_relationship_name": "is_assignee_of",
},
},
},
"FIREFLIES": {},
"ACCOUNT": {},
"OPPORTUNITY": {
"name": {"name": "name", "keep": True, "implication_property": None},
"stage_name": {"name": "stage", "keep": True, "implication_property": None},
"type": {"name": "type", "keep": True, "implication_property": None},
"amount": {"name": "amount", "keep": True, "implication_property": None},
"fiscal_year": {
"name": "fiscal_year",
"keep": True,
"implication_property": None,
},
"fiscal_quarter": {
"name": "fiscal_quarter",
"keep": True,
"implication_property": None,
},
"is_closed": {
"name": "is_closed",
"keep": True,
"implication_property": None,
},
"close_date": {
"name": "close_date",
"keep": True,
"implication_property": None,
},
"probability": {
"name": "close_probability",
"keep": True,
"implication_property": None,
},
"created_date": {
"name": "created_at",
"keep": True,
"implication_property": None,
},
"last_modified_date": {
"name": "updated_at",
"keep": True,
"implication_property": None,
},
"account": {
"name": "account",
"keep": False,
"implication_property": {
"implied_entity_type": "ACCOUNT",
"implied_relationship_name": "is_account_of",
},
},
},
"VENDOR": {},
"EMPLOYEE": {},
}
current_entity_types = conn.execute(
sa.text("SELECT id_name, attributes from kg_entity_type")
).all()
for entity_type, attributes in current_entity_types:
# delete removed entity types
if entity_type not in new_entity_type_conversion:
op.execute(
sa.text(f"DELETE FROM kg_entity_type WHERE id_name = '{entity_type}'")
)
continue
# update entity type attributes
if "metadata_attributes" in attributes:
del attributes["metadata_attributes"]
attributes["metadata_attribute_conversion"] = new_entity_type_conversion[
entity_type
]
attributes_str = json.dumps(attributes).replace("'", "''")
op.execute(
sa.text(
f"UPDATE kg_entity_type SET attributes = '{attributes_str}' "
f"WHERE id_name = '{entity_type}'"
),
)
def downgrade() -> None:
conn = op.get_bind()
current_entity_types = conn.execute(
sa.text("SELECT id_name, attributes from kg_entity_type")
).all()
for entity_type, attributes in current_entity_types:
conversion = {}
if "metadata_attribute_conversion" in attributes:
conversion = attributes.pop("metadata_attribute_conversion")
attributes["metadata_attributes"] = {
attr: prop["name"] for attr, prop in conversion.items() if prop["keep"]
}
attributes_str = json.dumps(attributes).replace("'", "''")
op.execute(
sa.text(
f"UPDATE kg_entity_type SET attributes = '{attributes_str}' "
f"WHERE id_name = '{entity_type}'"
),
)
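
The migration above guards against SQL injection only by doubling embedded single quotes before interpolating the JSON payload into the statement. A sketch of the same update using bound parameters (assuming a SQLAlchemy connection and the same `kg_entity_type` table) sidesteps manual escaping entirely:

```python
import json

import sqlalchemy as sa


def update_attributes(conn, entity_type: str, attributes: dict) -> None:
    # Bound parameters let the driver handle quoting, so no manual
    # .replace("'", "''") escaping of the JSON string is needed.
    conn.execute(
        sa.text(
            "UPDATE kg_entity_type SET attributes = :attrs "
            "WHERE id_name = :id_name"
        ),
        {"attrs": json.dumps(attributes), "id_name": entity_type},
    )
```

Values containing quotes (or other special characters) round-trip unchanged, since the driver never splices them into the SQL text.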

View File

@@ -0,0 +1,315 @@
"""modify_file_store_for_external_storage
Revision ID: c9e2cd766c29
Revises: 03bf8be6b53a
Create Date: 2025-06-13 14:02:09.867679
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.orm import Session
from sqlalchemy import text
from typing import cast, Any
from botocore.exceptions import ClientError
from onyx.db._deprecated.pg_file_store import delete_lobj_by_id, read_lobj
from onyx.file_store.file_store import get_s3_file_store
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
# revision identifiers, used by Alembic.
revision = "c9e2cd766c29"
down_revision = "03bf8be6b53a"
branch_labels = None
depends_on = None
def upgrade() -> None:
try:
# Modify existing file_store table to support external storage
op.rename_table("file_store", "file_record")
# Make lobj_oid nullable (for external storage files)
op.alter_column("file_record", "lobj_oid", nullable=True)
# Add external storage columns with generic names
op.add_column(
"file_record", sa.Column("bucket_name", sa.String(), nullable=True)
)
op.add_column(
"file_record", sa.Column("object_key", sa.String(), nullable=True)
)
# Add timestamps for tracking
op.add_column(
"file_record",
sa.Column(
"created_at",
sa.DateTime(timezone=True),
server_default=sa.func.now(),
nullable=False,
),
)
op.add_column(
"file_record",
sa.Column(
"updated_at",
sa.DateTime(timezone=True),
server_default=sa.func.now(),
nullable=False,
),
)
op.alter_column("file_record", "file_name", new_column_name="file_id")
except Exception as e:
if "does not exist" in str(e) or 'relation "file_store" does not exist' in str(
e
):
print(
f"Ran into error - {e}. Likely means we had a partial success in the past, continuing..."
)
else:
raise
print(
"External storage configured - migrating files from PostgreSQL to external storage..."
)
# if we fail midway through this, we'll have a partial success. Running the migration
# again should allow us to continue.
_migrate_files_to_external_storage()
print("File migration completed successfully!")
# Remove lobj_oid column
op.drop_column("file_record", "lobj_oid")
def downgrade() -> None:
"""Revert schema changes and migrate files from external storage back to PostgreSQL large objects."""
print(
"Reverting to PostgreSQL-backed file store; migrating files from external storage …"
)
# 1. Ensure `lobj_oid` exists on the current `file_record` table (nullable for now).
op.add_column("file_record", sa.Column("lobj_oid", sa.Integer(), nullable=True))
# 2. Move content from external storage back into PostgreSQL large objects (table is still
# called `file_record` so application code continues to work during the copy).
try:
_migrate_files_to_postgres()
except Exception:
print("Error during downgrade migration, rolling back …")
op.drop_column("file_record", "lobj_oid")
raise
# 3. After migration every row should now have `lobj_oid` populated; mark it NOT NULL.
op.alter_column("file_record", "lobj_oid", nullable=False)
# 4. Remove columns that are only relevant to external storage.
op.drop_column("file_record", "updated_at")
op.drop_column("file_record", "created_at")
op.drop_column("file_record", "object_key")
op.drop_column("file_record", "bucket_name")
# 5. Rename `file_id` back to `file_name` (still on `file_record`).
op.alter_column("file_record", "file_id", new_column_name="file_name")
# 6. Finally, rename the table back to its original name expected by the legacy codebase.
op.rename_table("file_record", "file_store")
print(
"Downgrade migration completed; files are now stored inside PostgreSQL again."
)
# -----------------------------------------------------------------------------
# Helper: migrate from external storage (S3/MinIO) back into PostgreSQL large objects
def _migrate_files_to_postgres() -> None:
"""Move any files whose content lives in external S3-compatible storage back into PostgreSQL.
The logic is the *inverse* of `_migrate_files_to_external_storage` used on upgrade.
"""
# Obtain DB session from Alembic context
bind = op.get_bind()
session = Session(bind=bind)
# Fetch rows that have external storage pointers (bucket/object_key not NULL)
result = session.execute(
text(
"SELECT file_id, bucket_name, object_key FROM file_record "
"WHERE bucket_name IS NOT NULL AND object_key IS NOT NULL"
)
)
files_to_migrate = [row[0] for row in result.fetchall()]
total_files = len(files_to_migrate)
if total_files == 0:
print("No files found in external storage to migrate back to PostgreSQL.")
return
print(f"Found {total_files} files to migrate back to PostgreSQL large objects.")
_set_tenant_contextvar(session)
migrated_count = 0
# Only create the external store if we have files to migrate; this line
# means S3/MinIO must be configured in order to run this migration.
external_store = get_s3_file_store()
for i, file_id in enumerate(files_to_migrate, 1):
print(f"Migrating file {i}/{total_files}: {file_id}")
# Read file content from external storage (always binary)
try:
file_io = external_store.read_file(
file_id=file_id, mode="b", use_tempfile=True
)
file_io.seek(0)
# Import lazily to avoid circular deps at Alembic runtime
from onyx.db._deprecated.pg_file_store import (
create_populate_lobj,
) # noqa: E402
# Create new Postgres large object and populate it
lobj_oid = create_populate_lobj(content=file_io, db_session=session)
# Update DB row: set lobj_oid, clear bucket/object_key
session.execute(
text(
"UPDATE file_record SET lobj_oid = :lobj_oid, bucket_name = NULL, "
"object_key = NULL WHERE file_id = :file_id"
),
{"lobj_oid": lobj_oid, "file_id": file_id},
)
except ClientError as e:
if "NoSuchKey" in str(e):
print(
f"File {file_id} not found in external storage. Deleting from database."
)
session.execute(
text("DELETE FROM file_record WHERE file_id = :file_id"),
{"file_id": file_id},
)
else:
raise
migrated_count += 1
print(f"✓ Successfully migrated file {i}/{total_files}: {file_id}")
# Flush the SQLAlchemy session so statements are sent to the DB, but **do not**
# commit the transaction. The surrounding Alembic migration will commit once
# the *entire* downgrade succeeds. This keeps the whole downgrade atomic and
# avoids leaving the database in a partially-migrated state if a later schema
# operation fails.
session.flush()
print(
f"Migration back to PostgreSQL completed: {migrated_count} files staged for commit."
)
def _migrate_files_to_external_storage() -> None:
"""Migrate files from PostgreSQL large objects to external storage"""
# Get database session
bind = op.get_bind()
session = Session(bind=bind)
external_store = get_s3_file_store()
# Find all files currently stored in PostgreSQL (lobj_oid is not null)
result = session.execute(
text(
"SELECT file_id FROM file_record WHERE lobj_oid IS NOT NULL "
"AND bucket_name IS NULL AND object_key IS NULL"
)
)
files_to_migrate = [row[0] for row in result.fetchall()]
total_files = len(files_to_migrate)
if total_files == 0:
print("No files found in PostgreSQL storage to migrate.")
return
# might need to move this above the if statement when creating a new multi-tenant
# system. VERY extreme edge case.
external_store.initialize()
print(f"Found {total_files} files to migrate from PostgreSQL to external storage.")
_set_tenant_contextvar(session)
migrated_count = 0
for i, file_id in enumerate(files_to_migrate, 1):
print(f"Migrating file {i}/{total_files}: {file_id}")
# Read file record to get metadata
file_record = session.execute(
text("SELECT * FROM file_record WHERE file_id = :file_id"),
{"file_id": file_id},
).fetchone()
if file_record is None:
print(f"File {file_id} not found in PostgreSQL storage.")
continue
lobj_id = cast(int, file_record.lobj_oid) # type: ignore
file_metadata = cast(Any, file_record.file_metadata) # type: ignore
# Read file content from PostgreSQL
try:
file_content = read_lobj(
lobj_id, db_session=session, mode="b", use_tempfile=True
)
except Exception as e:
if "large object" in str(e) and "does not exist" in str(e):
print(f"File {file_id} not found in PostgreSQL storage.")
continue
else:
raise
# Handle file_metadata type conversion
if file_metadata is not None and not isinstance(file_metadata, dict):
    # Convert other types to dict if possible, otherwise None
    try:
        file_metadata = dict(file_record.file_metadata)  # type: ignore
    except (TypeError, ValueError):
        file_metadata = None
# Save to external storage (this will handle the database record update and cleanup)
# NOTE: this WILL .commit() the transaction.
external_store.save_file(
file_id=file_id,
content=file_content,
display_name=file_record.display_name,
file_origin=file_record.file_origin,
file_type=file_record.file_type,
file_metadata=file_metadata,
)
delete_lobj_by_id(lobj_id, db_session=session)
migrated_count += 1
print(f"✓ Successfully migrated file {i}/{total_files}: {file_id}")
# See note above flush but do **not** commit so the outer Alembic transaction
# controls atomicity.
session.flush()
print(
f"Migration completed: {migrated_count} files staged for commit to external storage."
)
def _set_tenant_contextvar(session: Session) -> None:
"""Set the tenant contextvar to the default schema"""
current_tenant = session.execute(text("SELECT current_schema()")).scalar()
print(f"Migrating files for tenant: {current_tenant}")
CURRENT_TENANT_ID_CONTEXTVAR.set(current_tenant)
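
Both helpers above deliberately `flush()` without committing, so the surrounding Alembic transaction stays atomic. A minimal sketch (using an in-memory SQLite engine rather than the real schema) of why staged-but-uncommitted work can still be rolled back if a later operation fails:

```python
import sqlalchemy as sa
from sqlalchemy.orm import Session

engine = sa.create_engine("sqlite://")
with engine.begin() as conn:
    conn.execute(sa.text("CREATE TABLE file_record (file_id TEXT, lobj_oid INTEGER)"))
    conn.execute(sa.text("INSERT INTO file_record VALUES ('a', NULL)"))

with Session(engine) as session:
    session.execute(sa.text("UPDATE file_record SET lobj_oid = 1 WHERE file_id = 'a'"))
    session.flush()  # statements are sent to the DB; transaction still open
    # visible inside this transaction...
    assert session.execute(sa.text("SELECT lobj_oid FROM file_record")).scalar() == 1
    session.rollback()  # ...but a later failure can still undo everything

with engine.connect() as conn:
    # the staged update never became durable
    assert conn.execute(sa.text("SELECT lobj_oid FROM file_record")).scalar() is None
```

Committing inside the helper would instead make each batch durable immediately, leaving the database partially migrated if a subsequent schema operation raised.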

View File

@@ -11,7 +11,7 @@ import sqlalchemy as sa
import json
from onyx.configs.constants import DocumentSource
from onyx.connectors.onyx_jira.utils import extract_jira_project
from onyx.connectors.jira.utils import extract_jira_project
# revision identifiers, used by Alembic.

View File

@@ -18,11 +18,13 @@ depends_on: None = None
def upgrade() -> None:
op.execute("DROP TABLE IF EXISTS document CASCADE")
op.create_table(
"document",
sa.Column("id", sa.String(), nullable=False),
sa.PrimaryKeyConstraint("id"),
)
op.execute("DROP TABLE IF EXISTS chunk CASCADE")
op.create_table(
"chunk",
sa.Column("id", sa.String(), nullable=False),
@@ -43,6 +45,7 @@ def upgrade() -> None:
),
sa.PrimaryKeyConstraint("id", "document_store_type"),
)
op.execute("DROP TABLE IF EXISTS deletion_attempt CASCADE")
op.create_table(
"deletion_attempt",
sa.Column("id", sa.Integer(), nullable=False),
@@ -84,6 +87,7 @@ def upgrade() -> None:
),
sa.PrimaryKeyConstraint("id"),
)
op.execute("DROP TABLE IF EXISTS document_by_connector_credential_pair CASCADE")
op.create_table(
"document_by_connector_credential_pair",
sa.Column("id", sa.String(), nullable=False),
@@ -106,7 +110,10 @@ def upgrade() -> None:
def downgrade() -> None:
# upstream tables first
op.drop_table("document_by_connector_credential_pair")
op.drop_table("deletion_attempt")
op.drop_table("chunk")
op.drop_table("document")
# Alembic op.drop_table() has no "cascade" flag; issue raw SQL
op.execute("DROP TABLE IF EXISTS document CASCADE")
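
As the comment notes, `op.drop_table()` cannot emit `CASCADE`, so the downgrade falls back to raw SQL. A tiny hypothetical helper (not part of the codebase) showing the statement shape being issued:

```python
def drop_cascade_sql(table_name: str) -> str:
    # op.execute(drop_cascade_sql("document")) would drop the table and any
    # dependent objects; op.drop_table() alone fails if dependents exist.
    return f'DROP TABLE IF EXISTS "{table_name}" CASCADE'
```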

View File

@@ -8,7 +8,7 @@ from sqlalchemy.ext.asyncio import create_async_engine
from sqlalchemy.schema import SchemaItem
from alembic import context
from onyx.db.engine import build_connection_string
from onyx.db.engine.sql_engine import build_connection_string
from onyx.db.models import PublicBase
# this is the Alembic Config object, which provides

View File

@@ -16,7 +16,7 @@ from onyx.configs.constants import FileOrigin
from onyx.configs.constants import FileType
from onyx.configs.constants import OnyxCeleryTask
from onyx.configs.constants import QueryHistoryType
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.tasks import delete_task_with_id
from onyx.db.tasks import mark_task_as_finished_with_id
from onyx.db.tasks import mark_task_as_started_with_id
@@ -35,7 +35,13 @@ logger = setup_logger()
trail=False,
)
def export_query_history_task(
self: Task, *, start: datetime, end: datetime, start_time: datetime
self: Task,
*,
start: datetime,
end: datetime,
start_time: datetime,
# Need to include the tenant_id since the TenantAwareTask needs this
tenant_id: str,
) -> None:
if not self.request.id:
raise RuntimeError("No task id defined for this task; cannot identify it")
@@ -85,8 +91,7 @@ def export_query_history_task(
with get_session_with_current_tenant() as db_session:
try:
stream.seek(0)
get_default_file_store(db_session).save_file(
file_name=report_name,
get_default_file_store().save_file(
content=stream,
display_name=report_name,
file_origin=FileOrigin.QUERY_HISTORY_CSV,
@@ -96,6 +101,7 @@ def export_query_history_task(
"end": end.isoformat(),
"start_time": start_time.isoformat(),
},
file_id=report_name,
)
delete_task_with_id(

View File

@@ -13,7 +13,7 @@ from onyx.configs.app_configs import JOB_TIMEOUT
from onyx.configs.constants import OnyxCeleryTask
from onyx.db.chat import delete_chat_session
from onyx.db.chat import get_chat_sessions_older_than
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.enums import TaskStatus
from onyx.db.tasks import mark_task_as_finished_with_id
from onyx.db.tasks import register_task

View File

@@ -20,39 +20,36 @@ from shared_configs.configs import MULTI_TENANT
ee_beat_system_tasks: list[dict] = []
ee_beat_task_templates: list[dict] = []
ee_beat_task_templates.extend(
    [
        {
            "name": "autogenerate-usage-report",
            "task": OnyxCeleryTask.AUTOGENERATE_USAGE_REPORT_TASK,
            "schedule": timedelta(days=30),
            "options": {
                "priority": OnyxCeleryPriority.MEDIUM,
                "expires": BEAT_EXPIRES_DEFAULT,
            },
        },
        {
            "name": "check-ttl-management",
            "task": OnyxCeleryTask.CHECK_TTL_MANAGEMENT_TASK,
            "schedule": timedelta(hours=CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS),
            "options": {
                "priority": OnyxCeleryPriority.MEDIUM,
                "expires": BEAT_EXPIRES_DEFAULT,
            },
        },
        {
            "name": "export-query-history-cleanup-task",
            "task": OnyxCeleryTask.EXPORT_QUERY_HISTORY_CLEANUP_TASK,
            "schedule": timedelta(hours=1),
            "options": {
                "priority": OnyxCeleryPriority.MEDIUM,
                "expires": BEAT_EXPIRES_DEFAULT,
                "queue": OnyxCeleryQueues.CSV_GENERATION,
            },
        },
    ]
)
ee_beat_task_templates: list[dict] = [
    {
        "name": "autogenerate-usage-report",
        "task": OnyxCeleryTask.AUTOGENERATE_USAGE_REPORT_TASK,
        "schedule": timedelta(days=30),
        "options": {
            "priority": OnyxCeleryPriority.MEDIUM,
            "expires": BEAT_EXPIRES_DEFAULT,
        },
    },
    {
        "name": "check-ttl-management",
        "task": OnyxCeleryTask.CHECK_TTL_MANAGEMENT_TASK,
        "schedule": timedelta(hours=CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS),
        "options": {
            "priority": OnyxCeleryPriority.MEDIUM,
            "expires": BEAT_EXPIRES_DEFAULT,
        },
    },
    {
        "name": "export-query-history-cleanup-task",
        "task": OnyxCeleryTask.EXPORT_QUERY_HISTORY_CLEANUP_TASK,
        "schedule": timedelta(hours=1),
        "options": {
            "priority": OnyxCeleryPriority.MEDIUM,
            "expires": BEAT_EXPIRES_DEFAULT,
            "queue": OnyxCeleryQueues.CSV_GENERATION,
        },
    },
]
ee_tasks_to_schedule: list[dict] = []

View File

@@ -6,7 +6,7 @@ from celery import shared_task
from ee.onyx.db.query_history import get_all_query_history_export_tasks
from onyx.configs.app_configs import JOB_TIMEOUT
from onyx.configs.constants import OnyxCeleryTask
from onyx.db.engine import get_session_with_tenant
from onyx.db.engine.sql_engine import get_session_with_tenant
from onyx.db.enums import TaskStatus
from onyx.db.tasks import delete_task_with_id
from onyx.utils.logger import setup_logger

View File

@@ -13,7 +13,7 @@ from onyx.configs.constants import ONYX_CLOUD_TENANT_ID
from onyx.configs.constants import OnyxCeleryPriority
from onyx.configs.constants import OnyxCeleryTask
from onyx.configs.constants import OnyxRedisLocks
from onyx.db.engine import get_all_tenant_ids
from onyx.db.engine.tenant_utils import get_all_tenant_ids
from onyx.redis.redis_pool import get_redis_client
from onyx.redis.redis_pool import redis_lock_dump
from shared_configs.configs import IGNORED_SYNCING_TENANT_LIST

View File

@@ -30,6 +30,7 @@ from onyx.background.celery.celery_redis import celery_find_task
from onyx.background.celery.celery_redis import celery_get_queue_length
from onyx.background.celery.celery_redis import celery_get_queued_task_ids
from onyx.background.celery.celery_redis import celery_get_unacked_task_ids
from onyx.background.celery.tasks.beat_schedule import CLOUD_BEAT_MULTIPLIER_DEFAULT
from onyx.configs.app_configs import JOB_TIMEOUT
from onyx.configs.constants import CELERY_GENERIC_BEAT_LOCK_TIMEOUT
from onyx.configs.constants import CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT
@@ -47,8 +48,8 @@ from onyx.db.connector import mark_cc_pair_as_permissions_synced
from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
from onyx.db.document import get_document_ids_for_connector_credential_pair
from onyx.db.document import upsert_document_by_connector_credential_pair
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.engine import get_session_with_tenant
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.engine.sql_engine import get_session_with_tenant
from onyx.db.enums import AccessType
from onyx.db.enums import ConnectorCredentialPairStatus
from onyx.db.enums import SyncStatus
@@ -73,6 +74,7 @@ from onyx.utils.logger import LoggerContextVars
from onyx.utils.logger import setup_logger
from onyx.utils.telemetry import optional_telemetry
from onyx.utils.telemetry import RecordType
from shared_configs.configs import MULTI_TENANT
logger = setup_logger()
@@ -87,6 +89,24 @@ LIGHT_SOFT_TIME_LIMIT = 105
LIGHT_TIME_LIMIT = LIGHT_SOFT_TIME_LIMIT + 15
def _get_fence_validation_block_expiration() -> int:
"""
Compute the expiration time for the fence validation block signal.
Base expiration is 300 seconds, multiplied by the beat multiplier only in MULTI_TENANT mode.
"""
base_expiration = 300 # seconds
if not MULTI_TENANT:
return base_expiration
try:
beat_multiplier = OnyxRuntime.get_beat_multiplier()
except Exception:
beat_multiplier = CLOUD_BEAT_MULTIPLIER_DEFAULT
return int(base_expiration * beat_multiplier)
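
A standalone sketch of the expiration logic above, with the mode and multiplier passed in explicitly (the real helper reads them from `MULTI_TENANT` and `OnyxRuntime`):

```python
def fence_block_expiration(multi_tenant: bool, beat_multiplier: float) -> int:
    # Base expiration is 300 seconds; it is scaled by the beat multiplier
    # only when running in multi-tenant mode.
    base_expiration = 300  # seconds
    if not multi_tenant:
        return base_expiration
    return int(base_expiration * beat_multiplier)
```

With a beat multiplier of 2.0 in multi-tenant mode the signal lives for 600 seconds; single-tenant deployments always get the 300-second base.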
"""Jobs / utils for kicking off doc permissions sync tasks."""
@@ -194,7 +214,11 @@ def check_for_doc_permissions_sync(self: Task, *, tenant_id: str) -> bool | None
"Exception while validating permission sync fences"
)
r.set(OnyxRedisSignals.BLOCK_VALIDATE_PERMISSION_SYNC_FENCES, 1, ex=300)
r.set(
OnyxRedisSignals.BLOCK_VALIDATE_PERMISSION_SYNC_FENCES,
1,
ex=_get_fence_validation_block_expiration(),
)
# use a lookup table to find active fences. We still have to verify the fence
# exists since it is an optimization and not the source of truth.
@@ -398,7 +422,7 @@ def connector_permission_sync_generator_task(
lock: RedisLock = r.lock(
OnyxRedisLocks.CONNECTOR_DOC_PERMISSIONS_SYNC_LOCK_PREFIX
+ f"_{redis_connector.id}",
+ f"_{redis_connector.cc_pair_id}",
timeout=CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT,
thread_local=False,
)
@@ -425,6 +449,7 @@ def connector_permission_sync_generator_task(
created = validate_ccpair_for_user(
cc_pair.connector.id,
cc_pair.credential.id,
cc_pair.access_type,
db_session,
enforce_creation=False,
)
@@ -597,91 +622,6 @@ def document_update_permissions(
return True
# NOTE(rkuo): Deprecating this due to degenerate behavior in Redis from sending
# large permissions through celery (over 1MB in size)
# @shared_task(
# name=OnyxCeleryTask.UPDATE_EXTERNAL_DOCUMENT_PERMISSIONS_TASK,
# soft_time_limit=LIGHT_SOFT_TIME_LIMIT,
# time_limit=LIGHT_TIME_LIMIT,
# max_retries=DOCUMENT_PERMISSIONS_UPDATE_MAX_RETRIES,
# bind=True,
# )
# def update_external_document_permissions_task(
# self: Task,
# tenant_id: str,
# serialized_doc_external_access: dict,
# source_string: str,
# connector_id: int,
# credential_id: int,
# ) -> bool:
# start = time.monotonic()
# completion_status = OnyxCeleryTaskCompletionStatus.UNDEFINED
# document_external_access = DocExternalAccess.from_dict(
# serialized_doc_external_access
# )
# doc_id = document_external_access.doc_id
# external_access = document_external_access.external_access
# try:
# with get_session_with_current_tenant() as db_session:
# # Add the users to the DB if they don't exist
# batch_add_ext_perm_user_if_not_exists(
# db_session=db_session,
# emails=list(external_access.external_user_emails),
# continue_on_error=True,
# )
# # Then upsert the document's external permissions
# created_new_doc = upsert_document_external_perms(
# db_session=db_session,
# doc_id=doc_id,
# external_access=external_access,
# source_type=DocumentSource(source_string),
# )
# if created_new_doc:
# # If a new document was created, we associate it with the cc_pair
# upsert_document_by_connector_credential_pair(
# db_session=db_session,
# connector_id=connector_id,
# credential_id=credential_id,
# document_ids=[doc_id],
# )
# elapsed = time.monotonic() - start
# task_logger.info(
# f"connector_id={connector_id} "
# f"doc={doc_id} "
# f"action=update_permissions "
# f"elapsed={elapsed:.2f}"
# )
# completion_status = OnyxCeleryTaskCompletionStatus.SUCCEEDED
# except Exception as e:
# error_msg = format_error_for_logging(e)
# task_logger.warning(
# f"Exception in update_external_document_permissions_task: connector_id={connector_id} doc_id={doc_id} {error_msg}"
# )
# task_logger.exception(
# f"update_external_document_permissions_task exceptioned: "
# f"connector_id={connector_id} doc_id={doc_id}"
# )
# completion_status = OnyxCeleryTaskCompletionStatus.NON_RETRYABLE_EXCEPTION
# finally:
# task_logger.info(
# f"update_external_document_permissions_task completed: status={completion_status.value} doc={doc_id}"
# )
# if completion_status != OnyxCeleryTaskCompletionStatus.SUCCEEDED:
# return False
# task_logger.info(
# f"update_external_document_permissions_task finished: connector_id={connector_id} doc_id={doc_id}"
# )
# return True
def validate_permission_sync_fences(
tenant_id: str,
r: Redis,

View File

@@ -20,7 +20,9 @@ from ee.onyx.background.celery.tasks.external_group_syncing.group_sync_utils imp
from ee.onyx.db.connector_credential_pair import get_all_auto_sync_cc_pairs
from ee.onyx.db.connector_credential_pair import get_cc_pairs_by_source
from ee.onyx.db.external_perm import ExternalUserGroup
from ee.onyx.db.external_perm import replace_user__ext_group_for_cc_pair
from ee.onyx.db.external_perm import mark_old_external_groups_as_stale
from ee.onyx.db.external_perm import remove_stale_external_groups
from ee.onyx.db.external_perm import upsert_external_groups
from ee.onyx.external_permissions.sync_params import (
get_all_cc_pair_agnostic_group_sync_sources,
)
@@ -28,6 +30,7 @@ from ee.onyx.external_permissions.sync_params import get_source_perm_sync_config
from onyx.background.celery.apps.app_base import task_logger
from onyx.background.celery.celery_redis import celery_find_task
from onyx.background.celery.celery_redis import celery_get_unacked_task_ids
from onyx.background.celery.tasks.beat_schedule import CLOUD_BEAT_MULTIPLIER_DEFAULT
from onyx.background.error_logging import emit_background_error
from onyx.configs.app_configs import JOB_TIMEOUT
from onyx.configs.constants import CELERY_EXTERNAL_GROUP_SYNC_LOCK_TIMEOUT
@@ -39,9 +42,8 @@ from onyx.configs.constants import OnyxCeleryTask
from onyx.configs.constants import OnyxRedisConstants
from onyx.configs.constants import OnyxRedisLocks
from onyx.configs.constants import OnyxRedisSignals
from onyx.connectors.exceptions import ConnectorValidationError
from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.enums import AccessType
from onyx.db.enums import ConnectorCredentialPairStatus
from onyx.db.enums import SyncStatus
@@ -56,19 +58,34 @@ from onyx.redis.redis_connector_ext_group_sync import (
)
from onyx.redis.redis_pool import get_redis_client
from onyx.redis.redis_pool import get_redis_replica_client
from onyx.server.runtime.onyx_runtime import OnyxRuntime
from onyx.server.utils import make_short_id
from onyx.utils.logger import format_error_for_logging
from onyx.utils.logger import setup_logger
from shared_configs.configs import MULTI_TENANT
logger = setup_logger()
EXTERNAL_GROUPS_UPDATE_MAX_RETRIES = 3
_EXTERNAL_GROUP_BATCH_SIZE = 100
# 5 seconds more than RetryDocumentIndex STOP_AFTER+MAX_WAIT
LIGHT_SOFT_TIME_LIMIT = 105
LIGHT_TIME_LIMIT = LIGHT_SOFT_TIME_LIMIT + 15
def _get_fence_validation_block_expiration() -> int:
"""
Compute the expiration time for the fence validation block signal.
Base expiration is 300 seconds, multiplied by the beat multiplier only in MULTI_TENANT mode.
"""
base_expiration = 300 # seconds
if not MULTI_TENANT:
return base_expiration
try:
beat_multiplier = OnyxRuntime.get_beat_multiplier()
except Exception:
beat_multiplier = CLOUD_BEAT_MULTIPLIER_DEFAULT
return int(base_expiration * beat_multiplier)
def _is_external_group_sync_due(cc_pair: ConnectorCredentialPair) -> bool:
@@ -198,7 +215,11 @@ def check_for_external_group_sync(self: Task, *, tenant_id: str) -> bool | None:
"Exception while validating external group sync fences"
)
r.set(OnyxRedisSignals.BLOCK_VALIDATE_EXTERNAL_GROUP_SYNC_FENCES, 1, ex=300)
r.set(
OnyxRedisSignals.BLOCK_VALIDATE_EXTERNAL_GROUP_SYNC_FENCES,
1,
ex=_get_fence_validation_block_expiration(),
)
except SoftTimeLimitExceeded:
task_logger.info(
"Soft time limit exceeded, task is being terminated gracefully."
@@ -362,7 +383,7 @@ def connector_external_group_sync_generator_task(
lock: RedisLock = r.lock(
OnyxRedisLocks.CONNECTOR_EXTERNAL_GROUP_SYNC_LOCK_PREFIX
+ f"_{redis_connector.id}",
+ f"_{redis_connector.cc_pair_id}",
timeout=CELERY_EXTERNAL_GROUP_SYNC_LOCK_TIMEOUT,
)
@@ -377,63 +398,12 @@ def connector_external_group_sync_generator_task(
payload.started = datetime.now(timezone.utc)
redis_connector.external_group_sync.set_fence(payload)
_perform_external_group_sync(
cc_pair_id=cc_pair_id,
tenant_id=tenant_id,
)
with get_session_with_current_tenant() as db_session:
cc_pair = get_connector_credential_pair_from_id(
db_session=db_session,
cc_pair_id=cc_pair_id,
eager_load_credential=True,
)
if cc_pair is None:
raise ValueError(
f"No connector credential pair found for id: {cc_pair_id}"
)
source_type = cc_pair.connector.source
sync_config = get_source_perm_sync_config(source_type)
if sync_config is None:
msg = (
f"No sync config found for {source_type} for cc_pair: {cc_pair_id}"
)
emit_background_error(msg, cc_pair_id=cc_pair_id)
raise ValueError(msg)
if sync_config.group_sync_config is None:
msg = f"No group sync config found for {source_type} for cc_pair: {cc_pair_id}"
emit_background_error(msg, cc_pair_id=cc_pair_id)
raise ValueError(msg)
ext_group_sync_func = sync_config.group_sync_config.group_sync_func
logger.info(
f"Syncing external groups for {source_type} for cc_pair: {cc_pair_id}"
)
external_user_groups: list[ExternalUserGroup] = []
try:
external_user_groups = ext_group_sync_func(tenant_id, cc_pair)
except ConnectorValidationError as e:
# TODO: add some notification to the admins here
logger.exception(
f"Error syncing external groups for {source_type} for cc_pair: {cc_pair_id} {e}"
)
raise e
logger.info(
f"Syncing {len(external_user_groups)} external user groups for {source_type}"
)
logger.debug(f"New external user groups: {external_user_groups}")
replace_user__ext_group_for_cc_pair(
db_session=db_session,
cc_pair_id=cc_pair.id,
group_defs=external_user_groups,
source=cc_pair.connector.source,
)
logger.info(
f"Synced {len(external_user_groups)} external user groups for {source_type}"
)
mark_all_relevant_cc_pairs_as_external_group_synced(db_session, cc_pair)
update_sync_record_status(
db_session=db_session,
entity_id=cc_pair_id,
@@ -475,6 +445,81 @@ def connector_external_group_sync_generator_task(
)
def _perform_external_group_sync(
cc_pair_id: int,
tenant_id: str,
) -> None:
with get_session_with_current_tenant() as db_session:
cc_pair = get_connector_credential_pair_from_id(
db_session=db_session,
cc_pair_id=cc_pair_id,
eager_load_credential=True,
)
if cc_pair is None:
raise ValueError(f"No connector credential pair found for id: {cc_pair_id}")
source_type = cc_pair.connector.source
sync_config = get_source_perm_sync_config(source_type)
if sync_config is None:
msg = f"No sync config found for {source_type} for cc_pair: {cc_pair_id}"
emit_background_error(msg, cc_pair_id=cc_pair_id)
raise ValueError(msg)
if sync_config.group_sync_config is None:
msg = f"No group sync config found for {source_type} for cc_pair: {cc_pair_id}"
emit_background_error(msg, cc_pair_id=cc_pair_id)
raise ValueError(msg)
ext_group_sync_func = sync_config.group_sync_config.group_sync_func
logger.info(
f"Marking old external groups as stale for {source_type} for cc_pair: {cc_pair_id}"
)
mark_old_external_groups_as_stale(db_session, cc_pair_id)
logger.info(
f"Syncing external groups for {source_type} for cc_pair: {cc_pair_id}"
)
external_user_group_batch: list[ExternalUserGroup] = []
try:
external_user_group_generator = ext_group_sync_func(tenant_id, cc_pair)
for external_user_group in external_user_group_generator:
external_user_group_batch.append(external_user_group)
if len(external_user_group_batch) >= _EXTERNAL_GROUP_BATCH_SIZE:
logger.debug(
f"New external user groups: {external_user_group_batch}"
)
upsert_external_groups(
db_session=db_session,
cc_pair_id=cc_pair_id,
external_groups=external_user_group_batch,
source=cc_pair.connector.source,
)
external_user_group_batch = []
if external_user_group_batch:
logger.debug(f"New external user groups: {external_user_group_batch}")
upsert_external_groups(
db_session=db_session,
cc_pair_id=cc_pair_id,
external_groups=external_user_group_batch,
source=cc_pair.connector.source,
)
except Exception as e:
# TODO: add some notification to the admins here
logger.exception(
f"Error syncing external groups for {source_type} for cc_pair: {cc_pair_id} {e}"
)
raise e
logger.info(
f"Removing stale external groups for {source_type} for cc_pair: {cc_pair_id}"
)
remove_stale_external_groups(db_session, cc_pair_id)
mark_all_relevant_cc_pairs_as_external_group_synced(db_session, cc_pair)
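The batch-flush loop in `_perform_external_group_sync` above follows a common pattern: accumulate items from a generator, flush when a size threshold is hit, then flush the remainder. A minimal generic sketch of that pattern (the threshold constant and the DB sink are stand-ins for `_EXTERNAL_GROUP_BATCH_SIZE` and `upsert_external_groups`):

```python
from collections.abc import Iterable, Iterator
from typing import TypeVar

T = TypeVar("T")

def batched(items: Iterable[T], batch_size: int) -> Iterator[list[T]]:
    """Yield lists of up to `batch_size` items, flushing any final remainder."""
    batch: list[T] = []
    for item in items:
        batch.append(item)
        if len(batch) >= batch_size:
            yield batch
            batch = []
    if batch:  # flush the final partial batch, mirroring the post-loop upsert
        yield batch

# e.g. flush groups to the DB in bounded batches instead of all at once
batches = list(batched(range(7), 3))
```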
def validate_external_group_sync_fences(
tenant_id: str,
celery_app: Celery,

View File

@@ -19,7 +19,7 @@ from onyx.configs.constants import ONYX_CLOUD_TENANT_ID
from onyx.configs.constants import OnyxCeleryQueues
from onyx.configs.constants import OnyxCeleryTask
from onyx.configs.constants import OnyxRedisLocks
from onyx.db.engine import get_session_with_shared_schema
from onyx.db.engine.sql_engine import get_session_with_shared_schema
from onyx.db.models import AvailableTenant
from onyx.redis.redis_pool import get_redis_client
from shared_configs.configs import MULTI_TENANT

View File

@@ -53,6 +53,16 @@ CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC = (
)
#####
# JIRA
#####
# In seconds, default is 30 minutes
JIRA_PERMISSION_DOC_SYNC_FREQUENCY = int(
os.environ.get("JIRA_PERMISSION_DOC_SYNC_FREQUENCY") or 30 * 60
)
#####
# Google Drive
#####
@@ -71,6 +81,15 @@ SLACK_PERMISSION_DOC_SYNC_FREQUENCY = int(
NUM_PERMISSION_WORKERS = int(os.environ.get("NUM_PERMISSION_WORKERS") or 2)
#####
# Teams
#####
# In seconds, default is 5 minutes
TEAMS_PERMISSION_DOC_SYNC_FREQUENCY = int(
os.environ.get("TEAMS_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60
)
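The `int(os.environ.get(...) or default)` idiom used for these frequency settings differs from `os.environ.get(key, default)`: the `or` form also falls back when the variable is set but empty, which would otherwise crash `int()`. A quick illustration (the variable name here is made up):

```python
import os

# hypothetical variable name, for illustration only
os.environ["EXAMPLE_SYNC_FREQUENCY"] = ""

# `get` with a default keeps the empty string, since the key *is* set
raw = os.environ.get("EXAMPLE_SYNC_FREQUENCY", 5 * 60)

# `or` treats the empty string as falsy and uses the fallback instead
frequency = int(os.environ.get("EXAMPLE_SYNC_FREQUENCY") or 5 * 60)
```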
####
# Celery Job Frequency
####

View File

@@ -0,0 +1,28 @@
from onyx.connectors.confluence.connector import ConfluenceConnector
from onyx.connectors.google_drive.connector import GoogleDriveConnector
from onyx.connectors.interfaces import BaseConnector
def validate_confluence_perm_sync(connector: ConfluenceConnector) -> None:
"""
Validate that the connector is configured correctly for permissions syncing.
"""
def validate_drive_perm_sync(connector: GoogleDriveConnector) -> None:
"""
Validate that the connector is configured correctly for permissions syncing.
"""
def validate_perm_sync(connector: BaseConnector) -> None:
"""
Override this if your connector needs to validate permissions syncing.
Raise an exception if invalid, otherwise do nothing.
Default is a no-op (always successful).
"""
if isinstance(connector, ConfluenceConnector):
validate_confluence_perm_sync(connector)
elif isinstance(connector, GoogleDriveConnector):
validate_drive_perm_sync(connector)

View File

@@ -4,6 +4,7 @@ from uuid import UUID
from pydantic import BaseModel
from sqlalchemy import delete
from sqlalchemy import select
from sqlalchemy import update
from sqlalchemy.orm import Session
from onyx.access.utils import build_ext_group_name_for_onyx
@@ -62,20 +63,41 @@ def delete_public_external_group_for_cc_pair__no_commit(
)
def replace_user__ext_group_for_cc_pair(
def mark_old_external_groups_as_stale(
db_session: Session,
cc_pair_id: int,
group_defs: list[ExternalUserGroup],
) -> None:
db_session.execute(
update(User__ExternalUserGroupId)
.where(User__ExternalUserGroupId.cc_pair_id == cc_pair_id)
.values(stale=True)
)
db_session.execute(
update(PublicExternalUserGroup)
.where(PublicExternalUserGroup.cc_pair_id == cc_pair_id)
.values(stale=True)
)
def upsert_external_groups(
db_session: Session,
cc_pair_id: int,
external_groups: list[ExternalUserGroup],
source: DocumentSource,
) -> None:
"""
This function clears all existing external user group relations for a given cc_pair_id
and replaces them with the new group definitions and commits the changes.
Performs a true upsert operation for external user groups:
- For existing groups (same user_id, external_user_group_id, cc_pair_id), updates the stale flag to False
- For new groups, inserts them with stale=False
- For public groups, uses upsert logic as well
"""
# If there are no groups to add, return early
if not external_groups:
return
# collect all emails from all groups to batch add all users at once for efficiency
all_group_member_emails = set()
for external_group in group_defs:
for external_group in external_groups:
for user_email in external_group.user_emails:
all_group_member_emails.add(user_email)
@@ -86,26 +108,17 @@ def replace_user__ext_group_for_cc_pair(
emails=list(all_group_member_emails),
)
delete_user__ext_group_for_cc_pair__no_commit(
db_session=db_session,
cc_pair_id=cc_pair_id,
)
delete_public_external_group_for_cc_pair__no_commit(
db_session=db_session,
cc_pair_id=cc_pair_id,
)
# map emails to ids
email_id_map = {user.email: user.id for user in all_group_members}
email_id_map = {user.email.lower(): user.id for user in all_group_members}
# use these ids to create new external user group relations relating group_id to user_ids
new_external_permissions: list[User__ExternalUserGroupId] = []
new_public_external_groups: list[PublicExternalUserGroup] = []
for external_group in group_defs:
# Process each external group
for external_group in external_groups:
external_group_id = build_ext_group_name_for_onyx(
ext_group_name=external_group.id,
source=source,
)
# Handle user-group mappings
for user_email in external_group.user_emails:
user_id = email_id_map.get(user_email.lower())
if user_id is None:
@@ -114,24 +127,71 @@ def replace_user__ext_group_for_cc_pair(
f" with email {user_email} not found"
)
continue
new_external_permissions.append(
User__ExternalUserGroupId(
# Check if the user-group mapping already exists
existing_user_group = db_session.scalar(
select(User__ExternalUserGroupId).where(
User__ExternalUserGroupId.user_id == user_id,
User__ExternalUserGroupId.external_user_group_id
== external_group_id,
User__ExternalUserGroupId.cc_pair_id == cc_pair_id,
)
)
if existing_user_group:
# Update existing record
existing_user_group.stale = False
else:
# Insert new record
new_user_group = User__ExternalUserGroupId(
user_id=user_id,
external_user_group_id=external_group_id,
cc_pair_id=cc_pair_id,
stale=False,
)
db_session.add(new_user_group)
# Handle public group if needed
if external_group.gives_anyone_access:
# Check if the public group already exists
existing_public_group = db_session.scalar(
select(PublicExternalUserGroup).where(
PublicExternalUserGroup.external_user_group_id == external_group_id,
PublicExternalUserGroup.cc_pair_id == cc_pair_id,
)
)
if external_group.gives_anyone_access:
new_public_external_groups.append(
PublicExternalUserGroup(
if existing_public_group:
# Update existing record
existing_public_group.stale = False
else:
# Insert new record
new_public_group = PublicExternalUserGroup(
external_user_group_id=external_group_id,
cc_pair_id=cc_pair_id,
stale=False,
)
)
db_session.add(new_public_group)
db_session.add_all(new_external_permissions)
db_session.add_all(new_public_external_groups)
db_session.commit()
def remove_stale_external_groups(
db_session: Session,
cc_pair_id: int,
) -> None:
db_session.execute(
delete(User__ExternalUserGroupId).where(
User__ExternalUserGroupId.cc_pair_id == cc_pair_id,
User__ExternalUserGroupId.stale.is_(True),
)
)
db_session.execute(
delete(PublicExternalUserGroup).where(
PublicExternalUserGroup.cc_pair_id == cc_pair_id,
PublicExternalUserGroup.stale.is_(True),
)
)
db_session.commit()
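Taken together, `mark_old_external_groups_as_stale`, `upsert_external_groups`, and `remove_stale_external_groups` implement a mark-and-sweep style reconciliation: flag everything stale, un-flag whatever the source still reports (inserting anything new), then delete what stayed stale. A minimal in-memory sketch of the same idea, with a plain dict standing in for the DB tables:

```python
def reconcile(store: dict[str, bool], fetched_ids: list[str]) -> dict[str, bool]:
    """`store` maps group id -> stale flag; returns the store after one sync."""
    # 1) mark: everything currently known becomes stale
    for group_id in store:
        store[group_id] = True
    # 2) upsert: anything the source still reports becomes fresh again
    for group_id in fetched_ids:
        store[group_id] = False
    # 3) sweep: whatever stayed stale no longer exists upstream, so drop it
    return {gid: stale for gid, stale in store.items() if not stale}

state = {"eng": False, "sales": False}
state = reconcile(state, ["eng", "support"])  # "sales" is gone, "support" is new
```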

View File

@@ -114,12 +114,24 @@ def get_all_usage_reports(db_session: Session) -> list[UsageReportMetadata]:
def get_usage_report_data(
db_session: Session,
report_name: str,
report_display_name: str,
) -> IO:
file_store = get_default_file_store(db_session)
"""
Get the usage report data from the file store.
Args:
db_session: The database session.
report_display_name: The display name of the usage report; this is also
assumed to be the file's ID in the file store.
Returns:
The usage report data.
"""
file_store = get_default_file_store()
# usage report may be very large, so don't load it all into memory
return file_store.read_file(file_name=report_name, mode="b", use_tempfile=True)
return file_store.read_file(
file_id=report_display_name, mode="b", use_tempfile=True
)
def write_usage_report(

View File

@@ -128,11 +128,14 @@ def validate_object_creation_for_user(
target_group_ids: list[int] | None = None,
object_is_public: bool | None = None,
object_is_perm_sync: bool | None = None,
object_is_owned_by_user: bool = False,
object_is_new: bool = False,
) -> None:
"""
All users can create/edit permission synced objects if they don't specify a group
All admin actions are allowed.
Prevents non-admins from creating/editing:
Curators and global curators can create public objects.
Prevents other non-admins from creating/editing:
- public objects
- objects with no groups
- objects that belong to a group they don't curate
@@ -143,13 +146,23 @@ def validate_object_creation_for_user(
if not user or user.role == UserRole.ADMIN:
return
if object_is_public:
detail = "User does not have permission to create public credentials"
# Allow curators and global curators to create public objects
# w/o associated groups IF the object is new/owned by them
if (
object_is_public
and user.role in [UserRole.CURATOR, UserRole.GLOBAL_CURATOR]
and (object_is_new or object_is_owned_by_user)
):
return
if object_is_public and user.role == UserRole.BASIC:
detail = "User does not have permission to create public objects"
logger.error(detail)
raise HTTPException(
status_code=400,
detail=detail,
)
if not target_group_ids:
detail = "Curators must specify 1+ groups"
logger.error(detail)

View File

@@ -6,11 +6,11 @@ https://confluence.atlassian.com/conf85/check-who-can-view-a-page-1283360557.htm
from collections.abc import Generator
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
from ee.onyx.external_permissions.utils import generic_doc_sync
from onyx.access.models import DocExternalAccess
from onyx.access.models import ExternalAccess
from onyx.configs.constants import DocumentSource
from onyx.connectors.confluence.connector import ConfluenceConnector
from onyx.connectors.credentials_provider import OnyxDBCredentialsProvider
from onyx.connectors.models import SlimDocument
from onyx.db.models import ConnectorCredentialPair
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
@@ -19,6 +19,9 @@ from shared_configs.contextvars import get_current_tenant_id
logger = setup_logger()
CONFLUENCE_DOC_SYNC_LABEL = "confluence_doc_sync"
def confluence_doc_sync(
cc_pair: ConnectorCredentialPair,
fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
@@ -29,7 +32,6 @@ def confluence_doc_sync(
Compares fetched documents against existing documents in the DB for the connector.
If a document exists in the DB but not in the Confluence fetch, it's marked as restricted.
"""
logger.info(f"Starting confluence doc sync for CC Pair ID: {cc_pair.id}")
confluence_connector = ConfluenceConnector(
**cc_pair.connector.connector_specific_config
)
@@ -39,52 +41,11 @@ def confluence_doc_sync(
)
confluence_connector.set_credentials_provider(provider)
slim_docs: list[SlimDocument] = []
logger.info("Fetching all slim documents from confluence")
for doc_batch in confluence_connector.retrieve_all_slim_documents(
callback=callback
):
logger.info(f"Got {len(doc_batch)} slim documents from confluence")
if callback:
if callback.should_stop():
raise RuntimeError("confluence_doc_sync: Stop signal detected")
callback.progress("confluence_doc_sync", 1)
slim_docs.extend(doc_batch)
# Find documents that are no longer accessible in Confluence
logger.info(f"Querying existing document IDs for CC Pair ID: {cc_pair.id}")
existing_doc_ids = fetch_all_existing_docs_fn()
# Find missing doc IDs
fetched_doc_ids = {doc.id for doc in slim_docs}
missing_doc_ids = set(existing_doc_ids) - fetched_doc_ids
# Yield access removal for missing docs. Better to be safe.
if missing_doc_ids:
logger.warning(
f"Found {len(missing_doc_ids)} documents that are in the DB but "
"not present in Confluence fetch. Making them inaccessible."
)
for missing_id in missing_doc_ids:
logger.warning(f"Removing access for document ID: {missing_id}")
yield DocExternalAccess(
doc_id=missing_id,
external_access=ExternalAccess(
external_user_emails=set(),
external_user_group_ids=set(),
is_public=False,
),
)
for doc in slim_docs:
if not doc.external_access:
raise RuntimeError(f"No external access found for document ID: {doc.id}")
yield DocExternalAccess(
doc_id=doc.id,
external_access=doc.external_access,
)
logger.info("Finished confluence doc sync")
yield from generic_doc_sync(
cc_pair=cc_pair,
fetch_all_existing_docs_fn=fetch_all_existing_docs_fn,
callback=callback,
doc_source=DocumentSource.CONFLUENCE,
slim_connector=confluence_connector,
label=CONFLUENCE_DOC_SYNC_LABEL,
)

View File

@@ -1,3 +1,5 @@
from collections.abc import Generator
from ee.onyx.db.external_perm import ExternalUserGroup
from ee.onyx.external_permissions.confluence.constants import ALL_CONF_EMAILS_GROUP_NAME
from onyx.background.error_logging import emit_background_error
@@ -65,7 +67,7 @@ def _build_group_member_email_map(
def confluence_group_sync(
tenant_id: str,
cc_pair: ConnectorCredentialPair,
) -> list[ExternalUserGroup]:
) -> Generator[ExternalUserGroup, None, None]:
provider = OnyxDBCredentialsProvider(tenant_id, "confluence", cc_pair.credential_id)
is_cloud = cc_pair.connector.connector_specific_config.get("is_cloud", False)
wiki_base: str = cc_pair.connector.connector_specific_config["wiki_base"]
@@ -89,10 +91,10 @@ def confluence_group_sync(
confluence_client=confluence_client,
cc_pair_id=cc_pair.id,
)
onyx_groups: list[ExternalUserGroup] = []
all_found_emails = set()
for group_id, group_member_emails in group_member_email_map.items():
onyx_groups.append(
yield (
ExternalUserGroup(
id=group_id,
user_emails=list(group_member_emails),
@@ -107,6 +109,4 @@ def confluence_group_sync(
id=ALL_CONF_EMAILS_GROUP_NAME,
user_emails=list(all_found_emails),
)
onyx_groups.append(all_found_group)
return onyx_groups
yield all_found_group

View File

@@ -40,8 +40,28 @@ def _get_slim_doc_generator(
)
def _merge_permissions_lists(
permission_lists: list[list[GoogleDrivePermission]],
) -> list[GoogleDrivePermission]:
"""
Merge a list of permission lists into a single list of permissions.
"""
seen_permission_ids: set[str] = set()
merged_permissions: list[GoogleDrivePermission] = []
for permission_list in permission_lists:
for permission in permission_list:
if permission.id not in seen_permission_ids:
merged_permissions.append(permission)
seen_permission_ids.add(permission.id)
return merged_permissions
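`_merge_permissions_lists` keeps the first occurrence of each permission id across the input lists. The same first-wins dedup can be sketched generically, using `(id, role)` tuples as stand-ins for `GoogleDrivePermission` objects:

```python
from collections.abc import Callable, Iterable
from typing import TypeVar

T = TypeVar("T")

def merge_dedup(lists: Iterable[list[T]], key: Callable[[T], str]) -> list[T]:
    """Flatten `lists`, keeping only the first item seen for each key."""
    seen: set[str] = set()
    merged: list[T] = []
    for items in lists:
        for item in items:
            k = key(item)
            if k not in seen:
                merged.append(item)
                seen.add(k)
    return merged

perms = merge_dedup(
    [[("p1", "reader"), ("p2", "writer")], [("p1", "owner"), ("p3", "reader")]],
    key=lambda p: p[0],
)
```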
def get_external_access_for_raw_gdrive_file(
file: GoogleDriveFileType, company_domain: str, drive_service: GoogleDriveService
file: GoogleDriveFileType,
company_domain: str,
retriever_drive_service: GoogleDriveService | None,
admin_drive_service: GoogleDriveService,
) -> ExternalAccess:
"""
Get the external access for a raw Google Drive file.
@@ -62,11 +82,28 @@ def get_external_access_for_raw_gdrive_file(
GoogleDrivePermission.from_drive_permission(p) for p in permissions
]
elif permission_ids:
permissions_list = get_permissions_by_ids(
drive_service=drive_service,
doc_id=doc_id,
permission_ids=permission_ids,
def _get_permissions(
drive_service: GoogleDriveService,
) -> list[GoogleDrivePermission]:
return get_permissions_by_ids(
drive_service=drive_service,
doc_id=doc_id,
permission_ids=permission_ids,
)
permissions_list = _get_permissions(
retriever_drive_service or admin_drive_service
)
if len(permissions_list) != len(permission_ids) and retriever_drive_service:
logger.warning(
f"Failed to get all permissions for file {doc_id} with retriever service, "
"trying admin service"
)
backup_permissions_list = _get_permissions(admin_drive_service)
permissions_list = _merge_permissions_lists(
[permissions_list, backup_permissions_list]
)
folder_ids_to_inherit_permissions_from: set[str] = set()
user_emails: set[str] = set()

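The retriever/admin fallback above tries the lower-privilege service first and only merges in results from the admin service when the first fetch comes back short. Sketched abstractly, with plain callables standing in for the two `get_permissions_by_ids` calls:

```python
from collections.abc import Callable
from typing import Optional

def fetch_with_fallback(
    primary: Optional[Callable[[], list[str]]],
    backup: Callable[[], list[str]],
    expected_count: int,
) -> list[str]:
    """Use `primary` if available; top up from `backup` when results are short."""
    results = (primary or backup)()
    if primary is not None and len(results) < expected_count:
        # merge, keeping the first occurrence of each id
        seen = set(results)
        results += [r for r in backup() if r not in seen]
    return results

perm_ids = fetch_with_fallback(lambda: ["a"], lambda: ["a", "b"], expected_count=2)
```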
View File

@@ -1,3 +1,5 @@
from collections.abc import Generator
from googleapiclient.errors import HttpError # type: ignore
from pydantic import BaseModel
@@ -42,11 +44,17 @@ def _get_all_folders(
TODO: tweak things so we can fetch deltas.
"""
MAX_FAILED_PERCENTAGE = 0.5
all_folders: list[FolderInfo] = []
seen_folder_ids: set[str] = set()
user_emails = google_drive_connector._get_all_user_emails()
for user_email in user_emails:
def _get_all_folders_for_user(
google_drive_connector: GoogleDriveConnector,
skip_folders_without_permissions: bool,
user_email: str,
) -> None:
"""Helper to get folders for a specific user + update shared seen_folder_ids"""
drive_service = get_drive_service(
google_drive_connector.creds,
user_email,
@@ -96,9 +104,61 @@ def _get_all_folders(
)
)
failed_count = 0
user_emails = google_drive_connector._get_all_user_emails()
for user_email in user_emails:
try:
_get_all_folders_for_user(
google_drive_connector, skip_folders_without_permissions, user_email
)
except Exception:
logger.exception(f"Error getting folders for user {user_email}")
failed_count += 1
if failed_count > MAX_FAILED_PERCENTAGE * len(user_emails):
raise RuntimeError("Too many failed folder fetches during group sync")
return all_folders
def _drive_folder_to_onyx_group(
folder: FolderInfo,
group_email_to_member_emails_map: dict[str, list[str]],
) -> ExternalUserGroup:
"""
Converts a folder into an Onyx group.
"""
anyone_can_access = False
folder_member_emails: set[str] = set()
for permission in folder.permissions:
if permission.type == PermissionType.USER:
if permission.email_address is None:
logger.warning(
f"User email is None for folder {folder.id} permission {permission}"
)
continue
folder_member_emails.add(permission.email_address)
elif permission.type == PermissionType.GROUP:
if permission.email_address not in group_email_to_member_emails_map:
logger.warning(
f"Group email {permission.email_address} for folder {folder.id} "
"not found in group_email_to_member_emails_map"
)
continue
folder_member_emails.update(
group_email_to_member_emails_map[permission.email_address]
)
elif permission.type == PermissionType.ANYONE:
anyone_can_access = True
return ExternalUserGroup(
id=folder.id,
user_emails=list(folder_member_emails),
gives_anyone_access=anyone_can_access,
)
"""Individual Shared Drive / My Drive Permission Sync"""
@@ -167,7 +227,29 @@ def _get_drive_members(
return drive_id_to_members_map
def _get_all_groups(
def _drive_member_map_to_onyx_groups(
drive_id_to_members_map: dict[str, tuple[set[str], set[str]]],
group_email_to_member_emails_map: dict[str, list[str]],
) -> Generator[ExternalUserGroup, None, None]:
"""The `user_emails` for the Shared Drive should be all individuals in the
Shared Drive + the union of all flattened group emails."""
for drive_id, (group_emails, user_emails) in drive_id_to_members_map.items():
drive_member_emails: set[str] = user_emails
for group_email in group_emails:
if group_email not in group_email_to_member_emails_map:
logger.warning(
f"Group email {group_email} for drive {drive_id} not found in "
"group_email_to_member_emails_map"
)
continue
drive_member_emails.update(group_email_to_member_emails_map[group_email])
yield ExternalUserGroup(
id=drive_id,
user_emails=list(drive_member_emails),
)
def _get_all_google_groups(
admin_service: AdminService,
google_domain: str,
) -> set[str]:
@@ -185,6 +267,28 @@ def _get_all_groups(
return group_emails
def _google_group_to_onyx_group(
admin_service: AdminService,
group_email: str,
) -> ExternalUserGroup:
"""
Converts a single Google group (identified by its email) into an Onyx group containing its member emails.
"""
group_member_emails: set[str] = set()
for member in execute_paginated_retrieval(
admin_service.members().list,
list_key="members",
groupKey=group_email,
fields="members(email),nextPageToken",
):
group_member_emails.add(member["email"])
return ExternalUserGroup(
id=group_email,
user_emails=list(group_member_emails),
)
def _map_group_email_to_member_emails(
admin_service: AdminService,
group_emails: set[str],
@@ -282,7 +386,7 @@ def _build_onyx_groups(
def gdrive_group_sync(
tenant_id: str,
cc_pair: ConnectorCredentialPair,
) -> list[ExternalUserGroup]:
) -> Generator[ExternalUserGroup, None, None]:
# Initialize connector and build credential/service objects
google_drive_connector = GoogleDriveConnector(
**cc_pair.connector.connector_specific_config
@@ -296,26 +400,27 @@ def gdrive_group_sync(
drive_id_to_members_map = _get_drive_members(google_drive_connector, admin_service)
# Get all group emails
all_group_emails = _get_all_groups(
all_group_emails = _get_all_google_groups(
admin_service, google_drive_connector.google_domain
)
# Each google group is an Onyx group, yield those
group_email_to_member_emails_map: dict[str, list[str]] = {}
for group_email in all_group_emails:
onyx_group = _google_group_to_onyx_group(admin_service, group_email)
group_email_to_member_emails_map[group_email] = onyx_group.user_emails
yield onyx_group
# Each drive is a group, yield those
for onyx_group in _drive_member_map_to_onyx_groups(
drive_id_to_members_map, group_email_to_member_emails_map
):
yield onyx_group
# Get all folder permissions
folder_info = _get_all_folders(
google_drive_connector=google_drive_connector,
skip_folders_without_permissions=True,
)
# Map group emails to their members
group_email_to_member_emails_map = _map_group_email_to_member_emails(
admin_service, all_group_emails
)
# Convert the maps to onyx groups
onyx_groups = _build_onyx_groups(
drive_id_to_members_map=drive_id_to_members_map,
group_email_to_member_emails_map=group_email_to_member_emails_map,
folder_info=folder_info,
)
return onyx_groups
for folder in folder_info:
yield _drive_folder_to_onyx_group(folder, group_email_to_member_emails_map)

View File

@@ -0,0 +1,34 @@
from collections.abc import Generator
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
from ee.onyx.external_permissions.utils import generic_doc_sync
from onyx.access.models import DocExternalAccess
from onyx.configs.constants import DocumentSource
from onyx.connectors.jira.connector import JiraConnector
from onyx.db.models import ConnectorCredentialPair
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
logger = setup_logger()
JIRA_DOC_SYNC_TAG = "jira_doc_sync"
def jira_doc_sync(
cc_pair: ConnectorCredentialPair,
fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
callback: IndexingHeartbeatInterface | None = None,
) -> Generator[DocExternalAccess, None, None]:
jira_connector = JiraConnector(
**cc_pair.connector.connector_specific_config,
)
jira_connector.load_credentials(cc_pair.credential.credential_json)
yield from generic_doc_sync(
cc_pair=cc_pair,
fetch_all_existing_docs_fn=fetch_all_existing_docs_fn,
callback=callback,
doc_source=DocumentSource.JIRA,
slim_connector=jira_connector,
label=JIRA_DOC_SYNC_TAG,
)

View File

@@ -0,0 +1,25 @@
from typing import Any
from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic.alias_generators import to_camel
Holder = dict[str, Any]
class Permission(BaseModel):
id: int
permission: str
holder: Holder | None
class User(BaseModel):
account_id: str
email_address: str
display_name: str
active: bool
model_config = ConfigDict(
alias_generator=to_camel,
)

View File

@@ -0,0 +1,209 @@
from collections import defaultdict
from jira import JIRA
from jira.resources import PermissionScheme
from pydantic import ValidationError
from ee.onyx.external_permissions.jira.models import Holder
from ee.onyx.external_permissions.jira.models import Permission
from ee.onyx.external_permissions.jira.models import User
from onyx.access.models import ExternalAccess
from onyx.utils.logger import setup_logger
HolderMap = dict[str, list[Holder]]
logger = setup_logger()
def _build_holder_map(permissions: list[dict]) -> dict[str, list[Holder]]:
"""
A "Holder" in JIRA is a person / entity who "holds" the corresponding permission.
A "Holder" has a type, which can be one of (but is not limited to):
- user (an explicitly whitelisted user)
- projectRole (for project level "roles")
- reporter (the reporter of an issue)
A "Holder" usually has one of the following structures:
- `{ "type": "user", "value": "$USER_ID", "user": { .. }, .. }`
- `{ "type": "projectRole", "value": "$PROJECT_ID", .. }`
When we fetch the PermissionScheme from JIRA, we retrieve a list of "Holder"s.
The list can contain multiple "Holder"s of the same type (e.g., two `"type": "user"` entries,
each corresponding to a different user).
This function constructs a map of "Holder" types to a list of the "Holder"s which contained that type.
Returns:
A dict mapping each "Holder" type to the list of "Holder" instances of that type.
Example:
```
{
"user": [
{ "type": "user", "value": "10000", "user": { .. }, .. },
{ "type": "user", "value": "10001", "user": { .. }, .. },
],
"projectRole": [
{ "type": "projectRole", "value": "10010", .. },
{ "type": "projectRole", "value": "10011", .. },
],
"applicationRole": [
{ "type": "applicationRole" },
],
..
}
```
"""
holder_map: defaultdict[str, list[Holder]] = defaultdict(list)
for raw_perm in permissions:
if not hasattr(raw_perm, "raw"):
logger.warning(f"Expected a 'raw' field, but none was found: {raw_perm=}")
continue
permission = Permission(**raw_perm.raw)
# We only care about ability to browse through projects + issues (not other permissions such as read/write).
if permission.permission != "BROWSE_PROJECTS":
continue
# In order to associate this permission to some Atlassian entity, we need the "Holder".
# If this doesn't exist, then we cannot associate this permission to anyone; just skip.
if not permission.holder:
logger.warning(
f"Expected to find a permission holder, but none was found: {permission=}"
)
continue
type = permission.holder.get("type")
if not type:
logger.warning(
f"Expected to find the type of permission holder, but none was found: {permission=}"
)
continue
holder_map[type].append(permission.holder)
return holder_map
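The grouping that `_build_holder_map` performs is the standard `defaultdict(list)` bucket-by-key idiom. A stripped-down version over plain dicts, with no JIRA client or Pydantic model involved:

```python
from collections import defaultdict

def group_holders_by_type(holders: list[dict]) -> dict[str, list[dict]]:
    """Bucket each holder under its "type" key, skipping untyped entries."""
    holder_map: defaultdict[str, list[dict]] = defaultdict(list)
    for holder in holders:
        holder_type = holder.get("type")
        if not holder_type:
            continue  # cannot attribute this permission to anyone
        holder_map[holder_type].append(holder)
    return dict(holder_map)

holder_map = group_holders_by_type(
    [
        {"type": "user", "value": "10000"},
        {"type": "user", "value": "10001"},
        {"type": "projectRole", "value": "10010"},
        {"value": "no-type"},  # dropped: no "type" key
    ]
)
```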
def _get_user_emails(user_holders: list[Holder]) -> list[str]:
emails = []
for user_holder in user_holders:
if "user" not in user_holder:
continue
raw_user_dict = user_holder["user"]
try:
user_model = User.model_validate(raw_user_dict)
except ValidationError:
logger.error(
"Expected to deserialize the raw user dict into an instance of `User`, but validation failed; "
f"{raw_user_dict=}"
)
continue
emails.append(user_model.email_address)
return emails
def _get_user_emails_from_project_roles(
jira_client: JIRA,
jira_project: str,
project_role_holders: list[Holder],
) -> list[str]:
# NOTE (@raunakab) a `parallel_yield` may be helpful here...?
roles = [
jira_client.project_role(project=jira_project, id=project_role_holder["value"])
for project_role_holder in project_role_holders
if "value" in project_role_holder
]
emails = []
for role in roles:
if not hasattr(role, "actors"):
continue
for actor in role.actors:
if not hasattr(actor, "actorUser") or not hasattr(
actor.actorUser, "accountId"
):
continue
user = jira_client.user(id=actor.actorUser.accountId)
if not hasattr(user, "accountType") or user.accountType != "atlassian":
continue
if not hasattr(user, "emailAddress"):
msg = f"Could not retrieve the user's email address; {actor.actorUser.accountId=}"
if hasattr(user, "displayName"):
msg += f" {user.displayName=}"
logger.warning(msg)
continue
emails.append(user.emailAddress)
return emails
def _build_external_access_from_holder_map(
jira_client: JIRA, jira_project: str, holder_map: HolderMap
) -> ExternalAccess:
"""
Note:
If the `holder_map` contains an "anyone" holder, the JIRA project is public.
Otherwise, we resolve the "projectRole"s (i.e., user groups in JIRA terms) and the explicit user emails.
"""
if "anyone" in holder_map:
return ExternalAccess(
external_user_emails=set(), external_user_group_ids=set(), is_public=True
)
user_emails = (
_get_user_emails(user_holders=holder_map["user"])
if "user" in holder_map
else []
)
project_role_user_emails = (
_get_user_emails_from_project_roles(
jira_client=jira_client,
jira_project=jira_project,
project_role_holders=holder_map["projectRole"],
)
if "projectRole" in holder_map
else []
)
external_user_emails = set(user_emails + project_role_user_emails)
return ExternalAccess(
external_user_emails=external_user_emails,
external_user_group_ids=set(),
is_public=False,
)
def get_project_permissions(
jira_client: JIRA,
jira_project: str,
) -> ExternalAccess | None:
project_permissions: PermissionScheme = jira_client.project_permissionscheme(
project=jira_project
)
if not hasattr(project_permissions, "permissions"):
return None
if not isinstance(project_permissions.permissions, list):
return None
holder_map = _build_holder_map(permissions=project_permissions.permissions)
return _build_external_access_from_holder_map(
jira_client=jira_client, jira_project=jira_project, holder_map=holder_map
)

View File

@@ -39,10 +39,10 @@ DocSyncFuncType = Callable[
GroupSyncFuncType = Callable[
[
str,
"ConnectorCredentialPair",
str, # tenant_id
"ConnectorCredentialPair", # cc_pair
],
list["ExternalUserGroup"],
Generator["ExternalUserGroup", None, None],
]
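The signature change above (from `list["ExternalUserGroup"]` to a `Generator`) is what lets the caller batch and upsert groups as they arrive instead of holding the full result in memory. Converting a list-returning sync function is mechanical, as this toy pair shows (the functions and ids are invented for illustration):

```python
from collections.abc import Generator

def group_sync_list(tenant_id: str, cc_pair_id: int) -> list[str]:
    groups = []
    for i in range(3):
        groups.append(f"{tenant_id}:{cc_pair_id}:group{i}")
    # entire result is materialized before the caller sees anything
    return groups

def group_sync_gen(tenant_id: str, cc_pair_id: int) -> Generator[str, None, None]:
    for i in range(3):
        # caller can consume (and batch-flush) each group incrementally
        yield f"{tenant_id}:{cc_pair_id}:group{i}"

groups = list(group_sync_gen("t1", 7))
```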
# list of chunks to be censored and the user email. returns censored chunks

View File

@@ -3,7 +3,7 @@ from ee.onyx.external_permissions.sync_params import get_all_censoring_enabled_s
from ee.onyx.external_permissions.sync_params import get_source_perm_sync_config
from onyx.configs.constants import DocumentSource
from onyx.context.search.pipeline import InferenceChunk
from onyx.db.engine import get_session_context_manager
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.models import User
from onyx.utils.logger import setup_logger
@@ -22,7 +22,7 @@ def _get_all_censoring_enabled_sources() -> set[DocumentSource]:
for every single chunk.
"""
all_censoring_enabled_sources = get_all_censoring_enabled_sources()
with get_session_context_manager() as db_session:
with get_session_with_current_tenant() as db_session:
enabled_sync_connectors = get_all_auto_sync_cc_pairs(db_session)
return {
cc_pair.connector.source

View File

@@ -10,7 +10,7 @@ from ee.onyx.external_permissions.salesforce.utils import (
)
from onyx.configs.app_configs import BLURB_SIZE
from onyx.context.search.models import InferenceChunk
from onyx.db.engine import get_session_context_manager
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.utils.logger import setup_logger
logger = setup_logger()
@@ -44,7 +44,7 @@ def _get_objects_access_for_user_email_from_salesforce(
# This is cached in the function so the first query takes an extra 0.1-0.3 seconds
# but subsequent queries for this source are essentially instant
first_doc_id = chunks[0].document_id
with get_session_context_manager() as db_session:
with get_session_with_current_tenant() as db_session:
salesforce_client = get_any_salesforce_client_for_doc_id(
db_session, first_doc_id
)
@@ -217,7 +217,7 @@ def censor_salesforce_chunks(
def _get_objects_access_for_user_email(
object_ids: set[str], user_email: str
) -> dict[str, bool]:
with get_session_context_manager() as db_session:
with get_session_with_current_tenant() as db_session:
external_groups = fetch_external_groups_for_user_email_and_group_ids(
db_session=db_session,
user_email=user_email,

View File

@@ -8,12 +8,15 @@ from ee.onyx.configs.app_configs import CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY
from ee.onyx.configs.app_configs import CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY
from ee.onyx.configs.app_configs import DEFAULT_PERMISSION_DOC_SYNC_FREQUENCY
from ee.onyx.configs.app_configs import GOOGLE_DRIVE_PERMISSION_GROUP_SYNC_FREQUENCY
from ee.onyx.configs.app_configs import JIRA_PERMISSION_DOC_SYNC_FREQUENCY
from ee.onyx.configs.app_configs import SLACK_PERMISSION_DOC_SYNC_FREQUENCY
from ee.onyx.configs.app_configs import TEAMS_PERMISSION_DOC_SYNC_FREQUENCY
from ee.onyx.external_permissions.confluence.doc_sync import confluence_doc_sync
from ee.onyx.external_permissions.confluence.group_sync import confluence_group_sync
from ee.onyx.external_permissions.gmail.doc_sync import gmail_doc_sync
from ee.onyx.external_permissions.google_drive.doc_sync import gdrive_doc_sync
from ee.onyx.external_permissions.google_drive.group_sync import gdrive_group_sync
from ee.onyx.external_permissions.jira.doc_sync import jira_doc_sync
from ee.onyx.external_permissions.perm_sync_types import CensoringFuncType
from ee.onyx.external_permissions.perm_sync_types import DocSyncFuncType
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
@@ -22,6 +25,7 @@ from ee.onyx.external_permissions.salesforce.postprocessing import (
censor_salesforce_chunks,
)
from ee.onyx.external_permissions.slack.doc_sync import slack_doc_sync
from ee.onyx.external_permissions.teams.doc_sync import teams_doc_sync
from onyx.configs.constants import DocumentSource
if TYPE_CHECKING:
@@ -90,15 +94,21 @@ _SOURCE_TO_SYNC_CONFIG: dict[DocumentSource, SyncConfig] = {
group_sync_is_cc_pair_agnostic=True,
),
),
DocumentSource.JIRA: SyncConfig(
doc_sync_config=DocSyncConfig(
doc_sync_frequency=JIRA_PERMISSION_DOC_SYNC_FREQUENCY,
doc_sync_func=jira_doc_sync,
initial_index_should_sync=True,
),
),
# Groups are not needed for Slack.
# All channel access is done at the individual user level.
DocumentSource.SLACK: SyncConfig(
doc_sync_config=DocSyncConfig(
doc_sync_frequency=SLACK_PERMISSION_DOC_SYNC_FREQUENCY,
doc_sync_func=slack_doc_sync,
initial_index_should_sync=True,
),
# groups are not needed for Slack. All channel access is done at the
# individual user level
group_sync_config=None,
),
DocumentSource.GMAIL: SyncConfig(
doc_sync_config=DocSyncConfig(
@@ -119,6 +129,15 @@ _SOURCE_TO_SYNC_CONFIG: dict[DocumentSource, SyncConfig] = {
initial_index_should_sync=True,
),
),
# Groups are not needed for Teams.
# All channel access is done at the individual user level.
DocumentSource.TEAMS: SyncConfig(
doc_sync_config=DocSyncConfig(
doc_sync_frequency=TEAMS_PERMISSION_DOC_SYNC_FREQUENCY,
doc_sync_func=teams_doc_sync,
initial_index_should_sync=True,
),
),
}

View File

@@ -0,0 +1,35 @@
from collections.abc import Generator
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
from ee.onyx.external_permissions.utils import generic_doc_sync
from onyx.access.models import DocExternalAccess
from onyx.configs.constants import DocumentSource
from onyx.connectors.teams.connector import TeamsConnector
from onyx.db.models import ConnectorCredentialPair
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
logger = setup_logger()
TEAMS_DOC_SYNC_LABEL = "teams_doc_sync"
def teams_doc_sync(
cc_pair: ConnectorCredentialPair,
fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
callback: IndexingHeartbeatInterface | None,
) -> Generator[DocExternalAccess, None, None]:
teams_connector = TeamsConnector(
**cc_pair.connector.connector_specific_config,
)
teams_connector.load_credentials(cc_pair.credential.credential_json)
yield from generic_doc_sync(
cc_pair=cc_pair,
fetch_all_existing_docs_fn=fetch_all_existing_docs_fn,
callback=callback,
doc_source=DocumentSource.TEAMS,
slim_connector=teams_connector,
label=TEAMS_DOC_SYNC_LABEL,
)

View File

@@ -0,0 +1,83 @@
from collections.abc import Generator
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
from onyx.access.models import DocExternalAccess
from onyx.access.models import ExternalAccess
from onyx.configs.constants import DocumentSource
from onyx.connectors.interfaces import SlimConnector
from onyx.db.models import ConnectorCredentialPair
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
logger = setup_logger()
def generic_doc_sync(
cc_pair: ConnectorCredentialPair,
fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
callback: IndexingHeartbeatInterface | None,
doc_source: DocumentSource,
slim_connector: SlimConnector,
label: str,
) -> Generator[DocExternalAccess, None, None]:
"""
A convenience function for performing a generic document synchronization.
Notes:
A generic doc sync includes:
- fetching existing docs
- fetching *all* new (slim) docs
- yielding external-access permissions for existing docs which do not exist in the newly fetched slim-docs set (with their
`external_access` set to "private")
- yielding external-access permissions for newly fetched docs
Returns:
A `Generator` which yields existing and newly fetched external-access permissions.
"""
logger.info(f"Starting {doc_source} doc sync for CC Pair ID: {cc_pair.id}")
newly_fetched_doc_ids: set[str] = set()
logger.info(f"Fetching all slim documents from {doc_source}")
for doc_batch in slim_connector.retrieve_all_slim_documents(callback=callback):
logger.info(f"Got {len(doc_batch)} slim documents from {doc_source}")
if callback:
if callback.should_stop():
raise RuntimeError(f"{label}: Stop signal detected")
callback.progress(label, 1)
for doc in doc_batch:
if not doc.external_access:
raise RuntimeError(
f"No external access found for document ID; {cc_pair.id=} {doc_source=} {doc.id=}"
)
newly_fetched_doc_ids.add(doc.id)
yield DocExternalAccess(
doc_id=doc.id,
external_access=doc.external_access,
)
logger.info(f"Querying existing document IDs for CC Pair ID: {cc_pair.id=}")
existing_doc_ids = set(fetch_all_existing_docs_fn())
missing_doc_ids = existing_doc_ids - newly_fetched_doc_ids
if not missing_doc_ids:
return
logger.warning(
f"Found {len(missing_doc_ids)=} documents that are in the DB but not present in fetch. Making them inaccessible."
)
for missing_id in missing_doc_ids:
logger.warning(f"Removing access for {missing_id=}")
yield DocExternalAccess(
doc_id=missing_id,
external_access=ExternalAccess.empty(),
)
logger.info(f"Finished {doc_source} doc sync")

View File

@@ -19,7 +19,7 @@ from ee.onyx.db.analytics import fetch_query_analytics
from ee.onyx.db.analytics import user_can_view_assistant_stats
from onyx.auth.users import current_admin_user
from onyx.auth.users import current_user
from onyx.db.engine import get_session
from onyx.db.engine.sql_engine import get_session
from onyx.db.models import User
router = APIRouter(prefix="/analytics")

View File

@@ -17,7 +17,7 @@ from onyx.background.celery.versioned_apps.client import app as client_app
from onyx.db.connector_credential_pair import (
get_connector_credential_pair_from_id_for_user,
)
from onyx.db.engine import get_session
from onyx.db.engine.sql_engine import get_session
from onyx.db.models import User
from onyx.redis.redis_connector import RedisConnector
from onyx.redis.redis_pool import get_redis_client

View File

@@ -26,9 +26,9 @@ from onyx.auth.users import current_admin_user
from onyx.auth.users import current_user_with_expired_token
from onyx.auth.users import get_user_manager
from onyx.auth.users import UserManager
from onyx.db.engine import get_session
from onyx.db.engine.sql_engine import get_session
from onyx.db.models import User
from onyx.file_store.file_store import PostgresBackedFileStore
from onyx.file_store.file_store import get_default_file_store
from onyx.server.utils import BasicAuthenticationError
from onyx.utils.logger import setup_logger
from shared_configs.configs import MULTI_TENANT
@@ -134,19 +134,19 @@ def ee_fetch_settings() -> EnterpriseSettings:
def put_logo(
file: UploadFile,
is_logotype: bool = False,
db_session: Session = Depends(get_session),
_: User | None = Depends(current_admin_user),
) -> None:
upload_logo(file=file, db_session=db_session, is_logotype=is_logotype)
upload_logo(file=file, is_logotype=is_logotype)
def fetch_logo_helper(db_session: Session) -> Response:
try:
file_store = PostgresBackedFileStore(db_session)
file_store = get_default_file_store()
onyx_file = file_store.get_file_with_mime_type(get_logo_filename())
if not onyx_file:
raise ValueError("get_onyx_file returned None!")
except Exception:
logger.exception("Faield to fetch logo file")
raise HTTPException(
status_code=404,
detail="No logo file found",
@@ -157,7 +157,7 @@ def fetch_logo_helper(db_session: Session) -> Response:
def fetch_logotype_helper(db_session: Session) -> Response:
try:
file_store = PostgresBackedFileStore(db_session)
file_store = get_default_file_store()
onyx_file = file_store.get_file_with_mime_type(get_logotype_filename())
if not onyx_file:
raise ValueError("get_onyx_file returned None!")

View File

@@ -6,7 +6,6 @@ from typing import IO
from fastapi import HTTPException
from fastapi import UploadFile
from sqlalchemy.orm import Session
from ee.onyx.server.enterprise_settings.models import AnalyticsScriptUpload
from ee.onyx.server.enterprise_settings.models import EnterpriseSettings
@@ -99,9 +98,7 @@ def guess_file_type(filename: str) -> str:
return "application/octet-stream"
def upload_logo(
db_session: Session, file: UploadFile | str, is_logotype: bool = False
) -> bool:
def upload_logo(file: UploadFile | str, is_logotype: bool = False) -> bool:
content: IO[Any]
if isinstance(file, str):
@@ -129,13 +126,13 @@ def upload_logo(
display_name = file.filename
file_type = file.content_type or "image/jpeg"
file_store = get_default_file_store(db_session)
file_store = get_default_file_store()
file_store.save_file(
file_name=_LOGOTYPE_FILENAME if is_logotype else _LOGO_FILENAME,
content=content,
display_name=display_name,
file_origin=FileOrigin.OTHER,
file_type=file_type,
file_id=_LOGOTYPE_FILENAME if is_logotype else _LOGO_FILENAME,
)
return True

View File

@@ -13,7 +13,7 @@ from ee.onyx.db.standard_answer import remove_standard_answer
from ee.onyx.db.standard_answer import update_standard_answer
from ee.onyx.db.standard_answer import update_standard_answer_category
from onyx.auth.users import current_admin_user
from onyx.db.engine import get_session
from onyx.db.engine.sql_engine import get_session
from onyx.db.models import User
from onyx.server.manage.models import StandardAnswer
from onyx.server.manage.models import StandardAnswerCategory

View File

@@ -11,7 +11,7 @@ from ee.onyx.auth.users import decode_anonymous_user_jwt_token
from onyx.auth.api_key import extract_tenant_from_api_key_header
from onyx.configs.constants import ANONYMOUS_USER_COOKIE_NAME
from onyx.configs.constants import TENANT_ID_COOKIE_NAME
from onyx.db.engine import is_valid_schema_name
from onyx.db.engine.sql_engine import is_valid_schema_name
from onyx.redis.redis_pool import retrieve_auth_token_data_from_redis
from shared_configs.configs import MULTI_TENANT
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA

View File

@@ -12,10 +12,10 @@ from ee.onyx.server.oauth.slack import SlackOAuth
from onyx.auth.users import current_admin_user
from onyx.configs.app_configs import DEV_MODE
from onyx.configs.constants import DocumentSource
from onyx.db.engine import get_current_tenant_id
from onyx.db.models import User
from onyx.redis.redis_pool import get_redis_client
from onyx.utils.logger import setup_logger
from shared_configs.contextvars import get_current_tenant_id
logger = setup_logger()

View File

@@ -25,12 +25,12 @@ from onyx.connectors.confluence.utils import CONFLUENCE_OAUTH_TOKEN_URL
from onyx.db.credentials import create_credential
from onyx.db.credentials import fetch_credential_by_id_for_user
from onyx.db.credentials import update_credential_json
from onyx.db.engine import get_current_tenant_id
from onyx.db.engine import get_session
from onyx.db.engine.sql_engine import get_session
from onyx.db.models import User
from onyx.redis.redis_pool import get_redis_client
from onyx.server.documents.models import CredentialBase
from onyx.utils.logger import setup_logger
from shared_configs.contextvars import get_current_tenant_id
logger = setup_logger()

View File

@@ -33,11 +33,11 @@ from onyx.connectors.google_utils.shared_constants import (
GoogleOAuthAuthenticationMethod,
)
from onyx.db.credentials import create_credential
from onyx.db.engine import get_current_tenant_id
from onyx.db.engine import get_session
from onyx.db.engine.sql_engine import get_session
from onyx.db.models import User
from onyx.redis.redis_pool import get_redis_client
from onyx.server.documents.models import CredentialBase
from shared_configs.contextvars import get_current_tenant_id
class GoogleDriveOAuth:

View File

@@ -17,11 +17,11 @@ from onyx.configs.app_configs import OAUTH_SLACK_CLIENT_SECRET
from onyx.configs.app_configs import WEB_DOMAIN
from onyx.configs.constants import DocumentSource
from onyx.db.credentials import create_credential
from onyx.db.engine import get_current_tenant_id
from onyx.db.engine import get_session
from onyx.db.engine.sql_engine import get_session
from onyx.db.models import User
from onyx.redis.redis_pool import get_redis_client
from onyx.server.documents.models import CredentialBase
from shared_configs.contextvars import get_current_tenant_id
class SlackOAuth:

View File

@@ -1,5 +1,6 @@
import re
from typing import cast
from uuid import UUID
from fastapi import APIRouter
from fastapi import Depends
@@ -40,7 +41,7 @@ from onyx.context.search.models import SavedSearchDoc
from onyx.db.chat import create_chat_session
from onyx.db.chat import create_new_chat_message
from onyx.db.chat import get_or_create_root_message
from onyx.db.engine import get_session
from onyx.db.engine.sql_engine import get_session
from onyx.db.models import User
from onyx.llm.factory import get_llms_for_persona
from onyx.natural_language_processing.utils import get_tokenizer
@@ -73,6 +74,7 @@ def _get_final_context_doc_indices(
def _convert_packet_stream_to_response(
packets: ChatPacketStream,
chat_session_id: UUID,
) -> ChatBasicResponse:
response = ChatBasicResponse()
final_context_docs: list[LlmDoc] = []
@@ -216,6 +218,8 @@ def _convert_packet_stream_to_response(
if answer:
response.answer_citationless = remove_answer_citations(answer)
response.chat_session_id = chat_session_id
return response
@@ -237,13 +241,36 @@ def handle_simplified_chat_message(
if not chat_message_req.message:
raise HTTPException(status_code=400, detail="Empty chat message is invalid")
# Handle chat session creation if chat_session_id is not provided
if chat_message_req.chat_session_id is None:
if chat_message_req.persona_id is None:
raise HTTPException(
status_code=400,
detail="Either chat_session_id or persona_id must be provided",
)
# Create a new chat session with the provided persona_id
try:
new_chat_session = create_chat_session(
db_session=db_session,
description="", # Leave empty for simple API
user_id=user.id if user else None,
persona_id=chat_message_req.persona_id,
)
chat_session_id = new_chat_session.id
except Exception as e:
logger.exception(e)
raise HTTPException(status_code=400, detail="Invalid Persona provided.")
else:
chat_session_id = chat_message_req.chat_session_id
try:
parent_message, _ = create_chat_chain(
chat_session_id=chat_message_req.chat_session_id, db_session=db_session
chat_session_id=chat_session_id, db_session=db_session
)
except Exception:
parent_message = get_or_create_root_message(
chat_session_id=chat_message_req.chat_session_id, db_session=db_session
chat_session_id=chat_session_id, db_session=db_session
)
if (
@@ -258,7 +285,7 @@ def handle_simplified_chat_message(
retrieval_options = chat_message_req.retrieval_options
full_chat_msg_info = CreateChatMessageRequest(
chat_session_id=chat_message_req.chat_session_id,
chat_session_id=chat_session_id,
parent_message_id=parent_message.id,
message=chat_message_req.message,
file_descriptors=[],
@@ -283,7 +310,7 @@ def handle_simplified_chat_message(
enforce_chat_session_id_for_search_docs=False,
)
return _convert_packet_stream_to_response(packets)
return _convert_packet_stream_to_response(packets, chat_session_id)
@router.post("/send-message-simple-with-history")
@@ -403,4 +430,4 @@ def handle_send_message_simple_with_history(
enforce_chat_session_id_for_search_docs=False,
)
return _convert_packet_stream_to_response(packets)
return _convert_packet_stream_to_response(packets, chat_session.id)
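The handler change above resolves `chat_session_id` before building the chat chain: an explicitly provided session id wins, otherwise a session is created from `persona_id`. A condensed sketch, with `uuid.uuid4()` standing in for the hypothetical result of `create_chat_session(...)`:

```python
import uuid


def resolve_chat_session_id(
    chat_session_id: uuid.UUID | None,
    persona_id: int | None,
) -> uuid.UUID:
    # Mirrors the handler: explicit session id wins; otherwise a persona is required.
    if chat_session_id is not None:
        return chat_session_id
    if persona_id is None:
        raise ValueError("Either chat_session_id or persona_id must be provided")
    return uuid.uuid4()  # stands in for create_chat_session(...).id


existing = uuid.uuid4()
resolved = resolve_chat_session_id(existing, None)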

View File

@@ -41,11 +41,13 @@ class DocumentSearchRequest(ChunkContext):
class BasicCreateChatMessageRequest(ChunkContext):
"""Before creating messages, be sure to create a chat_session and get an id
"""If a chat_session_id is not provided, a persona_id must be provided to automatically create a new chat session
Note, for simplicity this option only allows for a single linear chain of messages
"""
chat_session_id: UUID
chat_session_id: UUID | None = None
# Optional persona_id to create a new chat session if chat_session_id is not provided
persona_id: int | None = None
# New message contents
message: str
# Defaults to using retrieval with no additional filters
@@ -62,6 +64,12 @@ class BasicCreateChatMessageRequest(ChunkContext):
# If True, uses agentic search instead of basic search
use_agentic_search: bool = False
@model_validator(mode="after")
def validate_chat_session_or_persona(self) -> "BasicCreateChatMessageRequest":
if self.chat_session_id is None and self.persona_id is None:
raise ValueError("Either chat_session_id or persona_id must be provided")
return self
class BasicCreateChatMessageWithHistoryRequest(ChunkContext):
# Last element is the new query. All previous elements are historical context
@@ -171,6 +179,9 @@ class ChatBasicResponse(BaseModel):
agent_sub_queries: dict[int, dict[int, list[AgentSubQuery]]] | None = None
agent_refined_answer_improvement: bool | None = None
# Chat session ID for tracking conversation continuity
chat_session_id: UUID | None = None
class OneShotQARequest(ChunkContext):
# Supports simpler APIs that don't deal with chat histories or message edits

View File

@@ -31,7 +31,7 @@ from onyx.context.search.utils import dedupe_documents
from onyx.context.search.utils import drop_llm_indices
from onyx.context.search.utils import relevant_sections_to_indices
from onyx.db.chat import get_prompt_by_id
from onyx.db.engine import get_session
from onyx.db.engine.sql_engine import get_session
from onyx.db.models import Persona
from onyx.db.models import User
from onyx.db.persona import get_persona_by_id

View File

@@ -13,7 +13,7 @@ from sqlalchemy import select
from sqlalchemy.orm import Session
from onyx.db.api_key import is_api_key_email_address
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.models import ChatMessage
from onyx.db.models import ChatSession
from onyx.db.models import TokenRateLimit

View File

@@ -37,11 +37,11 @@ from onyx.configs.constants import QueryHistoryType
from onyx.configs.constants import SessionType
from onyx.db.chat import get_chat_session_by_id
from onyx.db.chat import get_chat_sessions_by_user
from onyx.db.engine import get_session
from onyx.db.engine.sql_engine import get_session
from onyx.db.enums import TaskStatus
from onyx.db.file_record import get_query_history_export_files
from onyx.db.models import ChatSession
from onyx.db.models import User
from onyx.db.pg_file_store import get_query_history_export_files
from onyx.db.tasks import get_task_with_id
from onyx.db.tasks import register_task
from onyx.file_store.file_store import get_default_file_store
@@ -49,6 +49,7 @@ from onyx.server.documents.models import PaginatedReturn
from onyx.server.query_and_chat.models import ChatSessionDetails
from onyx.server.query_and_chat.models import ChatSessionsResponse
from onyx.utils.threadpool_concurrency import parallel_yield
from shared_configs.contextvars import get_current_tenant_id
router = APIRouter()
@@ -334,6 +335,7 @@ def start_query_history_export(
"start": start,
"end": end,
"start_time": start_time,
"tenant_id": get_current_tenant_id(),
},
)
@@ -356,11 +358,11 @@ def get_query_history_export_status(
# If task is None, then it's possible that the task has already finished processing.
# Therefore, we should then check if the export file has already been stored inside of the file-store.
# If that *also* doesn't exist, then we can return a 404.
file_store = get_default_file_store(db_session)
file_store = get_default_file_store()
report_name = construct_query_history_report_name(request_id)
has_file = file_store.has_file(
file_name=report_name,
file_id=report_name,
file_origin=FileOrigin.QUERY_HISTORY_CSV,
file_type=FileType.CSV,
)
@@ -383,9 +385,9 @@ def download_query_history_csv(
ensure_query_history_is_enabled(disallowed=[QueryHistoryType.DISABLED])
report_name = construct_query_history_report_name(request_id)
file_store = get_default_file_store(db_session)
file_store = get_default_file_store()
has_file = file_store.has_file(
file_name=report_name,
file_id=report_name,
file_origin=FileOrigin.QUERY_HISTORY_CSV,
file_type=FileType.CSV,
)

View File

@@ -12,7 +12,7 @@ from onyx.configs.constants import SessionType
from onyx.db.enums import TaskStatus
from onyx.db.models import ChatMessage
from onyx.db.models import ChatSession
from onyx.db.models import PGFileStore
from onyx.db.models import FileRecord
from onyx.db.models import TaskQueueState
@@ -254,7 +254,7 @@ class QueryHistoryExport(BaseModel):
@classmethod
def from_file(
cls,
file: PGFileStore,
file: FileRecord,
) -> "QueryHistoryExport":
if not file.file_metadata or not isinstance(file.file_metadata, dict):
raise RuntimeError(
@@ -262,7 +262,7 @@ class QueryHistoryExport(BaseModel):
)
metadata = QueryHistoryFileMetadata.model_validate(dict(file.file_metadata))
task_id = extract_task_id_from_query_history_report_name(file.file_name)
task_id = extract_task_id_from_query_history_report_name(file.file_id)
return cls(
task_id=task_id,

View File

@@ -14,7 +14,7 @@ from ee.onyx.db.usage_export import get_usage_report_data
from ee.onyx.db.usage_export import UsageReportMetadata
from ee.onyx.server.reporting.usage_export_generation import create_new_usage_report
from onyx.auth.users import current_admin_user
from onyx.db.engine import get_session
from onyx.db.engine.sql_engine import get_session
from onyx.db.models import User
from onyx.file_store.constants import STANDARD_CHUNK_SIZE
@@ -53,7 +53,7 @@ def read_usage_report(
db_session: Session = Depends(get_session),
) -> Response:
try:
file = get_usage_report_data(db_session, report_name)
file = get_usage_report_data(report_name)
except ValueError as e:
raise HTTPException(status_code=404, detail=str(e))

View File

@@ -62,17 +62,16 @@ def generate_chat_messages_report(
]
)
# after writing seek to begining of buffer
# after writing seek to beginning of buffer
temp_file.seek(0)
file_store.save_file(
file_name=file_name,
file_id = file_store.save_file(
content=temp_file,
display_name=file_name,
file_origin=FileOrigin.OTHER,
file_type="text/csv",
)
return file_name
return file_id
def generate_user_report(
@@ -97,15 +96,14 @@ def generate_user_report(
csvwriter.writerow([user_skeleton.user_id, user_skeleton.is_active])
temp_file.seek(0)
file_store.save_file(
file_name=file_name,
file_id = file_store.save_file(
content=temp_file,
display_name=file_name,
file_origin=FileOrigin.OTHER,
file_type="text/csv",
)
return file_name
return file_id
def create_new_usage_report(
@@ -114,18 +112,18 @@ def create_new_usage_report(
period: tuple[datetime, datetime] | None,
) -> UsageReportMetadata:
report_id = str(uuid.uuid4())
file_store = get_default_file_store(db_session)
file_store = get_default_file_store()
messages_filename = generate_chat_messages_report(
messages_file_id = generate_chat_messages_report(
db_session, file_store, report_id, period
)
users_filename = generate_user_report(db_session, file_store, report_id)
users_file_id = generate_user_report(db_session, file_store, report_id)
with tempfile.SpooledTemporaryFile(max_size=MAX_IN_MEMORY_SIZE) as zip_buffer:
with zipfile.ZipFile(zip_buffer, "a", zipfile.ZIP_DEFLATED) as zip_file:
# write messages
chat_messages_tmpfile = file_store.read_file(
messages_filename, mode="b", use_tempfile=True
messages_file_id, mode="b", use_tempfile=True
)
zip_file.writestr(
"chat_messages.csv",
@@ -134,7 +132,7 @@ def create_new_usage_report(
# write users
users_tmpfile = file_store.read_file(
users_filename, mode="b", use_tempfile=True
users_file_id, mode="b", use_tempfile=True
)
zip_file.writestr("users.csv", users_tmpfile.read())
@@ -146,11 +144,11 @@ def create_new_usage_report(
f"_{report_id}_usage_report.zip"
)
file_store.save_file(
file_name=report_name,
content=zip_buffer,
display_name=report_name,
file_origin=FileOrigin.GENERATED_REPORT,
file_type="application/zip",
file_id=report_name,
)
# add report after zip file is written
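The refactor above changes `save_file` from taking a caller-supplied `file_name` to returning the canonical `file_id` that callers must use for subsequent reads. A sketch of that return-the-id pattern, using a hypothetical in-memory store:

```python
import uuid


class InMemoryFileStore:
    """Hypothetical stand-in: save_file returns the id callers use to read back."""

    def __init__(self) -> None:
        self._files: dict[str, bytes] = {}

    def save_file(self, content: bytes, display_name: str) -> str:
        # The store, not the caller, decides the file's identifier.
        file_id = str(uuid.uuid4())
        self._files[file_id] = content
        return file_id

    def read_file(self, file_id: str) -> bytes:
        return self._files[file_id]


store = InMemoryFileStore()
fid = store.save_file(b"a,b\n1,2\n", display_name="report.csv")
data = store.read_file(fid)
```

Returning the id from `save_file` removes the risk of a caller constructing a name that diverges from what the store actually recorded.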

View File

@@ -27,9 +27,9 @@ from onyx.auth.users import get_user_manager
from onyx.configs.app_configs import SESSION_EXPIRE_TIME_SECONDS
from onyx.db.auth import get_user_count
from onyx.db.auth import get_user_db
from onyx.db.engine import get_async_session
from onyx.db.engine import get_async_session_context_manager
from onyx.db.engine import get_session
from onyx.db.engine.async_sql_engine import get_async_session
from onyx.db.engine.async_sql_engine import get_async_session_context_manager
from onyx.db.engine.sql_engine import get_session
from onyx.db.models import User
from onyx.utils.logger import setup_logger

View File

@@ -19,7 +19,7 @@ from ee.onyx.server.enterprise_settings.store import (
)
from ee.onyx.server.enterprise_settings.store import upload_logo
from onyx.context.search.enums import RecencyBiasSetting
from onyx.db.engine import get_session_context_manager
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.llm import update_default_provider
from onyx.db.llm import upsert_llm_provider
from onyx.db.models import Tool
@@ -200,10 +200,10 @@ def _seed_enterprise_settings(seed_config: SeedConfiguration) -> None:
store_ee_settings(final_enterprise_settings)
def _seed_logo(db_session: Session, logo_path: str | None) -> None:
def _seed_logo(logo_path: str | None) -> None:
if logo_path:
logger.notice("Uploading logo")
upload_logo(db_session=db_session, file=logo_path)
upload_logo(file=logo_path)
def _seed_analytics_script(seed_config: SeedConfiguration) -> None:
@@ -235,7 +235,7 @@ def seed_db() -> None:
logger.debug("No seeding configuration file passed")
return
with get_session_context_manager() as db_session:
with get_session_with_current_tenant() as db_session:
if seed_config.llms is not None:
_seed_llms(db_session, seed_config.llms)
if seed_config.personas is not None:
@@ -245,7 +245,7 @@ def seed_db() -> None:
if seed_config.custom_tools is not None:
_seed_custom_tools(db_session, seed_config.custom_tools)
_seed_logo(db_session, seed_config.seeded_logo_path)
_seed_logo(seed_config.seeded_logo_path)
_seed_enterprise_settings(seed_config)
_seed_analytics_script(seed_config)

View File

@@ -10,7 +10,7 @@ from ee.onyx.server.tenants.user_mapping import get_tenant_id_for_email
from onyx.auth.users import auth_backend
from onyx.auth.users import get_redis_strategy
from onyx.auth.users import User
from onyx.db.engine import get_session_with_tenant
from onyx.db.engine.sql_engine import get_session_with_tenant
from onyx.db.users import get_user_by_email
from onyx.utils.logger import setup_logger

View File

@@ -18,7 +18,7 @@ from onyx.auth.users import optional_user
from onyx.auth.users import User
from onyx.configs.constants import ANONYMOUS_USER_COOKIE_NAME
from onyx.configs.constants import FASTAPI_USERS_AUTH_COOKIE_NAME
from onyx.db.engine import get_session_with_shared_schema
from onyx.db.engine.sql_engine import get_session_with_shared_schema
from onyx.utils.logger import setup_logger
from shared_configs.contextvars import get_current_tenant_id

View File

@@ -10,10 +10,12 @@ from ee.onyx.server.tenants.billing import fetch_billing_information
from ee.onyx.server.tenants.billing import fetch_stripe_checkout_session
from ee.onyx.server.tenants.billing import fetch_tenant_stripe_information
from ee.onyx.server.tenants.models import BillingInformation
from ee.onyx.server.tenants.models import ProductGatingFullSyncRequest
from ee.onyx.server.tenants.models import ProductGatingRequest
from ee.onyx.server.tenants.models import ProductGatingResponse
from ee.onyx.server.tenants.models import SubscriptionSessionResponse
from ee.onyx.server.tenants.models import SubscriptionStatusResponse
from ee.onyx.server.tenants.product_gating import overwrite_full_gated_set
from ee.onyx.server.tenants.product_gating import store_product_gating
from onyx.auth.users import User
from onyx.configs.app_configs import WEB_DOMAIN
@@ -47,6 +49,26 @@ def gate_product(
return ProductGatingResponse(updated=False, error=str(e))
@router.post("/product-gating/full-sync")
def gate_product_full_sync(
product_gating_request: ProductGatingFullSyncRequest,
_: None = Depends(control_plane_dep),
) -> ProductGatingResponse:
"""
Bulk operation to overwrite the entire gated tenant set.
This replaces all currently gated tenants with the provided list.
Gated tenants are not available to access the product and will be
directed to the billing page when their subscription has ended.
"""
try:
overwrite_full_gated_set(product_gating_request.gated_tenant_ids)
return ProductGatingResponse(updated=True, error=None)
except Exception as e:
logger.exception("Failed to gate products during full sync")
return ProductGatingResponse(updated=False, error=str(e))
@router.get("/billing-information")
async def billing_information(
_: User = Depends(current_admin_user),

View File

@@ -19,6 +19,10 @@ class ProductGatingRequest(BaseModel):
application_status: ApplicationStatus
class ProductGatingFullSyncRequest(BaseModel):
gated_tenant_ids: list[str]
class SubscriptionStatusResponse(BaseModel):
subscribed: bool

View File

@@ -16,10 +16,6 @@ logger = setup_logger()
def update_tenant_gating(tenant_id: str, status: ApplicationStatus) -> None:
redis_client = get_redis_client(tenant_id=ONYX_CLOUD_TENANT_ID)
# Store the full status
status_key = f"tenant:{tenant_id}:status"
redis_client.set(status_key, status.value)
# Maintain the GATED_ACCESS set
if status == ApplicationStatus.GATED_ACCESS:
redis_client.sadd(GATED_TENANTS_KEY, tenant_id)
@@ -46,6 +42,25 @@ def store_product_gating(tenant_id: str, application_status: ApplicationStatus)
raise
def overwrite_full_gated_set(tenant_ids: list[str]) -> None:
redis_client = get_redis_client(tenant_id=ONYX_CLOUD_TENANT_ID)
pipeline = redis_client.pipeline()
# the pipeline doesn't automatically add the tenant_id prefix
full_gated_set_key = f"{ONYX_CLOUD_TENANT_ID}:{GATED_TENANTS_KEY}"
# Clear the existing set
pipeline.delete(full_gated_set_key)
# Add all tenant IDs to the set and set their status
for tenant_id in tenant_ids:
pipeline.sadd(full_gated_set_key, tenant_id)
# Execute all commands at once
pipeline.execute()
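The delete-then-re-add sequence above is buffered on a pipeline and flushed in a single round trip (redis-py pipelines default to `transaction=True`, so the commands are also wrapped in MULTI/EXEC, and readers never observe a half-cleared set). A minimal in-memory sketch of the same overwrite pattern; `FakePipeline` is illustrative scaffolding, not a redis-py or Onyx class:

```python
class FakePipeline:
    """Stand-in that mimics pipeline semantics: queue commands, apply on execute()."""

    def __init__(self, store: dict):
        self._store = store
        self._ops: list[tuple] = []

    def delete(self, key: str) -> None:
        self._ops.append(("delete", key))

    def sadd(self, key: str, member: str) -> None:
        self._ops.append(("sadd", key, member))

    def execute(self) -> None:
        # All queued commands are applied together, mirroring the single
        # round trip of a real Redis pipeline.
        for op in self._ops:
            if op[0] == "delete":
                self._store.pop(op[1], None)
            else:
                self._store.setdefault(op[1], set()).add(op[2])
        self._ops.clear()


def overwrite_set(store: dict, key: str, members: list[str]) -> None:
    pipe = FakePipeline(store)
    pipe.delete(key)  # clear the existing set
    for m in members:
        pipe.sadd(key, m)  # re-add the full membership
    pipe.execute()  # one flush for all commands
```

An empty `members` list leaves the key deleted, matching Redis semantics where an empty set and a missing key are indistinguishable.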
def get_gated_tenants() -> set[str]:
redis_client = get_redis_replica_client(tenant_id=ONYX_CLOUD_TENANT_ID)
gated_tenants_bytes = cast(set[bytes], redis_client.smembers(GATED_TENANTS_KEY))
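`smembers` on a non-decoding redis-py client returns a set of `bytes`, hence the `cast` above; the values need an explicit decode before they can be compared with tenant-ID strings. A minimal sketch of that step (the helper name is invented for illustration):

```python
def decode_member_set(members: set) -> set:
    # Redis returns raw bytes unless the client was created with
    # decode_responses=True; normalize to str once, at the boundary.
    return {m.decode("utf-8") for m in members}
```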


@@ -28,8 +28,8 @@ from onyx.auth.users import exceptions
from onyx.configs.app_configs import CONTROL_PLANE_API_BASE_URL
from onyx.configs.app_configs import DEV_MODE
from onyx.configs.constants import MilestoneRecordType
from onyx.db.engine import get_session_with_shared_schema
from onyx.db.engine import get_session_with_tenant
from onyx.db.engine.sql_engine import get_session_with_shared_schema
from onyx.db.engine.sql_engine import get_session_with_tenant
from onyx.db.llm import update_default_provider
from onyx.db.llm import upsert_cloud_embedding_provider
from onyx.db.llm import upsert_llm_provider


@@ -8,8 +8,8 @@ from sqlalchemy.schema import CreateSchema
from alembic import command
from alembic.config import Config
from onyx.db.engine import build_connection_string
from onyx.db.engine import get_sqlalchemy_engine
from onyx.db.engine.sql_engine import build_connection_string
from onyx.db.engine.sql_engine import get_sqlalchemy_engine
logger = logging.getLogger(__name__)
@@ -34,7 +34,7 @@ def run_alembic_migrations(schema_name: str) -> None:
# Mimic command-line options by adding 'cmd_opts' to the config
alembic_cfg.cmd_opts = SimpleNamespace() # type: ignore
alembic_cfg.cmd_opts.x = [f"schema={schema_name}"] # type: ignore
alembic_cfg.cmd_opts.x = [f"schemas={schema_name}"] # type: ignore
# Run migrations programmatically
command.upgrade(alembic_cfg, "head")


@@ -9,7 +9,7 @@ from ee.onyx.server.tenants.user_mapping import remove_users_from_tenant
from onyx.auth.users import current_admin_user
from onyx.auth.users import User
from onyx.db.auth import get_user_count
from onyx.db.engine import get_session
from onyx.db.engine.sql_engine import get_session
from onyx.db.users import delete_user_from_db
from onyx.db.users import get_user_by_email
from onyx.server.manage.models import UserByEmail


@@ -5,8 +5,8 @@ from onyx.auth.invited_users import get_invited_users
from onyx.auth.invited_users import get_pending_users
from onyx.auth.invited_users import write_invited_users
from onyx.auth.invited_users import write_pending_users
from onyx.db.engine import get_session_with_shared_schema
from onyx.db.engine import get_session_with_tenant
from onyx.db.engine.sql_engine import get_session_with_shared_schema
from onyx.db.engine.sql_engine import get_session_with_tenant
from onyx.db.models import UserTenantMapping
from onyx.server.manage.models import TenantSnapshot
from onyx.utils.logger import setup_logger


@@ -9,7 +9,7 @@ from ee.onyx.db.token_limit import fetch_user_group_token_rate_limits_for_user
from ee.onyx.db.token_limit import insert_user_group_token_rate_limit
from onyx.auth.users import current_admin_user
from onyx.auth.users import current_curator_or_admin_user
from onyx.db.engine import get_session
from onyx.db.engine.sql_engine import get_session
from onyx.db.models import User
from onyx.db.token_limit import fetch_all_user_token_rate_limits
from onyx.db.token_limit import insert_user_token_rate_limit


@@ -16,7 +16,7 @@ from ee.onyx.server.user_group.models import UserGroupCreate
from ee.onyx.server.user_group.models import UserGroupUpdate
from onyx.auth.users import current_admin_user
from onyx.auth.users import current_curator_or_admin_user
from onyx.db.engine import get_session
from onyx.db.engine.sql_engine import get_session
from onyx.db.models import User
from onyx.db.models import UserRole
from onyx.utils.logger import setup_logger


@@ -40,6 +40,30 @@ class ExternalAccess:
def num_entries(self) -> int:
return len(self.external_user_emails) + len(self.external_user_group_ids)
@classmethod
def public(cls) -> "ExternalAccess":
return cls(
external_user_emails=set(),
external_user_group_ids=set(),
is_public=True,
)
@classmethod
def empty(cls) -> "ExternalAccess":
"""
A helper that returns an empty set of external user emails and group IDs, with `is_public` set to `False`.
This effectively makes the document in question private, i.e. inaccessible to anyone else.
It is especially useful during permission syncing when a document's permissions cannot be determined
(for whatever reason); falling back to a private `ExternalAccess` is a safe default.
"""
return cls(
external_user_emails=set(),
external_user_group_ids=set(),
is_public=False,
)
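A hedged sketch of how a permission-sync caller is expected to use the two factories above; the standalone dataclass mirrors the fields shown in the diff, and `access_for` is a hypothetical helper, not Onyx code:

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class ExternalAccess:
    external_user_emails: set
    external_user_group_ids: set
    is_public: bool

    @classmethod
    def public(cls) -> "ExternalAccess":
        return cls(set(), set(), is_public=True)

    @classmethod
    def empty(cls) -> "ExternalAccess":
        # Private fallback: no explicit users, no groups, not public.
        return cls(set(), set(), is_public=False)


def access_for(doc_perms: "dict | None") -> ExternalAccess:
    # A connector that cannot determine a document's permissions should
    # fail closed: return the empty/private access rather than guessing.
    if doc_perms is None:
        return ExternalAccess.empty()
    if doc_perms.get("anyone"):
        return ExternalAccess.public()
    return ExternalAccess(
        set(doc_perms.get("emails", [])),
        set(doc_perms.get("groups", [])),
        is_public=False,
    )
```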
@dataclass(frozen=True)
class DocExternalAccess:


@@ -78,7 +78,7 @@ def should_continue(state: BasicState) -> str:
if __name__ == "__main__":
from onyx.db.engine import get_session_context_manager
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.context.search.models import SearchRequest
from onyx.llm.factory import get_default_llms
from onyx.agents.agent_search.shared_graph_utils.utils import get_test_config
@@ -87,7 +87,7 @@ if __name__ == "__main__":
compiled_graph = graph.compile()
input = BasicInput(unused=True)
primary_llm, fast_llm = get_default_llms()
with get_session_context_manager() as db_session:
with get_session_with_current_tenant() as db_session:
config, _ = get_test_config(
db_session=db_session,
primary_llm=primary_llm,


@@ -4,7 +4,7 @@ from typing import cast
from onyx.chat.models import LlmDoc
from onyx.configs.constants import DocumentSource
from onyx.context.search.models import InferenceSection
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.tools.models import SearchToolOverrideKwargs
from onyx.tools.tool_implementations.search.search_tool import (
FINAL_CONTEXT_DOCUMENTS_ID,


@@ -111,7 +111,7 @@ def answer_query_graph_builder() -> StateGraph:
if __name__ == "__main__":
from onyx.db.engine import get_session_context_manager
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.llm.factory import get_default_llms
from onyx.context.search.models import SearchRequest
@@ -121,7 +121,7 @@ if __name__ == "__main__":
search_request = SearchRequest(
query="what can you do with onyx or danswer?",
)
with get_session_context_manager() as db_session:
with get_session_with_current_tenant() as db_session:
graph_config, search_tool = get_test_config(
db_session, primary_llm, fast_llm, search_request
)


@@ -238,7 +238,7 @@ def agent_search_graph_builder() -> StateGraph:
if __name__ == "__main__":
pass
from onyx.db.engine import get_session_context_manager
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.llm.factory import get_default_llms
from onyx.context.search.models import SearchRequest
@@ -246,7 +246,7 @@ if __name__ == "__main__":
compiled_graph = graph.compile()
primary_llm, fast_llm = get_default_llms()
with get_session_context_manager() as db_session:
with get_session_with_current_tenant() as db_session:
search_request = SearchRequest(query="Who created Excel?")
graph_config = get_test_config(
db_session, primary_llm, fast_llm, search_request


@@ -109,7 +109,7 @@ def answer_refined_query_graph_builder() -> StateGraph:
if __name__ == "__main__":
from onyx.db.engine import get_session_context_manager
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.llm.factory import get_default_llms
from onyx.context.search.models import SearchRequest
@@ -119,7 +119,7 @@ if __name__ == "__main__":
search_request = SearchRequest(
query="what can you do with onyx or danswer?",
)
with get_session_context_manager() as db_session:
with get_session_with_current_tenant() as db_session:
inputs = SubQuestionAnsweringInput(
question="what can you do with onyx?",
question_id="0_0",


@@ -131,7 +131,7 @@ def expanded_retrieval_graph_builder() -> StateGraph:
if __name__ == "__main__":
from onyx.db.engine import get_session_context_manager
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.llm.factory import get_default_llms
from onyx.context.search.models import SearchRequest
@@ -142,7 +142,7 @@ if __name__ == "__main__":
query="what can you do with onyx or danswer?",
)
with get_session_context_manager() as db_session:
with get_session_with_current_tenant() as db_session:
graph_config, search_tool = get_test_config(
db_session, primary_llm, fast_llm, search_request
)


@@ -24,7 +24,7 @@ from onyx.context.search.models import InferenceSection
from onyx.context.search.models import RerankingDetails
from onyx.context.search.postprocessing.postprocessing import rerank_sections
from onyx.context.search.postprocessing.postprocessing import should_rerank
from onyx.db.engine import get_session_context_manager
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.search_settings import get_current_search_settings
from onyx.utils.timing import log_function_time
@@ -60,7 +60,7 @@ def rerank_documents(
allow_agent_reranking = graph_config.behavior.allow_agent_reranking
if rerank_settings is None:
with get_session_context_manager() as db_session:
with get_session_with_current_tenant() as db_session:
search_settings = get_current_search_settings(db_session)
if not search_settings.disable_rerank_for_streaming:
rerank_settings = RerankingDetails.from_db_model(search_settings)


@@ -21,7 +21,7 @@ from onyx.agents.agent_search.shared_graph_utils.utils import (
from onyx.configs.agent_configs import AGENT_MAX_QUERY_RETRIEVAL_RESULTS
from onyx.configs.agent_configs import AGENT_RETRIEVAL_STATS
from onyx.context.search.models import InferenceSection
from onyx.db.engine import get_session_context_manager
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.tools.models import SearchQueryInfo
from onyx.tools.models import SearchToolOverrideKwargs
from onyx.tools.tool_implementations.search.search_tool import (
@@ -67,7 +67,7 @@ def retrieve_documents(
callback_container: list[list[InferenceSection]] = []
# new db session to avoid concurrency issues
with get_session_context_manager() as db_session:
with get_session_with_current_tenant() as db_session:
for tool_response in search_tool.run(
query=query_to_retrieve,
override_kwargs=SearchToolOverrideKwargs(


@@ -19,7 +19,7 @@ from onyx.chat.models import SubQuestionPiece
from onyx.context.search.models import InferenceChunk
from onyx.context.search.models import InferenceSection
from onyx.db.document import get_kg_doc_info_for_entity_name
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.entities import get_document_id_for_entity
from onyx.db.entities import get_entity_name
from onyx.db.entity_type import get_entity_types


@@ -25,16 +25,17 @@ from onyx.agents.agent_search.shared_graph_utils.utils import (
)
from onyx.configs.kg_configs import KG_ENTITY_EXTRACTION_TIMEOUT
from onyx.configs.kg_configs import KG_RELATIONSHIP_EXTRACTION_TIMEOUT
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.kg_temp_view import create_views
from onyx.db.kg_temp_view import get_user_view_names
from onyx.db.relationships import get_allowed_relationship_type_pairs
from onyx.kg.extractions.extraction_processing import get_entity_types_str
from onyx.kg.extractions.extraction_processing import get_relationship_types_str
from onyx.kg.utils.extraction_utils import get_entity_types_str
from onyx.kg.utils.extraction_utils import get_relationship_types_str
from onyx.prompts.kg_prompts import QUERY_ENTITY_EXTRACTION_PROMPT
from onyx.prompts.kg_prompts import QUERY_RELATIONSHIP_EXTRACTION_PROMPT
from onyx.utils.logger import setup_logger
from onyx.utils.threadpool_concurrency import run_with_timeout
from shared_configs.contextvars import get_current_tenant_id
logger = setup_logger()
@@ -80,10 +81,12 @@ def extract_ert(
stream_write_step_activities(writer, _KG_STEP_NR)
# Create temporary views. TODO: move into parallel step, if ultimately materialized
kg_views = get_user_view_names(user_email)
tenant_id = get_current_tenant_id()
kg_views = get_user_view_names(user_email, tenant_id)
with get_session_with_current_tenant() as db_session:
create_views(
db_session,
tenant_id=tenant_id,
user_email=user_email,
allowed_docs_view_name=kg_views.allowed_docs_view_name,
kg_relationships_view_name=kg_views.kg_relationships_view_name,
@@ -133,15 +136,14 @@ def extract_ert(
last_bracket = cleaned_response.rfind("}")
cleaned_response = cleaned_response[first_bracket : last_bracket + 1]
try:
entity_extraction_result = (
KGQuestionEntityExtractionResult.model_validate_json(cleaned_response)
)
except ValidationError:
logger.error("Failed to parse LLM response as JSON in Entity Extraction")
entity_extraction_result = KGQuestionEntityExtractionResult(
entities=[], time_filter=""
)
entity_extraction_result = KGQuestionEntityExtractionResult.model_validate_json(
cleaned_response
)
except ValidationError:
logger.error("Failed to parse LLM response as JSON in Entity Extraction")
entity_extraction_result = KGQuestionEntityExtractionResult(
entities=[], time_filter=""
)
except Exception as e:
logger.error(f"Error in extract_ert: {e}")
entity_extraction_result = KGQuestionEntityExtractionResult(

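The bracket-slicing recovery in the hunk above (take everything from the first `{` to the last `}`, validate, and fall back to an empty result on failure) is a generic trick for LLM output that wraps JSON in prose. A dependency-free sketch, using stdlib `json` in place of pydantic's `model_validate_json`, with an invented stand-in for `KGQuestionEntityExtractionResult`:

```python
import json
from dataclasses import dataclass


@dataclass
class EntityExtractionResult:
    entities: list
    time_filter: str


def parse_llm_entities(raw: str) -> EntityExtractionResult:
    # Slice from the first "{" to the last "}" to strip any prose the
    # model wrapped around the JSON payload.
    first = raw.find("{")
    last = raw.rfind("}")
    cleaned = raw[first : last + 1]
    try:
        data = json.loads(cleaned)
        return EntityExtractionResult(
            entities=list(data.get("entities", [])),
            time_filter=str(data.get("time_filter", "")),
        )
    except (json.JSONDecodeError, AttributeError):
        # Non-fatal, as in the node above: degrade to an empty extraction.
        return EntityExtractionResult(entities=[], time_filter="")
```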

@@ -27,7 +27,7 @@ from onyx.agents.agent_search.shared_graph_utils.utils import (
get_langgraph_node_log_string,
)
from onyx.configs.kg_configs import KG_STRATEGY_GENERATION_TIMEOUT
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.entities import get_document_id_for_entity
from onyx.kg.clustering.normalizations import normalize_entities
from onyx.kg.clustering.normalizations import normalize_relationships
@@ -265,10 +265,7 @@ def analyze(
Format: {output_format.value}, Broken down question: {broken_down_question}"
extraction_detected_relationships = len(query_graph_relationships) > 0
if (
extraction_detected_relationships
or relationship_detection == KGRelationshipDetection.RELATIONSHIPS.value
):
if extraction_detected_relationships:
query_type = KGRelationshipDetection.RELATIONSHIPS.value
if extraction_detected_relationships:


@@ -29,7 +29,7 @@ from onyx.configs.kg_configs import KG_SQL_GENERATION_TIMEOUT_OVERRIDE
from onyx.configs.kg_configs import KG_TEMP_ALLOWED_DOCS_VIEW_NAME_PREFIX
from onyx.configs.kg_configs import KG_TEMP_KG_ENTITIES_VIEW_NAME_PREFIX
from onyx.configs.kg_configs import KG_TEMP_KG_RELATIONSHIPS_VIEW_NAME_PREFIX
from onyx.db.engine import get_db_readonly_user_session_with_current_tenant
from onyx.db.engine.sql_engine import get_db_readonly_user_session_with_current_tenant
from onyx.db.kg_temp_view import drop_views
from onyx.llm.interfaces import LLM
from onyx.prompts.kg_prompts import ENTITY_SOURCE_DETECTION_PROMPT
@@ -200,6 +200,11 @@ def generate_simple_sql(
if state.kg_rel_temp_view_name is None:
raise ValueError("kg_rel_temp_view_name is not set")
if state.kg_entity_temp_view_name is None:
raise ValueError("kg_entity_temp_view_name is not set")
sql_statement_display: str | None = None
## STEP 3 - articulate goals
stream_write_step_activities(writer, _KG_STEP_NR)
@@ -311,9 +316,8 @@ def generate_simple_sql(
)
sql_statement = sql_statement.split(";")[0].strip() + ";"
sql_statement = sql_statement.replace("sql", "").strip()
sql_statement = sql_statement.replace("kg_relationship", rel_temp_view)
if ent_temp_view:
sql_statement = sql_statement.replace("kg_entity", ent_temp_view)
sql_statement = sql_statement.replace("relationship_table", rel_temp_view)
sql_statement = sql_statement.replace("entity_table", ent_temp_view)
reasoning = (
cleaned_response.split("<reasoning>")[1]
@@ -379,7 +383,18 @@ def generate_simple_sql(
raise e
logger.debug(f"A3 - sql_statement after correction: {sql_statement}")
# display sql statement with view names replaced by general view names
sql_statement_display = sql_statement.replace(
state.kg_doc_temp_view_name, "<your_allowed_docs_view_name>"
)
sql_statement_display = sql_statement_display.replace(
state.kg_rel_temp_view_name, "<your_relationship_view_name>"
)
sql_statement_display = sql_statement_display.replace(
state.kg_entity_temp_view_name, "<your_entity_view_name>"
)
logger.debug(f"A3 - sql_statement after correction: {sql_statement_display}")
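The repeated `.replace` calls that swap per-user temp view names for generic placeholders before logging and streaming can be factored into one helper; this is a sketch of the pattern, not an Onyx function:

```python
def sanitize_sql_for_display(sql: str, placeholders: dict) -> str:
    # Map each tenant/user-specific temp view name to a generic
    # placeholder so streamed answers and logs don't leak identifiers.
    for real_name, placeholder in placeholders.items():
        sql = sql.replace(real_name, placeholder)
    return sql
```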
# Get SQL for source documents
@@ -399,10 +414,28 @@ def generate_simple_sql(
if source_documents_sql and ent_temp_view:
source_documents_sql = source_documents_sql.replace(
"kg_entity", ent_temp_view
"entity_table", ent_temp_view
)
logger.debug(f"A3 source_documents_sql: {source_documents_sql}")
if source_documents_sql and rel_temp_view:
source_documents_sql = source_documents_sql.replace(
"relationship_table", rel_temp_view
)
if source_documents_sql:
source_documents_sql_display = source_documents_sql.replace(
state.kg_doc_temp_view_name, "<your_allowed_docs_view_name>"
)
source_documents_sql_display = source_documents_sql_display.replace(
state.kg_rel_temp_view_name, "<your_relationship_view_name>"
)
source_documents_sql_display = source_documents_sql_display.replace(
state.kg_entity_temp_view_name, "<your_entity_view_name>"
)
else:
source_documents_sql_display = "(No source documents SQL generated)"
logger.debug(f"A3 source_documents_sql: {source_documents_sql_display}")
scalar_result = None
query_results = None
@@ -428,7 +461,13 @@ def generate_simple_sql(
rows = result.fetchall()
query_results = [dict(row._mapping) for row in rows]
except Exception as e:
# TODO: raise error on frontend
logger.error(f"Error executing SQL query: {e}")
drop_views(
allowed_docs_view_name=doc_temp_view,
kg_relationships_view_name=rel_temp_view,
kg_entity_view_name=ent_temp_view,
)
raise e
@@ -452,8 +491,14 @@ def generate_simple_sql(
for source_document_result in query_source_document_results
]
except Exception as e:
# No stopping here, the individualized SQL query is not mandatory
# TODO: raise error on frontend
drop_views(
allowed_docs_view_name=doc_temp_view,
kg_relationships_view_name=rel_temp_view,
kg_entity_view_name=ent_temp_view,
)
logger.error(f"Error executing Individualized SQL query: {e}")
else:
@@ -486,11 +531,11 @@ def generate_simple_sql(
if reasoning:
stream_write_step_answer_explicit(writer, step_nr=_KG_STEP_NR, answer=reasoning)
if main_sql_statement:
if sql_statement_display:
stream_write_step_answer_explicit(
writer,
step_nr=_KG_STEP_NR,
answer=f" \n Generated SQL: {main_sql_statement}",
answer=f" \n Generated SQL: {sql_statement_display}",
)
stream_close_step_answer(writer, _KG_STEP_NR)
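Both error paths in this file now call `drop_views` before propagating (or logging and swallowing) the failure. One way to express cleanup-on-error once is the pattern below; `run_with_error_cleanup` and its arguments are hypothetical, sketched only to show the shape:

```python
def run_with_error_cleanup(work, cleanup):
    # Run `work`; if it raises, invoke `cleanup` (e.g. dropping the temp
    # views) exactly once, then let the original exception propagate.
    try:
        return work()
    except Exception:
        cleanup()
        raise
```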


@@ -13,7 +13,7 @@ from onyx.agents.agent_search.shared_graph_utils.utils import (
get_langgraph_node_log_string,
)
from onyx.configs.kg_configs import KG_FILTER_CONSTRUCTION_TIMEOUT
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.entity_type import get_entity_types_with_grounded_source_name
from onyx.kg.utils.formatting_utils import make_entity_id
from onyx.prompts.kg_prompts import SEARCH_FILTER_CONSTRUCTION_PROMPT


@@ -16,7 +16,7 @@ from onyx.agents.agent_search.shared_graph_utils.utils import (
from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
from onyx.chat.models import SubQueryPiece
from onyx.db.document import get_base_llm_doc_information
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.utils.logger import setup_logger


@@ -28,7 +28,7 @@ from onyx.configs.kg_configs import KG_TIMEOUT_CONNECT_LLM_INITIAL_ANSWER_GENERA
from onyx.configs.kg_configs import KG_TIMEOUT_LLM_INITIAL_ANSWER_GENERATION
from onyx.context.search.enums import SearchType
from onyx.context.search.models import InferenceSection
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.prompts.kg_prompts import OUTPUT_FORMAT_NO_EXAMPLES_PROMPT
from onyx.prompts.kg_prompts import OUTPUT_FORMAT_NO_OVERALL_ANSWER_PROMPT
from onyx.tools.tool_implementations.search.search_tool import IndexFilters

Some files were not shown because too many files have changed in this diff.