fix: image gen tool causing error (#5445 )

fix(infra): remove setfit dependency from api server (#5449 )
fix: non-image gen models (#5381 )
2026-02-16 23:35:46 +00:00 · 2025-09-18 11:08:29 -07:00 · 2025-09-18 06:49:06 +00:00 · 2025-09-15 19:01:08 -07:00 · 2025-09-09 10:23:33 -07:00 · 2025-09-09 09:37:16 -07:00
1791 changed files with 180902 additions and 59843 deletions
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1 +1,3 @@
 * @onyx-dot-app/onyx-core-team
+# Helm charts Owners
+/helm/ @justin-tahara
--- a/.github/actions/custom-build-and-push/action.yml
+++ b/.github/actions/custom-build-and-push/action.yml
@@ -25,6 +25,10 @@ inputs:
  tags:
    description: 'Image tags'
    required: true
+  no-cache:
+    description: 'Do not use cache when building the image'
+    required: false
+    default: 'false'
  cache-from:
    description: 'Cache sources'
    required: false
@@ -55,6 +59,7 @@ runs:
        push: ${{ inputs.push }}
        load: ${{ inputs.load }}
        tags: ${{ inputs.tags }}
+        no-cache: ${{ inputs.no-cache }}
        cache-from: ${{ inputs.cache-from }}
        cache-to: ${{ inputs.cache-to }}

@@ -77,6 +82,7 @@ runs:
        push: ${{ inputs.push }}
        load: ${{ inputs.load }}
        tags: ${{ inputs.tags }}
+        no-cache: ${{ inputs.no-cache }}
        cache-from: ${{ inputs.cache-from }}
        cache-to: ${{ inputs.cache-to }}

@@ -99,6 +105,7 @@ runs:
        push: ${{ inputs.push }}
        load: ${{ inputs.load }}
        tags: ${{ inputs.tags }}
+        no-cache: ${{ inputs.no-cache }}
        cache-from: ${{ inputs.cache-from }}
        cache-to: ${{ inputs.cache-to }}

--- a/.github/workflows/docker-build-push-backend-container-on-tag.yml
+++ b/.github/workflows/docker-build-push-backend-container-on-tag.yml
@@ -7,18 +7,47 @@ on:

 env:
  REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'onyxdotapp/onyx-backend-cloud' || 'onyxdotapp/onyx-backend' }}
-  LATEST_TAG: ${{ contains(github.ref_name, 'latest') }}
+  DEPLOYMENT: ${{ contains(github.ref_name, 'cloud') && 'cloud' || 'standalone' }}
+  
+  # don't tag cloud images with "latest"
+  LATEST_TAG: ${{ contains(github.ref_name, 'latest') && !contains(github.ref_name, 'cloud') }}

 jobs:
  build-and-push:
    # TODO: investigate a matrix build like the web container
    # See https://runs-on.com/runners/linux/
-    runs-on: [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}"]
-
+    runs-on:
+      - runs-on
+      - runner=${{ matrix.platform == 'linux/amd64' && '8cpu-linux-x64' || '8cpu-linux-arm64' }}
+      - run-id=${{ github.run_id }}
+      - tag=platform-${{ matrix.platform }}
+    strategy:
+      fail-fast: false
+      matrix:
+        platform:
+          - linux/amd64
+          - linux/arm64
+          
    steps:
+      - name: Prepare
+        run: |
+          platform=${{ matrix.platform }}
+          echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV
+          
      - name: Checkout code
        uses: actions/checkout@v4

+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY_IMAGE }}
+          flavor: |
+            latest=false
+          tags: |
+            type=raw,value=${{ github.ref_name }}
+            type=raw,value=${{ env.LATEST_TAG == 'true' && 'latest' || '' }}
+            
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

@@ -34,18 +63,80 @@ jobs:
          sudo apt-get install -y build-essential

      - name: Backend Image Docker Build and Push
-        uses: docker/build-push-action@v5
+        id: build
+        uses: docker/build-push-action@v6
        with:
          context: ./backend
          file: ./backend/Dockerfile
-          platforms: linux/amd64,linux/arm64
+          platforms: ${{ matrix.platform }}
          push: true
-          tags: |
-            ${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
-            ${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }}
          build-args: |
            ONYX_VERSION=${{ github.ref_name }}
+          labels: ${{ steps.meta.outputs.labels }}
+          outputs: type=image,name=${{ env.REGISTRY_IMAGE }},push-by-digest=true,name-canonical=true,push=true
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/backend-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/backend-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max

+      - name: Export digest      
+        run: |
+          mkdir -p /tmp/digests
+          digest="${{ steps.build.outputs.digest }}"
+          touch "/tmp/digests/${digest#sha256:}"
+
+      - name: Upload digest
+        uses: actions/upload-artifact@v4
+        with:
+          name: backend-digests-${{ env.PLATFORM_PAIR }}-${{ github.run_id }}
+          path: /tmp/digests/*
+          if-no-files-found: error
+          retention-days: 1
+          
+  merge:
+    runs-on: ubuntu-latest
+    needs:
+      - build-and-push
+    steps:
+      # Needed for trivyignore
+      - name: Checkout
+        uses: actions/checkout@v4
+        
+      - name: Download digests
+        uses: actions/download-artifact@v4
+        with:
+          path: /tmp/digests
+          pattern: backend-digests-*-${{ github.run_id }}
+          merge-multiple: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY_IMAGE }}
+          flavor: |
+            latest=false
+          tags: |
+            type=raw,value=${{ github.ref_name }}
+            type=raw,value=${{ env.LATEST_TAG == 'true' && 'latest' || '' }}
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_TOKEN }}
+
+      - name: Create manifest list and push
+        working-directory: /tmp/digests
+        run: |
+          docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
+            $(printf '${{ env.REGISTRY_IMAGE }}@sha256:%s ' *)
+
+      - name: Inspect image
+        run: |
+          docker buildx imagetools inspect ${{ env.REGISTRY_IMAGE }}:${{ steps.meta.outputs.version }}
+          
      # trivy has their own rate limiting issues causing this action to flake
      # we worked around it by hardcoding to different db repos in env
      # can re-enable when they figure it out
@@ -56,6 +147,8 @@ jobs:
        env:
          TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2"
          TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1"
+          TRIVY_USERNAME: ${{ secrets.DOCKER_USERNAME }}
+          TRIVY_PASSWORD: ${{ secrets.DOCKER_TOKEN }}
        with:
          # To run locally: trivy image --severity HIGH,CRITICAL onyxdotapp/onyx-backend
          image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
--- a/.github/workflows/docker-build-push-cloud-web-container-on-tag.yml
+++ b/.github/workflows/docker-build-push-cloud-web-container-on-tag.yml
@@ -4,12 +4,12 @@ name: Build and Push Cloud Web Image on Tag
 on:
  push:
    tags:
-      - "*"
+      - "*cloud*"

 env:
  REGISTRY_IMAGE: onyxdotapp/onyx-web-server-cloud
-  LATEST_TAG: ${{ contains(github.ref_name, 'latest') }}
-
+  DEPLOYMENT: cloud
+  
 jobs:
  build:
    runs-on:
@@ -38,9 +38,10 @@ jobs:
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY_IMAGE }}
+          flavor: |
+            latest=false
          tags: |
-            type=raw,value=${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
-            type=raw,value=${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }}
+            type=raw,value=${{ github.ref_name }}

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
@@ -53,7 +54,7 @@ jobs:

      - name: Build and push by digest
        id: build
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
        with:
          context: ./web
          file: ./web/Dockerfile
@@ -70,10 +71,12 @@ jobs:
            NEXT_PUBLIC_FORGOT_PASSWORD_ENABLED=true
            NEXT_PUBLIC_INCLUDE_ERROR_POPUP_SUPPORT_LINK=true
            NODE_OPTIONS=--max-old-space-size=8192
-          # needed due to weird interactions with the builds for different platforms
-          no-cache: true
          labels: ${{ steps.meta.outputs.labels }}
          outputs: type=image,name=${{ env.REGISTRY_IMAGE }},push-by-digest=true,name-canonical=true,push=true
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/cloudweb-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/cloudweb-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+          # no-cache was previously required due to weird interactions between the builds for different platforms
+          # NOTE(rkuo): this may no longer be true with the proper cache prefixing by architecture - currently testing with no-cache off

      - name: Export digest
        run: |
@@ -84,7 +87,7 @@ jobs:
      - name: Upload digest
        uses: actions/upload-artifact@v4
        with:
-          name: digests-${{ env.PLATFORM_PAIR }}
+          name: cloudweb-digests-${{ env.PLATFORM_PAIR }}-${{ github.run_id }}
          path: /tmp/digests/*
          if-no-files-found: error
          retention-days: 1
@@ -98,7 +101,7 @@ jobs:
        uses: actions/download-artifact@v4
        with:
          path: /tmp/digests
-          pattern: digests-*
+          pattern: cloudweb-digests-*-${{ github.run_id }}
          merge-multiple: true

      - name: Set up Docker Buildx
@@ -109,6 +112,10 @@ jobs:
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY_IMAGE }}
+          flavor: |
+            latest=false
+          tags: |
+            type=raw,value=${{ github.ref_name }}

      - name: Login to Docker Hub
        uses: docker/login-action@v3
@@ -136,6 +143,8 @@ jobs:
        env:
          TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2"
          TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1"
+          TRIVY_USERNAME: ${{ secrets.DOCKER_USERNAME }}
+          TRIVY_PASSWORD: ${{ secrets.DOCKER_TOKEN }}
        with:
          image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
          severity: "CRITICAL,HIGH"
--- a/.github/workflows/docker-build-push-model-server-container-on-tag.yml
+++ b/.github/workflows/docker-build-push-model-server-container-on-tag.yml
@@ -7,39 +7,55 @@ on:

 env:
  REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'onyxdotapp/onyx-model-server-cloud' || 'onyxdotapp/onyx-model-server' }}
-  LATEST_TAG: ${{ contains(github.ref_name, 'latest') }}
  DOCKER_BUILDKIT: 1
  BUILDKIT_PROGRESS: plain
+  DEPLOYMENT: ${{ contains(github.ref_name, 'cloud') && 'cloud' || 'standalone' }}

+  # don't tag cloud images with "latest"
+  LATEST_TAG: ${{ contains(github.ref_name, 'latest') && !contains(github.ref_name, 'cloud') }}
+  
 jobs:
-  # 1) Preliminary job to check if the changed files are relevant
+
+#   Bypassing this check for now: skipping the build breaks releases and
+#   builds that depend on every image being tagged in Docker.
+#   1) Preliminary job to check if the changed files are relevant
+#   check_model_server_changes:
+#     runs-on: ubuntu-latest
+#     outputs:
+#       changed: ${{ steps.check.outputs.changed }}
+#     steps:
+#       - name: Checkout code
+#         uses: actions/checkout@v4
+# 
+#       - name: Check if relevant files changed
+#         id: check
+#         run: |
+#           # Default to "false"
+#           echo "changed=false" >> $GITHUB_OUTPUT
+# 
+#           # Compare the previous commit (github.event.before) to the current one (github.sha)
+#           # If any file in backend/model_server/** or backend/Dockerfile.model_server is changed,
+#           # set changed=true
+#           if git diff --name-only ${{ github.event.before }} ${{ github.sha }} \
+#              | grep -E '^backend/model_server/|^backend/Dockerfile.model_server'; then
+#             echo "changed=true" >> $GITHUB_OUTPUT
+#           fi
+
  check_model_server_changes:
    runs-on: ubuntu-latest
    outputs:
-      changed: ${{ steps.check.outputs.changed }}
+      changed: "true"
    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Check if relevant files changed
-        id: check
-        run: |
-          # Default to "false"
-          echo "changed=false" >> $GITHUB_OUTPUT
-
-          # Compare the previous commit (github.event.before) to the current one (github.sha)
-          # If any file in backend/model_server/** or backend/Dockerfile.model_server is changed,
-          # set changed=true
-          if git diff --name-only ${{ github.event.before }} ${{ github.sha }} \
-             | grep -E '^backend/model_server/|^backend/Dockerfile.model_server'; then
-            echo "changed=true" >> $GITHUB_OUTPUT
-          fi
-
+      - name: Bypass check and set output
+        run: echo "changed=true" >> $GITHUB_OUTPUT
+        
  build-amd64:
    needs: [check_model_server_changes]
    if: needs.check_model_server_changes.outputs.changed == 'true'
    runs-on:
      [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}-amd64"]
+    env:
+      PLATFORM_PAIR: linux-amd64
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
@@ -64,7 +80,7 @@ jobs:
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Build and Push AMD64
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
        with:
          context: ./backend
          file: ./backend/Dockerfile.model_server
@@ -75,12 +91,17 @@ jobs:
            DANSWER_VERSION=${{ github.ref_name }}
          outputs: type=registry
          provenance: false
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+#           no-cache: true

  build-arm64:
    needs: [check_model_server_changes]
    if: needs.check_model_server_changes.outputs.changed == 'true'
    runs-on:
-      [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}-arm64"]
+      [runs-on, runner=8cpu-linux-arm64, "run-id=${{ github.run_id }}-arm64"]
+    env:
+      PLATFORM_PAIR: linux-arm64
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
@@ -105,7 +126,7 @@ jobs:
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Build and Push ARM64
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
        with:
          context: ./backend
          file: ./backend/Dockerfile.model_server
@@ -116,6 +137,8 @@ jobs:
            DANSWER_VERSION=${{ github.ref_name }}
          outputs: type=registry
          provenance: false
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max

  merge-and-scan:
    needs: [build-amd64, build-arm64, check_model_server_changes]
@@ -145,6 +168,8 @@ jobs:
        env:
          TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2"
          TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1"
+          TRIVY_USERNAME: ${{ secrets.DOCKER_USERNAME }}
+          TRIVY_PASSWORD: ${{ secrets.DOCKER_TOKEN }}
        with:
          image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
          severity: "CRITICAL,HIGH"
--- a/.github/workflows/docker-build-push-web-container-on-tag.yml
+++ b/.github/workflows/docker-build-push-web-container-on-tag.yml
@@ -8,9 +8,25 @@ on:
 env:
  REGISTRY_IMAGE: onyxdotapp/onyx-web-server
  LATEST_TAG: ${{ contains(github.ref_name, 'latest') }}
+  DEPLOYMENT: standalone

 jobs:
+  precheck:
+    runs-on: [runs-on, runner=2cpu-linux-x64, "run-id=${{ github.run_id }}"]
+    outputs:
+      should-run: ${{ steps.set-output.outputs.should-run }}
+    steps:
+      - name: Check if tag contains "cloud"
+        id: set-output
+        env:
+          # Pass the tag name via the environment so it is never interpolated into the script text.
+          REF_NAME: ${{ github.ref_name }}
+        run: |
+          [[ "$REF_NAME" == *cloud* ]] && should_run=false || should_run=true
+          echo "should-run=${should_run}" >> "$GITHUB_OUTPUT"
  build:
+    needs: precheck
+    if: needs.precheck.outputs.should-run == 'true'
    runs-on:
      - runs-on
      - runner=${{ matrix.platform == 'linux/amd64' && '8cpu-linux-x64' || '8cpu-linux-arm64' }}
@@ -37,9 +53,11 @@ jobs:
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY_IMAGE }}
+          flavor: |
+            latest=false
          tags: |
-            type=raw,value=${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
-            type=raw,value=${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }}
+            type=raw,value=${{ github.ref_name }}
+            type=raw,value=${{ env.LATEST_TAG == 'true' && 'latest' || '' }}

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
@@ -52,7 +70,7 @@ jobs:

      - name: Build and push by digest
        id: build
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
        with:
          context: ./web
          file: ./web/Dockerfile
@@ -62,11 +80,13 @@ jobs:
            ONYX_VERSION=${{ github.ref_name }}
            NODE_OPTIONS=--max-old-space-size=8192

-          # needed due to weird interactions with the builds for different platforms
-          no-cache: true
          labels: ${{ steps.meta.outputs.labels }}
          outputs: type=image,name=${{ env.REGISTRY_IMAGE }},push-by-digest=true,name-canonical=true,push=true
-
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/web-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/web-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+          # no-cache was previously required due to weird interactions between the builds for different platforms
+          # NOTE(rkuo): this may no longer be true with the proper cache prefixing by architecture - currently testing with no-cache off
+          
      - name: Export digest
        run: |
          mkdir -p /tmp/digests
@@ -76,21 +96,22 @@ jobs:
      - name: Upload digest
        uses: actions/upload-artifact@v4
        with:
-          name: digests-${{ env.PLATFORM_PAIR }}
+          name: web-digests-${{ env.PLATFORM_PAIR }}-${{ github.run_id }}
          path: /tmp/digests/*
          if-no-files-found: error
          retention-days: 1

  merge:
-    runs-on: ubuntu-latest
    needs:
      - build
+    if: needs.build.result == 'success'  # needs.precheck is not populated here (not a direct dependency); build is already gated on precheck
+    runs-on: ubuntu-latest
    steps:
      - name: Download digests
        uses: actions/download-artifact@v4
        with:
          path: /tmp/digests
-          pattern: digests-*
+          pattern: web-digests-*-${{ github.run_id }}
          merge-multiple: true

      - name: Set up Docker Buildx
@@ -101,6 +122,11 @@ jobs:
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY_IMAGE }}
+          flavor: |
+            latest=false
+          tags: |
+            type=raw,value=${{ github.ref_name }}
+            type=raw,value=${{ env.LATEST_TAG == 'true' && 'latest' || '' }}

      - name: Login to Docker Hub
        uses: docker/login-action@v3
@@ -128,6 +154,8 @@ jobs:
        env:
          TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2"
          TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1"
+          TRIVY_USERNAME: ${{ secrets.DOCKER_USERNAME }}
+          TRIVY_PASSWORD: ${{ secrets.DOCKER_TOKEN }}
        with:
          image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
          severity: "CRITICAL,HIGH"
--- a/.github/workflows/helm-chart-releases.yml
+++ b/.github/workflows/helm-chart-releases.yml
@@ -0,0 +1,49 @@
+name: Release Onyx Helm Charts
+
+on:
+  push:
+    branches:
+      - main
+
+permissions: {}  # default to no token permissions; the release job below requests only contents: write
+
+jobs:
+  release:
+    permissions:
+      contents: write
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Install Helm CLI
+        uses: azure/setup-helm@v4
+        with:
+          version: v3.12.1
+
+      - name: Add required Helm repositories
+        run: |
+          helm repo add bitnami https://charts.bitnami.com/bitnami
+          helm repo add onyx-vespa https://onyx-dot-app.github.io/vespa-helm-charts
+          helm repo update
+
+      - name: Build chart dependencies
+        run: |
+          set -euo pipefail
+          for chart_dir in deployment/helm/charts/*; do
+            if [ -f "$chart_dir/Chart.yaml" ]; then
+              echo "Building dependencies for $chart_dir"
+              helm dependency build "$chart_dir"
+            fi
+          done
+
+      - name: Publish Helm charts to gh-pages
+        uses: stefanprodan/helm-gh-pages@v1.7.0
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          charts_dir: deployment/helm/charts
+          branch: gh-pages
+          commit_username: ${{ github.actor }}
+          commit_email: ${{ github.actor }}@users.noreply.github.com
--- a/.github/workflows/pr-external-dependency-unit-tests.yml
+++ b/.github/workflows/pr-external-dependency-unit-tests.yml
@@ -0,0 +1,98 @@
+name: External Dependency Unit Tests
+
+on:
+  merge_group:
+  pull_request:
+    branches: [main]
+
+env:
+  # AWS
+  S3_AWS_ACCESS_KEY_ID: ${{ secrets.S3_AWS_ACCESS_KEY_ID }}
+  S3_AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_AWS_SECRET_ACCESS_KEY }}
+
+  # MinIO
+  S3_ENDPOINT_URL: "http://localhost:9004"
+
+  # Confluence
+  CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
+  CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
+  CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
+  CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
+  CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
+  CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
+
+  # LLMs
+  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+
+jobs:
+  discover-test-dirs:
+    runs-on: ubuntu-latest
+    outputs:
+      test-dirs: ${{ steps.set-matrix.outputs.test-dirs }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      
+      - name: Discover test directories
+        id: set-matrix
+        run: |
+          # Find all subdirectories in backend/tests/external_dependency_unit
+          dirs=$(find backend/tests/external_dependency_unit -mindepth 1 -maxdepth 1 -type d -exec basename {} \; | sort | jq -R -s -c 'split("\n")[:-1]')
+          echo "test-dirs=$dirs" >> $GITHUB_OUTPUT
+
+  external-dependency-unit-tests:
+    needs: discover-test-dirs
+    # See https://runs-on.com/runners/linux/
+    runs-on: [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}"]
+    
+    strategy:
+      fail-fast: false
+      matrix:
+        test-dir: ${{ fromJson(needs.discover-test-dirs.outputs.test-dirs) }}
+
+    env:
+      PYTHONPATH: ./backend
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: "pip"
+          cache-dependency-path: |
+            backend/requirements/default.txt
+            backend/requirements/dev.txt
+
+      - name: Install Dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
+          pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
+          playwright install chromium
+          playwright install-deps chromium
+
+      - name: Set up Standard Dependencies
+        run: |
+          cd deployment/docker_compose
+          docker compose -f docker-compose.dev.yml -p onyx-stack up -d minio relational_db cache index
+
+      - name: Run migrations
+        run: |
+          cd backend
+          alembic upgrade head
+
+      - name: Run Tests for ${{ matrix.test-dir }}
+        shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
+        run: |
+          py.test \
+            -n 8 \
+            --dist loadfile \
+            --durations=8 \
+            -o junit_family=xunit2 \
+            -xv \
+            --ff \
+            backend/tests/external_dependency_unit/${{ matrix.test-dir }}
--- a/.github/workflows/pr-helm-chart-testing.yml
+++ b/.github/workflows/pr-helm-chart-testing.yml
@@ -37,6 +37,11 @@ jobs:
          echo "changed=true" >> "$GITHUB_OUTPUT"
        fi

+    # uncomment to force run chart-testing
+#     - name: Force run chart-testing (list-changed)
+#       id: list-changed
+#       run: echo "changed=true" >> $GITHUB_OUTPUT
+        
    # lint all charts if any changes were detected
    - name: Run chart-testing (lint)
      if: steps.list-changed.outputs.changed == 'true'
@@ -50,7 +55,25 @@ jobs:

    - name: Run chart-testing (install)
      if: steps.list-changed.outputs.changed == 'true'
-      run: ct install --all --helm-extra-set-args="--set=nginx.enabled=false" --debug --config ct.yaml
+      run: |
+        ct install --all --debug --config ct.yaml \
+          --helm-extra-set-args="\
+            --set=nginx.enabled=false \
+            --set=postgresql.enabled=false \
+            --set=redis.enabled=false \
+            --set=minio.enabled=false \
+            --set=vespa.enabled=false \
+            --set=slackbot.enabled=false \
+            --set=api.replicaCount=0 \
+            --set=inferenceCapability.replicaCount=0 \
+            --set=indexCapability.replicaCount=0 \
+            --set=celery_beat.replicaCount=0 \
+            --set=celery_worker_heavy.replicaCount=0 \
+            --set=celery_worker_docprocessing.replicaCount=0 \
+            --set=celery_worker_light.replicaCount=0 \
+            --set=celery_worker_monitoring.replicaCount=0 \
+            --set=celery_worker_primary.replicaCount=0 \
+            --set=celery_worker_user_files_indexing.replicaCount=0"
      # the following would install only changed charts, but we only have one chart so 
      # don't worry about that for now
      # run: ct install --target-branch ${{ github.event.repository.default_branch }}
--- a/.github/workflows/pr-integration-tests.yml
+++ b/.github/workflows/pr-integration-tests.yml
@@ -16,15 +16,64 @@ env:
  CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
  CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
  CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
+  JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}
+  JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
+  JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}
+  PERM_SYNC_SHAREPOINT_CLIENT_ID: ${{ secrets.PERM_SYNC_SHAREPOINT_CLIENT_ID }}
+  PERM_SYNC_SHAREPOINT_PRIVATE_KEY: ${{ secrets.PERM_SYNC_SHAREPOINT_PRIVATE_KEY }}
+  PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD: ${{ secrets.PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD }}
+  PERM_SYNC_SHAREPOINT_DIRECTORY_ID: ${{ secrets.PERM_SYNC_SHAREPOINT_DIRECTORY_ID }}
+  PLATFORM_PAIR: linux-amd64

 jobs:
  integration-tests:
    # See https://runs-on.com/runners/linux/
-    runs-on: [runs-on, runner=32cpu-linux-x64, "run-id=${{ github.run_id }}"]
+    runs-on:
+      [
+        runs-on,
+        runner=32cpu-linux-x64,
+        disk=large,
+        "run-id=${{ github.run_id }}",
+      ]
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: "pip"
+          cache-dependency-path: |
+            backend/requirements/default.txt
+            backend/requirements/dev.txt
+            backend/requirements/ee.txt
+      - run: |
+          python -m pip install --upgrade pip
+          pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
+          pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
+          pip install --retries 5 --timeout 30 -r backend/requirements/ee.txt
+
+      - name: Generate OpenAPI schema
+        working-directory: ./backend
+        env:
+          PYTHONPATH: "."
+        run: |
+          python scripts/onyx_openapi_schema.py --filename generated/openapi.json
+
+      - name: Generate OpenAPI Python client
+        working-directory: ./backend
+        run: |
+          docker run --rm \
+            -v "${{ github.workspace }}/backend/generated:/local" \
+            openapitools/openapi-generator-cli generate \
+            -i /local/openapi.json \
+            -g python \
+            -o /local/onyx_openapi_client \
+            --package-name onyx_openapi_client \
+            --skip-validate-spec \
+            --openapi-normalizer "SIMPLIFY_ONEOF_ANYOF=true,SET_OAS3_NULLABLE=true"
+
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

@@ -61,8 +110,8 @@ jobs:
          tags: onyxdotapp/onyx-backend:test
          push: false
          load: true
-          cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
-          cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max

      - name: Build Model Server Docker image
        uses: ./.github/actions/custom-build-and-push
@@ -73,8 +122,8 @@ jobs:
          tags: onyxdotapp/onyx-model-server:test
          push: false
          load: true
-          cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
-          cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max

      - name: Build integration test Docker image
        uses: ./.github/actions/custom-build-and-push
@@ -85,8 +134,8 @@ jobs:
          tags: onyxdotapp/onyx-integration:test
          push: false
          load: true
-          cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/integration/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
-          cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/integration/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/integration-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/integration-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max

      # Start containers for multi-tenant tests
      - name: Start Docker containers for multi-tenant tests
@@ -113,6 +162,8 @@ jobs:
            -e POSTGRES_HOST=relational_db \
            -e POSTGRES_USER=postgres \
            -e POSTGRES_PASSWORD=password \
+            -e DB_READONLY_USER=db_readonly_user \
+            -e DB_READONLY_PASSWORD=password \
            -e POSTGRES_DB=postgres \
            -e POSTGRES_USE_NULL_POOL=true \
            -e VESPA_HOST=index \
@@ -158,6 +209,7 @@ jobs:
          DISABLE_TELEMETRY=true \
          IMAGE_TAG=test \
          INTEGRATION_TESTS_MODE=true \
+          CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS=0.001 \
          docker compose -f docker-compose.dev.yml -p onyx-stack up -d
        id: start_docker

@@ -202,6 +254,8 @@ jobs:
            -p mock-it-services-stack up -d

      # NOTE: Use pre-ping/null to reduce flakiness due to dropped connections
+      # NOTE: `-e ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true` should be added once
+      # enterprise tests are fixed.
      - name: Run Standard Integration Tests
        run: |
          echo "Running integration tests..."
@@ -210,6 +264,8 @@ jobs:
            -e POSTGRES_HOST=relational_db \
            -e POSTGRES_USER=postgres \
            -e POSTGRES_PASSWORD=password \
+            -e DB_READONLY_USER=db_readonly_user \
+            -e DB_READONLY_PASSWORD=password \
            -e POSTGRES_DB=postgres \
            -e POSTGRES_POOL_PRE_PING=true \
            -e POSTGRES_USE_NULL_POOL=true \
@@ -221,6 +277,13 @@ jobs:
            -e CONFLUENCE_TEST_SPACE_URL=${CONFLUENCE_TEST_SPACE_URL} \
            -e CONFLUENCE_USER_NAME=${CONFLUENCE_USER_NAME} \
            -e CONFLUENCE_ACCESS_TOKEN=${CONFLUENCE_ACCESS_TOKEN} \
+            -e JIRA_BASE_URL=${JIRA_BASE_URL} \
+            -e JIRA_USER_EMAIL=${JIRA_USER_EMAIL} \
+            -e JIRA_API_TOKEN=${JIRA_API_TOKEN} \
+            -e PERM_SYNC_SHAREPOINT_CLIENT_ID=${PERM_SYNC_SHAREPOINT_CLIENT_ID} \
+            -e PERM_SYNC_SHAREPOINT_PRIVATE_KEY="${PERM_SYNC_SHAREPOINT_PRIVATE_KEY}" \
+            -e PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD=${PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD} \
+            -e PERM_SYNC_SHAREPOINT_DIRECTORY_ID=${PERM_SYNC_SHAREPOINT_DIRECTORY_ID} \
            -e TEST_WEB_HOSTNAME=test-runner \
            -e MOCK_CONNECTOR_SERVER_HOST=mock_connector_server \
            -e MOCK_CONNECTOR_SERVER_PORT=8001 \
--- a/.github/workflows/pr-labeler.yml
+++ b/.github/workflows/pr-labeler.yml
@@ -0,0 +1,42 @@
+name: PR Labeler
+
+on:
+  pull_request_target:
+    branches:
+      - main
+    types:
+      - opened
+      - reopened
+      - synchronize
+      - edited
+
+permissions:
+  contents: read
+  pull-requests: write
+
+jobs:
+  validate_pr_title:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check PR title for Conventional Commits
+        env:
+          # Exposed as an environment variable; the script reads "$PR_TITLE" instead of interpolating the expression into the shell source.
+          PR_TITLE: ${{ github.event.pull_request.title }}
+        run: |
+          echo "PR Title: $PR_TITLE"
+          if [[ ! "$PR_TITLE" =~ ^(feat|fix|docs|test|ci|refactor|perf|chore|revert|build)(\(.+\))?:\ .+ ]]; then
+            message="❌ Your PR title does not follow the Conventional Commits format.
+              This check ensures that all pull requests use clear, consistent titles that help automate changelogs and improve project history.
+
+              Please update your PR title to follow the Conventional Commits style.  
+              Here is a link to a blog explaining the reason why we've included the Conventional Commits style into our PR titles: https://xfuture-blog.com/working-with-conventional-commits
+
+              **Here are some examples of valid PR titles:**
+              - feat: add user authentication
+              - fix(login): handle null password error
+              - docs(readme): update installation instructions"
+            # Workflow commands are parsed one line at a time; encode newlines as %0A
+            # so the whole message is attached to the error annotation.
+            echo "::error::${message//$'\n'/%0A}"
+            exit 1
+          fi
--- a/.github/workflows/pr-mit-integration-tests.yml
+++ b/.github/workflows/pr-mit-integration-tests.yml
@@ -0,0 +1,264 @@
+name: Run MIT Integration Tests v2
+concurrency:
+  group: Run-MIT-Integration-Tests-${{ github.workflow }}-${{ github.head_ref || github.event.workflow_run.head_branch || github.run_id }}
+  cancel-in-progress: true
+
+on:
+  merge_group:
+  pull_request:
+    branches:
+      - main
+      - "release/**"
+
+env:
+  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+  SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+  CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
+  CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
+  CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
+  JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}
+  JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
+  JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}
+  PERM_SYNC_SHAREPOINT_CLIENT_ID: ${{ secrets.PERM_SYNC_SHAREPOINT_CLIENT_ID }}
+  PERM_SYNC_SHAREPOINT_PRIVATE_KEY: ${{ secrets.PERM_SYNC_SHAREPOINT_PRIVATE_KEY }}
+  PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD: ${{ secrets.PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD }}
+  PERM_SYNC_SHAREPOINT_DIRECTORY_ID: ${{ secrets.PERM_SYNC_SHAREPOINT_DIRECTORY_ID }}
+  PLATFORM_PAIR: linux-amd64
+jobs:
+  integration-tests-mit:
+    # See https://runs-on.com/runners/linux/
+    runs-on:
+      [
+        runs-on,
+        runner=32cpu-linux-x64,
+        disk=large,
+        "run-id=${{ github.run_id }}",
+      ]
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: "pip"
+          cache-dependency-path: |
+            backend/requirements/default.txt
+            backend/requirements/dev.txt
+      - run: |
+          python -m pip install --upgrade pip
+          pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
+          pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
+
+      - name: Generate OpenAPI schema
+        working-directory: ./backend
+        env:
+          PYTHONPATH: "."
+        run: |
+          python scripts/onyx_openapi_schema.py --filename generated/openapi.json
+
+      - name: Generate OpenAPI Python client
+        working-directory: ./backend
+        run: |
+          docker run --rm \
+            -v "${{ github.workspace }}/backend/generated:/local" \
+            openapitools/openapi-generator-cli generate \
+            -i /local/openapi.json \
+            -g python \
+            -o /local/onyx_openapi_client \
+            --package-name onyx_openapi_client \
+            --skip-validate-spec \
+            --openapi-normalizer "SIMPLIFY_ONEOF_ANYOF=true,SET_OAS3_NULLABLE=true"
+            
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_TOKEN }}
+
+      # tag every docker image with "test" so that we can spin up the correct set
+      # of images during testing
+
+      # We don't need to build the Web Docker image since it's not yet used
+      # in the integration tests. We have a separate action to verify that it builds
+      # successfully.
+      - name: Pull Web Docker image
+        run: |
+          docker pull onyxdotapp/onyx-web-server:latest
+          docker tag onyxdotapp/onyx-web-server:latest onyxdotapp/onyx-web-server:test
+
+      # we use the runs-on cache for docker builds
+      # in conjunction with runs-on runners, it has better speed and unlimited caching
+      # https://runs-on.com/caching/s3-cache-for-github-actions/
+      # https://runs-on.com/caching/docker/
+      # https://github.com/moby/buildkit#s3-cache-experimental
+
+      # images are built and run locally for testing purposes. Not pushed.
+      - name: Build Backend Docker image
+        uses: ./.github/actions/custom-build-and-push
+        with:
+          context: ./backend
+          file: ./backend/Dockerfile
+          platforms: linux/amd64
+          tags: onyxdotapp/onyx-backend:test
+          push: false
+          load: true
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/mit-integration-tests/backend-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/mit-integration-tests/backend-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+
+      - name: Build Model Server Docker image
+        uses: ./.github/actions/custom-build-and-push
+        with:
+          context: ./backend
+          file: ./backend/Dockerfile.model_server
+          platforms: linux/amd64
+          tags: onyxdotapp/onyx-model-server:test
+          push: false
+          load: true
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/mit-integration-tests/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/mit-integration-tests/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+
+      - name: Build integration test Docker image
+        uses: ./.github/actions/custom-build-and-push
+        with:
+          context: ./backend
+          file: ./backend/tests/integration/Dockerfile
+          platforms: linux/amd64
+          tags: onyxdotapp/onyx-integration:test
+          push: false
+          load: true
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/mit-integration-tests/integration-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/mit-integration-tests/integration-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+
+      # NOTE: Use pre-ping/null pool to reduce flakiness due to dropped connections
+      - name: Start Docker containers
+        run: |
+          cd deployment/docker_compose
+          AUTH_TYPE=basic \
+          POSTGRES_POOL_PRE_PING=true \
+          POSTGRES_USE_NULL_POOL=true \
+          REQUIRE_EMAIL_VERIFICATION=false \
+          DISABLE_TELEMETRY=true \
+          IMAGE_TAG=test \
+          INTEGRATION_TESTS_MODE=true \
+          docker compose -f docker-compose.dev.yml -p onyx-stack up -d
+        id: start_docker
+
+      - name: Wait for service to be ready
+        run: |
+          echo "Starting wait-for-service script..."
+          # Follow the API server logs in the background while polling.
+          docker logs -f onyx-stack-api_server-1 &
+
+          start_time=$(date +%s)
+          timeout=300  # 5 minutes in seconds
+          # Poll /health every 5 seconds until it returns 200 or the timeout elapses.
+          while true; do
+            current_time=$(date +%s)
+            elapsed_time=$((current_time - start_time))
+
+            if [ $elapsed_time -ge $timeout ]; then
+              echo "Timeout reached. Service did not become ready in 5 minutes."
+              exit 1
+            fi
+            # curl prints "000" via -w even when it fails, so the sentinel is assigned
+            # outside the substitution; otherwise the value would be "000curl_error".
+            response=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8080/health) || response="curl_error"
+
+            if [ "$response" = "200" ]; then
+              echo "Service is ready!"
+              break
+            elif [ "$response" = "curl_error" ]; then
+              echo "Curl encountered an error, possibly exit code 56. Continuing to retry..."
+            else
+              echo "Service not ready yet (HTTP status $response). Retrying in 5 seconds..."
+            fi
+
+            sleep 5
+          done
+          echo "Finished waiting for service."
+
+      - name: Start Mock Services
+        run: |
+          cd backend/tests/integration/mock_services
+          docker compose -f docker-compose.mock-it-services.yml \
+            -p mock-it-services-stack up -d
+
+      # NOTE: Use pre-ping/null pool to reduce flakiness due to dropped connections; secret values are quoted so whitespace cannot split docker arguments
+      - name: Run Standard Integration Tests
+        run: |
+          echo "Running integration tests..."
+          docker run --rm --network onyx-stack_default \
+            --name test-runner \
+            -e POSTGRES_HOST=relational_db \
+            -e POSTGRES_USER=postgres \
+            -e POSTGRES_PASSWORD=password \
+            -e POSTGRES_DB=postgres \
+            -e DB_READONLY_USER=db_readonly_user \
+            -e DB_READONLY_PASSWORD=password \
+            -e POSTGRES_POOL_PRE_PING=true \
+            -e POSTGRES_USE_NULL_POOL=true \
+            -e VESPA_HOST=index \
+            -e REDIS_HOST=cache \
+            -e API_SERVER_HOST=api_server \
+            -e OPENAI_API_KEY="${OPENAI_API_KEY}" \
+            -e SLACK_BOT_TOKEN="${SLACK_BOT_TOKEN}" \
+            -e CONFLUENCE_TEST_SPACE_URL="${CONFLUENCE_TEST_SPACE_URL}" \
+            -e CONFLUENCE_USER_NAME="${CONFLUENCE_USER_NAME}" \
+            -e CONFLUENCE_ACCESS_TOKEN="${CONFLUENCE_ACCESS_TOKEN}" \
+            -e JIRA_BASE_URL="${JIRA_BASE_URL}" \
+            -e JIRA_USER_EMAIL="${JIRA_USER_EMAIL}" \
+            -e JIRA_API_TOKEN="${JIRA_API_TOKEN}" \
+            -e PERM_SYNC_SHAREPOINT_CLIENT_ID="${PERM_SYNC_SHAREPOINT_CLIENT_ID}" \
+            -e PERM_SYNC_SHAREPOINT_PRIVATE_KEY="${PERM_SYNC_SHAREPOINT_PRIVATE_KEY}" \
+            -e PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD="${PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD}" \
+            -e PERM_SYNC_SHAREPOINT_DIRECTORY_ID="${PERM_SYNC_SHAREPOINT_DIRECTORY_ID}" \
+            -e TEST_WEB_HOSTNAME=test-runner \
+            -e MOCK_CONNECTOR_SERVER_HOST=mock_connector_server \
+            -e MOCK_CONNECTOR_SERVER_PORT=8001 \
+            onyxdotapp/onyx-integration:test \
+            /app/tests/integration/tests \
+            /app/tests/integration/connector_job_tests
+        continue-on-error: true
+        id: run_tests
+
+      - name: Check test results
+        run: |
+          if [ ${{ steps.run_tests.outcome }} == 'failure' ]; then
+            echo "Integration tests failed. Exiting with error."
+            exit 1
+          else
+            echo "All integration tests passed successfully."
+          fi
+
+      # ------------------------------------------------------------
+      # Always gather logs BEFORE "down":
+      - name: Dump API server logs
+        if: always()
+        run: |
+          cd deployment/docker_compose
+          docker compose -f docker-compose.dev.yml -p onyx-stack logs --no-color api_server > $GITHUB_WORKSPACE/api_server.log || true
+
+      - name: Dump all-container logs (optional)
+        if: always()
+        run: |
+          cd deployment/docker_compose
+          docker compose -f docker-compose.dev.yml -p onyx-stack logs --no-color > $GITHUB_WORKSPACE/docker-compose.log || true
+
+      - name: Upload logs
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: docker-all-logs
+          path: ${{ github.workspace }}/docker-compose.log
+      # ------------------------------------------------------------
+
+      - name: Stop Docker containers
+        if: always()
+        run: |
+          cd deployment/docker_compose
+          docker compose -f docker-compose.dev.yml -p onyx-stack down -v
--- a/.github/workflows/pr-playwright-tests.yml
+++ b/.github/workflows/pr-playwright-tests.yml
@@ -159,12 +159,6 @@ jobs:
          done
          echo "Finished waiting for service."

-      - name: Run pytest playwright test init
-        working-directory: ./backend
-        env:
-          PYTEST_IGNORE_SKIP: true
-        run: pytest -s tests/integration/tests/playwright/test_playwright.py
-
      - name: Run Playwright tests
        working-directory: ./web
        run: npx playwright test
--- a/.github/workflows/pr-python-checks.yml
+++ b/.github/workflows/pr-python-checks.yml
@@ -31,20 +31,35 @@ jobs:
        pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
        pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt

+    - name: Generate OpenAPI schema
+      working-directory: ./backend
+      env:
+        PYTHONPATH: "."
+      run: |
+        python scripts/onyx_openapi_schema.py --filename generated/openapi.json
+
+    - name: Generate OpenAPI Python client
+      working-directory: ./backend
+      run: |
+        docker run --rm \
+          -v "${{ github.workspace }}/backend/generated:/local" \
+          openapitools/openapi-generator-cli generate \
+          -i /local/openapi.json \
+          -g python \
+          -o /local/onyx_openapi_client \
+          --package-name onyx_openapi_client \
+          --skip-validate-spec \
+          --openapi-normalizer "SIMPLIFY_ONEOF_ANYOF=true,SET_OAS3_NULLABLE=true"
+            
    - name: Run MyPy
      run: |
        cd backend
        mypy .

-    - name: Run ruff
-      run: |
-        cd backend
-        ruff .
-
    - name: Check import order with reorder-python-imports
      run: |
        cd backend
-        find ./danswer -name "*.py" | xargs reorder-python-imports --py311-plus
+        find ./onyx -name "*.py" | xargs reorder-python-imports --py311-plus

    - name: Check code formatting with Black
      run: |
--- a/.github/workflows/pr-python-connector-tests.yml
+++ b/.github/workflows/pr-python-connector-tests.yml
@@ -1,6 +1,7 @@
 name: Connector Tests

 on:
+  merge_group:
  pull_request:
    branches: [main]
  schedule:
@@ -8,50 +9,97 @@ on:
    - cron: "0 16 * * *"

 env:
+  # AWS
+  AWS_ACCESS_KEY_ID_DAILY_CONNECTOR_TESTS: ${{ secrets.AWS_ACCESS_KEY_ID_DAILY_CONNECTOR_TESTS }}
+  AWS_SECRET_ACCESS_KEY_DAILY_CONNECTOR_TESTS: ${{ secrets.AWS_SECRET_ACCESS_KEY_DAILY_CONNECTOR_TESTS }}
+
  # Confluence
  CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
  CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
-  CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
  CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
+  CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
  CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
  CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
+
  # Jira
+  JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}
  JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
  JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}
+
+  # Gong
+  GONG_ACCESS_KEY: ${{ secrets.GONG_ACCESS_KEY }}
+  GONG_ACCESS_KEY_SECRET: ${{ secrets.GONG_ACCESS_KEY_SECRET }}
+
  # Google
  GOOGLE_DRIVE_SERVICE_ACCOUNT_JSON_STR: ${{ secrets.GOOGLE_DRIVE_SERVICE_ACCOUNT_JSON_STR }}
  GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR_TEST_USER_1: ${{ secrets.GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR_TEST_USER_1 }}
  GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR }}
  GOOGLE_GMAIL_SERVICE_ACCOUNT_JSON_STR: ${{ secrets.GOOGLE_GMAIL_SERVICE_ACCOUNT_JSON_STR }}
  GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR }}
+
  # Slab
  SLAB_BOT_TOKEN: ${{ secrets.SLAB_BOT_TOKEN }}
+
  # Zendesk
  ZENDESK_SUBDOMAIN: ${{ secrets.ZENDESK_SUBDOMAIN }}
  ZENDESK_EMAIL: ${{ secrets.ZENDESK_EMAIL }}
  ZENDESK_TOKEN: ${{ secrets.ZENDESK_TOKEN }}
+
  # Salesforce
  SF_USERNAME: ${{ secrets.SF_USERNAME }}
  SF_PASSWORD: ${{ secrets.SF_PASSWORD }}
  SF_SECURITY_TOKEN: ${{ secrets.SF_SECURITY_TOKEN }}
+
+  # Hubspot
+  HUBSPOT_ACCESS_TOKEN: ${{ secrets.HUBSPOT_ACCESS_TOKEN }}
+
+  # IMAP
+  IMAP_HOST: ${{ secrets.IMAP_HOST }}
+  IMAP_USERNAME: ${{ secrets.IMAP_USERNAME }}
+  IMAP_PASSWORD: ${{ secrets.IMAP_PASSWORD }}
+  IMAP_MAILBOXES: ${{ secrets.IMAP_MAILBOXES }}
+
  # Airtable
  AIRTABLE_TEST_BASE_ID: ${{ secrets.AIRTABLE_TEST_BASE_ID }}
  AIRTABLE_TEST_TABLE_ID: ${{ secrets.AIRTABLE_TEST_TABLE_ID }}
  AIRTABLE_TEST_TABLE_NAME: ${{ secrets.AIRTABLE_TEST_TABLE_NAME }}
  AIRTABLE_ACCESS_TOKEN: ${{ secrets.AIRTABLE_ACCESS_TOKEN }}
+
  # Sharepoint
  SHAREPOINT_CLIENT_ID: ${{ secrets.SHAREPOINT_CLIENT_ID }}
  SHAREPOINT_CLIENT_SECRET: ${{ secrets.SHAREPOINT_CLIENT_SECRET }}
  SHAREPOINT_CLIENT_DIRECTORY_ID: ${{ secrets.SHAREPOINT_CLIENT_DIRECTORY_ID }}
  SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }}
+
+  # Github
+  ACCESS_TOKEN_GITHUB: ${{ secrets.ACCESS_TOKEN_GITHUB }}
+
+  # Gitlab
+  GITLAB_ACCESS_TOKEN: ${{ secrets.GITLAB_ACCESS_TOKEN }}
+
  # Gitbook
  GITBOOK_SPACE_ID: ${{ secrets.GITBOOK_SPACE_ID }}
  GITBOOK_API_KEY: ${{ secrets.GITBOOK_API_KEY }}

+  # Notion
+  NOTION_INTEGRATION_TOKEN: ${{ secrets.NOTION_INTEGRATION_TOKEN }}
+
+  # Highspot
+  HIGHSPOT_KEY: ${{ secrets.HIGHSPOT_KEY }}
+  HIGHSPOT_SECRET: ${{ secrets.HIGHSPOT_SECRET }}
+
+  # Slack
+  SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
+  # Teams
+  TEAMS_APPLICATION_ID: ${{ secrets.TEAMS_APPLICATION_ID }}
+  TEAMS_DIRECTORY_ID: ${{ secrets.TEAMS_DIRECTORY_ID }}
+  TEAMS_SECRET: ${{ secrets.TEAMS_SECRET }}
+
 jobs:
  connectors-check:
    # See https://runs-on.com/runners/linux/
-    runs-on: [runs-on,runner=8cpu-linux-x64,"run-id=${{ github.run_id }}"]
+    runs-on: [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}"]

    env:
      PYTHONPATH: ./backend
@@ -76,10 +124,18 @@ jobs:
          pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
          playwright install chromium
          playwright install-deps chromium
-          
+
      - name: Run Tests
        shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
-        run: py.test -o junit_family=xunit2 -xv --ff backend/tests/daily/connectors
+        run: |
+          py.test \
+            -n 8 \
+            --dist loadfile \
+            --durations=8 \
+            -o junit_family=xunit2 \
+            -xv \
+            --ff \
+            backend/tests/daily/connectors

      - name: Alert on Failure
        if: failure() && github.event_name == 'schedule'
--- a/.github/workflows/pr-python-tests.yml
+++ b/.github/workflows/pr-python-tests.yml
@@ -15,6 +15,9 @@ jobs:
    env:
      PYTHONPATH: ./backend
      REDIS_CLOUD_PYTEST_PASSWORD: ${{ secrets.REDIS_CLOUD_PYTEST_PASSWORD }}
+      SF_USERNAME: ${{ secrets.SF_USERNAME }}
+      SF_PASSWORD: ${{ secrets.SF_PASSWORD }}
+      SF_SECURITY_TOKEN: ${{ secrets.SF_SECURITY_TOKEN }}
      
    steps:
    - name: Checkout code
@@ -28,12 +31,14 @@ jobs:
        cache-dependency-path: |
          backend/requirements/default.txt
          backend/requirements/dev.txt
+          backend/requirements/model_server.txt

    - name: Install Dependencies
      run: |
        python -m pip install --upgrade pip
        pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
        pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
+        pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt

    - name: Run Tests
      shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
--- a/.gitignore
+++ b/.gitignore
@@ -1,12 +1,40 @@
-.env
+# editors
+.vscode
+.zed
+
+# macos
 .DS_store
+
+# python
 .venv
 .mypy_cache
 .idea
-/deployment/data/nginx/app.conf
-.vscode/
-*.sw?
-/backend/tests/regression/answer_quality/search_test_config.yaml
+
+# testing
 /web/test-results/
 backend/onyx/agent_search/main/test_data.json
 backend/tests/regression/answer_quality/test_data.json
+backend/tests/regression/search_quality/eval-*
+backend/tests/regression/search_quality/search_eval_config.yaml
+backend/tests/regression/search_quality/*.json
+*.log
+
+# secret files
+.env
+jira_test_env
+settings.json
+
+# others
+/deployment/data/nginx/app.conf
+*.sw?
+/backend/tests/regression/answer_quality/search_test_config.yaml
+
+# Local .terraform directories
+**/.terraform/*
+
+# Local .tfstate files
+*.tfstate
+*.tfstate.*
+
+# Local .terraform.lock.hcl file
+.terraform.lock.hcl
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,12 +1,13 @@
 repos:
  - repo: https://github.com/psf/black
-    rev: 23.3.0
+    rev: 25.1.0
    hooks:
    - id: black
      language_version: python3.11

-  - repo: https://github.com/asottile/reorder_python_imports
-    rev: v3.9.0
+  # this is a fork which keeps compatibility with black
+  - repo: https://github.com/wimglenn/reorder-python-imports-black
+    rev: v3.14.0
    hooks:
    - id: reorder-python-imports
      args: ['--py311-plus', '--application-directories=backend/']
@@ -18,14 +19,14 @@ repos:
  # These settings will remove unused imports with side effects
  # Note: The repo currently does not and should not have imports with side effects
  - repo: https://github.com/PyCQA/autoflake
-    rev: v2.2.0
+    rev: v2.3.1
    hooks:
      - id: autoflake
        args: [ '--remove-all-unused-imports', '--remove-unused-variables', '--in-place' , '--recursive']

  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
-    rev: v0.0.286
+    rev: v0.11.4
    hooks:
      - id: ruff
  - repo: https://github.com/pre-commit/mirrors-prettier
--- a/.vscode/env_template.txt
+++ b/.vscode/env_template.txt
@@ -23,6 +23,9 @@ DISABLE_LLM_DOC_RELEVANCE=False
 # Useful if you want to toggle auth on/off (google_oauth/OIDC specifically)
 OAUTH_CLIENT_ID=<REPLACE THIS>
 OAUTH_CLIENT_SECRET=<REPLACE THIS>
+OPENID_CONFIG_URL=<REPLACE THIS>
+SAML_CONF_DIR=/<ABSOLUTE PATH TO ONYX>/onyx/backend/ee/onyx/configs/saml_config
+
 # Generally not useful for dev, we don't generally want to set up an SMTP server for dev
 REQUIRE_EMAIL_VERIFICATION=False

@@ -45,8 +48,8 @@ PYTHONPATH=../backend
 PYTHONUNBUFFERED=1


-# Internet Search 
-BING_API_KEY=<REPLACE THIS>
+# Internet Search
+EXA_API_KEY=<REPLACE THIS>


 # Enable the full set of Danswer Enterprise Edition features
@@ -58,3 +61,18 @@ AGENT_RETRIEVAL_STATS=False   # Note: This setting will incur substantial re-ran
 AGENT_RERANKING_STATS=True
 AGENT_MAX_QUERY_RETRIEVAL_RESULTS=20
 AGENT_RERANKING_MAX_QUERY_RETRIEVAL_RESULTS=20
+
+# S3 File Store Configuration (MinIO for local development)
+S3_ENDPOINT_URL=http://localhost:9004
+S3_FILE_STORE_BUCKET_NAME=onyx-file-store-bucket
+S3_AWS_ACCESS_KEY_ID=minioadmin
+S3_AWS_SECRET_ACCESS_KEY=minioadmin
+
+# Show extra/uncommon connectors
+SHOW_EXTRA_CONNECTORS=True
+
+# Local langsmith tracing
+LANGSMITH_TRACING="true"
+LANGSMITH_ENDPOINT="https://api.smith.langchain.com"
+LANGSMITH_API_KEY=<REPLACE_THIS>
+LANGSMITH_PROJECT=<REPLACE_THIS>
--- a/.vscode/launch.template.jsonc
+++ b/.vscode/launch.template.jsonc
@@ -6,396 +6,464 @@
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "compounds": [
-		{
-			// Dummy entry used to label the group
-			"name": "--- Compound ---",
-		  	"configurations": [
-		  		"--- Individual ---"
-		  	],
-			"presentation": {
-				 "group": "1",
-			 }
-		},
-        {
-            "name": "Run All Onyx Services",
-            "configurations": [
-                "Web Server",
-                "Model Server",
-                "API Server",
-                "Slack Bot",
-		  		"Celery primary", 
-		  		"Celery light", 
-		  		"Celery heavy", 
-		  		"Celery indexing", 
-		  		"Celery beat",
-                "Celery monitoring",
-            ],
-			"presentation": {
-				 "group": "1",
-			 }
+      {
+        // Dummy entry used to label the group
+        "name": "--- Compound ---",
+        "configurations": ["--- Individual ---"],
+        "presentation": {
+          "group": "1"
+        }
+      },
+      {
+        "name": "Run All Onyx Services",
+        "configurations": [
+          "Web Server",
+          "Model Server",
+          "API Server",
+          "Slack Bot",
+          "Celery primary",
+          "Celery light",
+          "Celery heavy",
+          "Celery docfetching",
+          "Celery docprocessing",
+          "Celery beat",
+          "Celery monitoring"
+        ],
+        "presentation": {
+          "group": "1"
        },
-		{
-			"name": "Web / Model / API",
-		  	"configurations": [
-		  		"Web Server", 
-		  		"Model Server", 
-		  		"API Server",
-		  	],
-			"presentation": {
-				 "group": "1",
-			 }
-		},
-		{
-			"name": "Celery (all)",
-		  	"configurations": [
-		  		"Celery primary", 
-		  		"Celery light", 
-		  		"Celery heavy", 
-		  		"Celery indexing", 
-		  		"Celery beat",
-                "Celery monitoring",
-		  	],
-			"presentation": {
-				 "group": "1",
-			 }
-		}
+        "stopAll": true
+      },
+      {
+        "name": "Web / Model / API",
+        "configurations": ["Web Server", "Model Server", "API Server"],
+        "presentation": {
+          "group": "1"
+        },
+        "stopAll": true
+      },
+      {
+        "name": "Celery (all)",
+        "configurations": [
+          "Celery primary",
+          "Celery light",
+          "Celery heavy",
+          "Celery docfetching",
+          "Celery docprocessing",
+          "Celery beat",
+          "Celery monitoring"
+        ],
+        "presentation": {
+          "group": "1"
+        },
+        "stopAll": true
+      }
    ],
    "configurations": [
-		{
-	    	// Dummy entry used to label the group
-			"name": "--- Individual ---",
-			"type": "node",
-			"request": "launch",
-			"presentation": {
-				"group": "2",
-				"order": 0
-			}
-		},
-        {
-            "name": "Web Server",
-            "type": "node",
-            "request": "launch",
-            "cwd": "${workspaceRoot}/web",
-            "runtimeExecutable": "npm",
-            "envFile": "${workspaceFolder}/.vscode/.env",
-            "runtimeArgs": [
-                "run", "dev"
-            ],
-            "presentation": {
-				 "group": "2",
-			 },
-            "console": "integratedTerminal",
-            "consoleTitle": "Web Server Console"
+      {
+        // Dummy entry used to label the group
+        "name": "--- Individual ---",
+        "type": "node",
+        "request": "launch",
+        "presentation": {
+          "group": "2",
+          "order": 0
+        }
+      },
+      {
+        "name": "Web Server",
+        "type": "node",
+        "request": "launch",
+        "cwd": "${workspaceRoot}/web",
+        "runtimeExecutable": "npm",
+        "envFile": "${workspaceFolder}/.vscode/.env",
+        "runtimeArgs": ["run", "dev"],
+        "presentation": {
+          "group": "2"
        },
-        {
-            "name": "Model Server",
-            "consoleName": "Model Server",
-            "type": "debugpy",
-            "request": "launch",
-            "module": "uvicorn",
-            "cwd": "${workspaceFolder}/backend",
-            "envFile": "${workspaceFolder}/.vscode/.env",
-            "env": {
-                "LOG_LEVEL": "DEBUG",
-                "PYTHONUNBUFFERED": "1"
-            },
-            "args": [
-                "model_server.main:app",
-                "--reload",
-                "--port",
-                "9000"
-            ],
-            "presentation": {
-				 "group": "2",
-			},
-            "consoleTitle": "Model Server Console"
+        "console": "integratedTerminal",
+        "consoleTitle": "Web Server Console"
+      },
+      {
+        "name": "Model Server",
+        "consoleName": "Model Server",
+        "type": "debugpy",
+        "request": "launch",
+        "module": "uvicorn",
+        "cwd": "${workspaceFolder}/backend",
+        "envFile": "${workspaceFolder}/.vscode/.env",
+        "env": {
+          "LOG_LEVEL": "DEBUG",
+          "PYTHONUNBUFFERED": "1"
        },
-        {
-            "name": "API Server",
-            "consoleName": "API Server",
-            "type": "debugpy",
-            "request": "launch",
-            "module": "uvicorn",
-            "cwd": "${workspaceFolder}/backend",
-            "envFile": "${workspaceFolder}/.vscode/.env",
-            "env": {
-                "LOG_DANSWER_MODEL_INTERACTIONS": "True",
-                "LOG_LEVEL": "DEBUG",
-                "PYTHONUNBUFFERED": "1"
-            },
-            "args": [
-                "onyx.main:app",
-                "--reload",
-                "--port",
-                "8080"
-            ],
-            "presentation": {
-				 "group": "2",
-			 },
-            "consoleTitle": "API Server Console"
+        "args": ["model_server.main:app", "--reload", "--port", "9000"],
+        "presentation": {
+          "group": "2"
        },
-        // For the listener to access the Slack API,
-        // DANSWER_BOT_SLACK_APP_TOKEN & DANSWER_BOT_SLACK_BOT_TOKEN need to be set in .env file located in the root of the project
-        {
-            "name": "Slack Bot",
-            "consoleName": "Slack Bot",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "onyx/onyxbot/slack/listener.py",
-            "cwd": "${workspaceFolder}/backend",
-            "envFile": "${workspaceFolder}/.vscode/.env",
-            "env": {
-                "LOG_LEVEL": "DEBUG",
-                "PYTHONUNBUFFERED": "1",
-                "PYTHONPATH": "."
-            },
-            "presentation": {
-				 "group": "2",
-			},
-            "consoleTitle": "Slack Bot Console"
+        "consoleTitle": "Model Server Console"
+      },
+      {
+        "name": "API Server",
+        "consoleName": "API Server",
+        "type": "debugpy",
+        "request": "launch",
+        "module": "uvicorn",
+        "cwd": "${workspaceFolder}/backend",
+        "envFile": "${workspaceFolder}/.vscode/.env",
+        "env": {
+          "LOG_DANSWER_MODEL_INTERACTIONS": "True",
+          "LOG_LEVEL": "DEBUG",
+          "PYTHONUNBUFFERED": "1"
        },
-        {
-            "name": "Celery primary",
-            "type": "debugpy",
-            "request": "launch",
-            "module": "celery",
-            "cwd": "${workspaceFolder}/backend",
-            "envFile": "${workspaceFolder}/.vscode/.env",
-            "env": {
-                "LOG_LEVEL": "INFO",
-                "PYTHONUNBUFFERED": "1",
-                "PYTHONPATH": "."
-            },
-            "args": [
-                "-A",
-                "onyx.background.celery.versioned_apps.primary",
-                "worker",
-                "--pool=threads",
-                "--concurrency=4",
-                "--prefetch-multiplier=1",
-                "--loglevel=INFO",
-                "--hostname=primary@%n",
-                "-Q",
-                "celery",
-            ],
-            "presentation": {
-				 "group": "2",
-			 },
-            "consoleTitle": "Celery primary Console"
+        "args": ["onyx.main:app", "--reload", "--port", "8080"],
+        "presentation": {
+          "group": "2"
        },
-        {
-            "name": "Celery light",
-            "type": "debugpy",
-            "request": "launch",
-            "module": "celery",
-            "cwd": "${workspaceFolder}/backend",
-            "envFile": "${workspaceFolder}/.vscode/.env",
-            "env": {
-                "LOG_LEVEL": "INFO",
-                "PYTHONUNBUFFERED": "1",
-                "PYTHONPATH": "."
-            },
-            "args": [
-                "-A",
-                "onyx.background.celery.versioned_apps.light",
-                "worker",
-                "--pool=threads",
-                "--concurrency=64",
-                "--prefetch-multiplier=8",
-                "--loglevel=INFO",
-                "--hostname=light@%n",
-                "-Q",
-                "vespa_metadata_sync,connector_deletion,doc_permissions_upsert,checkpoint_cleanup",
-            ],
-            "presentation": {
-				 "group": "2",
-			 },
-            "consoleTitle": "Celery light Console"
+        "consoleTitle": "API Server Console"
+      },
+      // For the listener to access the Slack API,
+      // DANSWER_BOT_SLACK_APP_TOKEN & DANSWER_BOT_SLACK_BOT_TOKEN need to be set in .env file located in the root of the project
+      {
+        "name": "Slack Bot",
+        "consoleName": "Slack Bot",
+        "type": "debugpy",
+        "request": "launch",
+        "program": "onyx/onyxbot/slack/listener.py",
+        "cwd": "${workspaceFolder}/backend",
+        "envFile": "${workspaceFolder}/.vscode/.env",
+        "env": {
+          "LOG_LEVEL": "DEBUG",
+          "PYTHONUNBUFFERED": "1",
+          "PYTHONPATH": "."
        },
-        {
-            "name": "Celery heavy",
-            "type": "debugpy",
-            "request": "launch",
-            "module": "celery",
-            "cwd": "${workspaceFolder}/backend",
-            "envFile": "${workspaceFolder}/.vscode/.env",
-            "env": {
-                "LOG_LEVEL": "INFO",
-                "PYTHONUNBUFFERED": "1",
-                "PYTHONPATH": "."
-            },
-            "args": [
-                "-A",
-                "onyx.background.celery.versioned_apps.heavy",
-                "worker",
-                "--pool=threads",
-                "--concurrency=4",
-                "--prefetch-multiplier=1",
-                "--loglevel=INFO",
-                "--hostname=heavy@%n",
-                "-Q",
-                "connector_pruning,connector_doc_permissions_sync,connector_external_group_sync",
-            ],
-            "presentation": {
-				 "group": "2",
-			 },
-            "consoleTitle": "Celery heavy Console"
+        "presentation": {
+          "group": "2"
        },
-        {
-            "name": "Celery indexing",
-            "type": "debugpy",
-            "request": "launch",
-            "module": "celery",
-            "cwd": "${workspaceFolder}/backend",
-            "envFile": "${workspaceFolder}/.vscode/.env",
-            "env": {
-                "ENABLE_MULTIPASS_INDEXING": "false",
-                "LOG_LEVEL": "DEBUG",
-                "PYTHONUNBUFFERED": "1",
-                "PYTHONPATH": "."
-            },
-            "args": [
-                "-A",
-                "onyx.background.celery.versioned_apps.indexing",
-                "worker",
-                "--pool=threads",
-                "--concurrency=1",
-                "--prefetch-multiplier=1",
-                "--loglevel=INFO",
-                "--hostname=indexing@%n",
-                "-Q",
-                "connector_indexing",
-            ],
-            "presentation": {
-				 "group": "2",
-			 },
-            "consoleTitle": "Celery indexing Console"
+        "consoleTitle": "Slack Bot Console"
+      },
+      {
+        "name": "Celery primary",
+        "type": "debugpy",
+        "request": "launch",
+        "module": "celery",
+        "cwd": "${workspaceFolder}/backend",
+        "envFile": "${workspaceFolder}/.vscode/.env",
+        "env": {
+          "LOG_LEVEL": "INFO",
+          "PYTHONUNBUFFERED": "1",
+          "PYTHONPATH": "."
        },
-        {
-            "name": "Celery monitoring",
-            "type": "debugpy",
-            "request": "launch",
-            "module": "celery",
-            "cwd": "${workspaceFolder}/backend",
-            "envFile": "${workspaceFolder}/.vscode/.env",
-            "env": {},
-            "args": [
-                "-A",
-                "onyx.background.celery.versioned_apps.monitoring",
-                "worker",
-                "--pool=solo",
-                "--concurrency=1",
-                "--prefetch-multiplier=1",
-                "--loglevel=INFO",
-                "--hostname=monitoring@%n",
-                "-Q",
-                "monitoring",
-            ],
-            "presentation": {
-				 "group": "2",
-			 },
-            "consoleTitle": "Celery monitoring Console"
+        "args": [
+          "-A",
+          "onyx.background.celery.versioned_apps.primary",
+          "worker",
+          "--pool=threads",
+          "--concurrency=4",
+          "--prefetch-multiplier=1",
+          "--loglevel=INFO",
+          "--hostname=primary@%n",
+          "-Q",
+          "celery"
+        ],
+        "presentation": {
+          "group": "2"
        },
-        {
-            "name": "Celery beat",
-            "type": "debugpy",
-            "request": "launch",
-            "module": "celery",
-            "cwd": "${workspaceFolder}/backend",
-            "envFile": "${workspaceFolder}/.vscode/.env",
-            "env": {
-                "LOG_LEVEL": "DEBUG",
-                "PYTHONUNBUFFERED": "1",
-                "PYTHONPATH": "."
-            },
-            "args": [
-                "-A",
-                "onyx.background.celery.versioned_apps.beat",
-                "beat",
-                "--loglevel=INFO",
-            ],
-            "presentation": {
-				 "group": "2",
-			 },
-            "consoleTitle": "Celery beat Console"
+        "consoleTitle": "Celery primary Console"
+      },
+      {
+        "name": "Celery light",
+        "type": "debugpy",
+        "request": "launch",
+        "module": "celery",
+        "cwd": "${workspaceFolder}/backend",
+        "envFile": "${workspaceFolder}/.vscode/.env",
+        "env": {
+          "LOG_LEVEL": "INFO",
+          "PYTHONUNBUFFERED": "1",
+          "PYTHONPATH": "."
        },
-        {
-            "name": "Pytest",
-            "consoleName": "Pytest",
-            "type": "debugpy",
-            "request": "launch",
-            "module": "pytest",
-            "cwd": "${workspaceFolder}/backend",
-            "envFile": "${workspaceFolder}/.vscode/.env",
-            "env": {
-                "LOG_LEVEL": "DEBUG",
-                "PYTHONUNBUFFERED": "1",
-                "PYTHONPATH": "."
-            },
-            "args": [
-                "-v"
-                // Specify a sepcific module/test to run or provide nothing to run all tests
-                //"tests/unit/onyx/llm/answering/test_prune_and_merge.py"
-            ],
-            "presentation": {
-				 "group": "2",
-			 },
-            "consoleTitle": "Pytest Console"
+        "args": [
+          "-A",
+          "onyx.background.celery.versioned_apps.light",
+          "worker",
+          "--pool=threads",
+          "--concurrency=64",
+          "--prefetch-multiplier=8",
+          "--loglevel=INFO",
+          "--hostname=light@%n",
+          "-Q",
+          "vespa_metadata_sync,connector_deletion,doc_permissions_upsert,index_attempt_cleanup"
+        ],
+        "presentation": {
+          "group": "2"
        },
-		{
-	    	// Dummy entry used to label the group
-			"name": "--- Tasks ---",
-			"type": "node",
-			"request": "launch",
-			"presentation": {
-				"group": "3",
-				"order": 0
-			}
-		},
-        {
-            "name": "Clear and Restart External Volumes and Containers",
-            "type": "node",
-            "request": "launch",
-            "runtimeExecutable": "bash",
-            "runtimeArgs": ["${workspaceFolder}/backend/scripts/restart_containers.sh"],
-            "cwd": "${workspaceFolder}",
-            "console": "integratedTerminal",
-            "stopOnEntry": true,
-            "presentation": {
-				 "group": "3",
-			 },
+        "consoleTitle": "Celery light Console"
+      },
+      {
+        "name": "Celery heavy",
+        "type": "debugpy",
+        "request": "launch",
+        "module": "celery",
+        "cwd": "${workspaceFolder}/backend",
+        "envFile": "${workspaceFolder}/.vscode/.env",
+        "env": {
+          "LOG_LEVEL": "INFO",
+          "PYTHONUNBUFFERED": "1",
+          "PYTHONPATH": "."
        },
-        {
-	        // Celery jobs launched through a single background script (legacy)
-	        // Recommend using the "Celery (all)" compound launch instead.
-            "name": "Background Jobs",
-            "consoleName": "Background Jobs",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "scripts/dev_run_background_jobs.py",
-            "cwd": "${workspaceFolder}/backend",
-            "envFile": "${workspaceFolder}/.vscode/.env",
-            "env": {
-                "LOG_DANSWER_MODEL_INTERACTIONS": "True",
-                "LOG_LEVEL": "DEBUG",
-                "PYTHONUNBUFFERED": "1",
-                "PYTHONPATH": "."
-            },
+        "args": [
+          "-A",
+          "onyx.background.celery.versioned_apps.heavy",
+          "worker",
+          "--pool=threads",
+          "--concurrency=4",
+          "--prefetch-multiplier=1",
+          "--loglevel=INFO",
+          "--hostname=heavy@%n",
+          "-Q",
+          "connector_pruning,connector_doc_permissions_sync,connector_external_group_sync"
+        ],
+        "presentation": {
+          "group": "2"
        },
-        {
-            "name": "Install Python Requirements",
-            "type": "node",
-            "request": "launch",
-            "runtimeExecutable": "bash",
-            "runtimeArgs": [
-                "-c",
-                "pip install -r backend/requirements/default.txt && pip install -r backend/requirements/dev.txt && pip install -r backend/requirements/ee.txt && pip install -r backend/requirements/model_server.txt"
-            ],
-            "cwd": "${workspaceFolder}",
-            "console": "integratedTerminal",
-            "presentation": {
-                 "group": "3"
-            }
+        "consoleTitle": "Celery heavy Console"
+      },
+      {
+        "name": "Celery docfetching",
+        "type": "debugpy",
+        "request": "launch",
+        "module": "celery",
+        "cwd": "${workspaceFolder}/backend",
+        "envFile": "${workspaceFolder}/.vscode/.env",
+        "env": {
+            "LOG_LEVEL": "DEBUG",
+            "PYTHONUNBUFFERED": "1",
+            "PYTHONPATH": "."
        },
+        "args": [
+            "-A",
+            "onyx.background.celery.versioned_apps.docfetching",
+            "worker",
+            "--pool=threads",
+            "--concurrency=1",
+            "--prefetch-multiplier=1",
+            "--loglevel=INFO",
+            "--hostname=docfetching@%n",
+            "-Q",
+            "connector_doc_fetching,user_files_indexing"
+        ],
+        "presentation": {
+            "group": "2"
+        },
+        "consoleTitle": "Celery docfetching Console",
+        "justMyCode": false
+    },
+    {
+        "name": "Celery docprocessing",
+        "type": "debugpy",
+        "request": "launch",
+        "module": "celery",
+        "cwd": "${workspaceFolder}/backend",
+        "envFile": "${workspaceFolder}/.vscode/.env",
+        "env": {
+            "ENABLE_MULTIPASS_INDEXING": "false",
+            "LOG_LEVEL": "DEBUG",
+            "PYTHONUNBUFFERED": "1",
+            "PYTHONPATH": "."
+        },
+        "args": [
+            "-A",
+            "onyx.background.celery.versioned_apps.docprocessing",
+            "worker",
+            "--pool=threads",
+            "--concurrency=6",
+            "--prefetch-multiplier=1",
+            "--loglevel=INFO",
+            "--hostname=docprocessing@%n",
+            "-Q",
+            "docprocessing"
+        ],
+        "presentation": {
+            "group": "2"
+        },
+        "consoleTitle": "Celery docprocessing Console",
+        "justMyCode": false
+    },
+      {
+        "name": "Celery monitoring",
+        "type": "debugpy",
+        "request": "launch",
+        "module": "celery",
+        "cwd": "${workspaceFolder}/backend",
+        "envFile": "${workspaceFolder}/.vscode/.env",
+        "env": {},
+        "args": [
+          "-A",
+          "onyx.background.celery.versioned_apps.monitoring",
+          "worker",
+          "--pool=solo",
+          "--concurrency=1",
+          "--prefetch-multiplier=1",
+          "--loglevel=INFO",
+          "--hostname=monitoring@%n",
+          "-Q",
+          "monitoring"
+        ],
+        "presentation": {
+          "group": "2"
+        },
+        "consoleTitle": "Celery monitoring Console"
+      },
+      {
+        "name": "Celery beat",
+        "type": "debugpy",
+        "request": "launch",
+        "module": "celery",
+        "cwd": "${workspaceFolder}/backend",
+        "envFile": "${workspaceFolder}/.vscode/.env",
+        "env": {
+          "LOG_LEVEL": "DEBUG",
+          "PYTHONUNBUFFERED": "1",
+          "PYTHONPATH": "."
+        },
+        "args": [
+          "-A",
+          "onyx.background.celery.versioned_apps.beat",
+          "beat",
+          "--loglevel=INFO"
+        ],
+        "presentation": {
+          "group": "2"
+        },
+        "consoleTitle": "Celery beat Console"
+      },
+      {
+        "name": "Pytest",
+        "consoleName": "Pytest",
+        "type": "debugpy",
+        "request": "launch",
+        "module": "pytest",
+        "cwd": "${workspaceFolder}/backend",
+        "envFile": "${workspaceFolder}/.vscode/.env",
+        "env": {
+          "LOG_LEVEL": "DEBUG",
+          "PYTHONUNBUFFERED": "1",
+          "PYTHONPATH": "."
+        },
+        "args": [
+          "-v"
+          // Specify a specific module/test to run or provide nothing to run all tests
+          //"tests/unit/onyx/llm/answering/test_prune_and_merge.py"
+        ],
+        "presentation": {
+          "group": "2"
+        },
+        "consoleTitle": "Pytest Console"
+      },
+      {
+        // Dummy entry used to label the group
+        "name": "--- Tasks ---",
+        "type": "node",
+        "request": "launch",
+        "presentation": {
+          "group": "3",
+          "order": 0
+        }
+      },
+      {
+        "name": "Clear and Restart External Volumes and Containers",
+        "type": "node",
+        "request": "launch",
+        "runtimeExecutable": "bash",
+        "runtimeArgs": [
+          "${workspaceFolder}/backend/scripts/restart_containers.sh"
+        ],
+        "cwd": "${workspaceFolder}",
+        "console": "integratedTerminal",
+        "stopOnEntry": true,
+        "presentation": {
+          "group": "3"
+        }
+      },
+      {
+        // Celery jobs launched through a single background script (legacy)
+        // Recommend using the "Celery (all)" compound launch instead.
+        "name": "Background Jobs",
+        "consoleName": "Background Jobs",
+        "type": "debugpy",
+        "request": "launch",
+        "program": "scripts/dev_run_background_jobs.py",
+        "cwd": "${workspaceFolder}/backend",
+        "envFile": "${workspaceFolder}/.vscode/.env",
+        "env": {
+          "LOG_DANSWER_MODEL_INTERACTIONS": "True",
+          "LOG_LEVEL": "DEBUG",
+          "PYTHONUNBUFFERED": "1",
+          "PYTHONPATH": "."
+        }
+      },
+      {
+        "name": "Install Python Requirements",
+        "type": "node",
+        "request": "launch",
+        "runtimeExecutable": "bash",
+        "runtimeArgs": [
+          "-c",
+          "pip install -r backend/requirements/default.txt && pip install -r backend/requirements/dev.txt && pip install -r backend/requirements/ee.txt && pip install -r backend/requirements/model_server.txt"
+        ],
+        "cwd": "${workspaceFolder}",
+        "console": "integratedTerminal",
+        "presentation": {
+          "group": "3"
+        }
+      },
+    {
+      // script to generate the openapi schema
+      "name": "Onyx OpenAPI Schema Generator",
+      "type": "debugpy",
+      "request": "launch",
+      "program": "scripts/onyx_openapi_schema.py",
+      "cwd": "${workspaceFolder}/backend",
+      "envFile": "${workspaceFolder}/.env",
+      "env": {
+        "PYTHONUNBUFFERED": "1",
+        "PYTHONPATH": "."
+      },
+      "args": [
+        "--filename",
+        "generated/openapi.json"
+      ]
+    },
+    {
+      // script to debug multi tenant db issues
+      "name": "Onyx DB Manager (Top Chunks)",
+      "type": "debugpy",
+      "request": "launch",
+      "program": "scripts/debugging/onyx_db.py",
+      "cwd": "${workspaceFolder}/backend",
+      "envFile": "${workspaceFolder}/.env",
+      "env": {
+        "PYTHONUNBUFFERED": "1",
+        "PYTHONPATH": "."
+      },
+      "args": [
+        "--password",
+        "your_password_here",
+        "--port",
+        "5433",
+        "--report",
+        "top-chunks",
+        "--filename",
+        "generated/tenants_by_num_docs.csv"
+      ]
+    },
+      {
+        "name": "Debug React Web App in Chrome",
+        "type": "chrome",
+        "request": "launch",
+        "url": "http://localhost:3000",
+        "webRoot": "${workspaceFolder}/web"
+      }
    ]
-}
+  }
+  
--- a/.vscode/tasks.template.jsonc
+++ b/.vscode/tasks.template.jsonc
@@ -0,0 +1,101 @@
+{
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "type": "austin",
+            "label": "Profile celery beat",
+            "envFile": "${workspaceFolder}/.env",
+            "options": {
+              "cwd": "${workspaceFolder}/backend"
+            },
+            "command": [
+                "sudo",
+                "-E"
+            ],
+            "args": [
+              "celery",
+              "-A",
+              "onyx.background.celery.versioned_apps.beat",
+              "beat",
+              "--loglevel=INFO"
+            ]
+        },
+        {
+            "type": "shell",
+            "label": "Generate Onyx OpenAPI Python client",
+            "cwd": "${workspaceFolder}/backend",
+            "envFile": "${workspaceFolder}/.env",
+            "options": {
+              "cwd": "${workspaceFolder}/backend"
+            },
+            "command": [
+                "openapi-generator"
+            ],
+            "args": [
+                "generate",
+                "-i",
+                "generated/openapi.json",
+                "-g",
+                "python",
+                "-o",
+                "generated/onyx_openapi_client",
+                "--package-name",
+                "onyx_openapi_client",
+            ]
+        },
+        {
+            "type": "shell",
+            "label": "Generate Typescript Fetch client (openapi-generator)",
+            "envFile": "${workspaceFolder}/.env",
+            "options": {
+              "cwd": "${workspaceFolder}"
+            },
+            "command": [
+                "openapi-generator"
+            ],
+            "args": [
+                "generate",
+                "-i",
+                "backend/generated/openapi.json",
+                "-g",
+                "typescript-fetch",
+                "-o",
+                "${workspaceFolder}/web/src/lib/generated/onyx_api",
+                "--additional-properties=disallowAdditionalPropertiesIfNotPresent=false,legacyDiscriminatorBehavior=false,supportsES6=true",
+            ]
+        },
+        {
+            "type": "shell",
+            "label": "Generate TypeScript Client (openapi-ts)",
+            "envFile": "${workspaceFolder}/.env",
+            "options": {
+              "cwd": "${workspaceFolder}/web"
+            },
+            "command": [
+                "npx"
+            ],
+            "args": [
+                "openapi-typescript",
+                "../backend/generated/openapi.json",
+                "--output",
+                "./src/lib/generated/onyx-schema.ts",
+            ]
+        },
+        {
+            "type": "shell",
+            "label": "Generate TypeScript Client (orval)",
+            "envFile": "${workspaceFolder}/.env",
+            "options": {
+              "cwd": "${workspaceFolder}/web"
+            },
+            "command": [
+                "npx"
+            ],
+            "args": [
+            	"orval",
+                "--config",
+                "orval.config.js",
+            ]
+        }
+    ]
+}
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,4 +1,4 @@
-<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/CONTRIBUTING.md"} -->
+<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/CONTRIBUTING.md"} -->

 # Contributing to Onyx

@@ -12,8 +12,8 @@ As an open source project in a rapidly changing space, we welcome all contributi

 The [GitHub Issues](https://github.com/onyx-dot-app/onyx/issues) page is a great place to start for contribution ideas.

-To ensure that your contribution is aligned with the project's direction, please reach out to Hagen (or any other maintainer) on the Onyx team
-via [Slack](https://join.slack.com/t/onyx-dot-app/shared_invite/zt-2twesxdr6-5iQitKZQpgq~hYIZ~dv3KA) /
+To ensure that your contribution is aligned with the project's direction, please reach out to any maintainer on the Onyx team
+via [Slack](https://join.slack.com/t/onyx-dot-app/shared_invite/zt-34lu4m7xg-TsKGO6h8PDvR5W27zTdyhA) /
 [Discord](https://discord.gg/TDJ59cGV2X) or [email](mailto:founders@onyx.app).

 Issues that have been explicitly approved by the maintainers (aligned with the direction of the project)
@@ -28,7 +28,7 @@ Your input is vital to making sure that Onyx moves in the right direction.
 Before starting on implementation, please raise a GitHub issue.

 Also, always feel free to message the founders (Chris Weaver / Yuhong Sun) on
-[Slack](https://join.slack.com/t/onyx-dot-app/shared_invite/zt-2twesxdr6-5iQitKZQpgq~hYIZ~dv3KA) /
+[Slack](https://join.slack.com/t/onyx-dot-app/shared_invite/zt-34lu4m7xg-TsKGO6h8PDvR5W27zTdyhA) /
 [Discord](https://discord.gg/TDJ59cGV2X) directly about anything at all.

 ### Contributing Code
@@ -59,6 +59,7 @@ Onyx being a fully functional app, relies on some external software, specificall
 - [Postgres](https://www.postgresql.org/) (Relational DB)
 - [Vespa](https://vespa.ai/) (Vector DB/Search Engine)
 - [Redis](https://redis.io/) (Cache)
+- [MinIO](https://min.io/) (File Store)
 - [Nginx](https://nginx.org/) (Not needed for development flows generally)

 > **Note:**
@@ -102,10 +103,10 @@ If using PowerShell, the command slightly differs:
 Install the required python dependencies:

 ```bash
-pip install -r onyx/backend/requirements/default.txt
-pip install -r onyx/backend/requirements/dev.txt
-pip install -r onyx/backend/requirements/ee.txt
-pip install -r onyx/backend/requirements/model_server.txt
+pip install -r backend/requirements/default.txt
+pip install -r backend/requirements/dev.txt
+pip install -r backend/requirements/ee.txt
+pip install -r backend/requirements/model_server.txt
 ```

 Install Playwright for Python (headless browser required by the Web Connector)
@@ -171,10 +172,10 @@ Otherwise, you can follow the instructions below to run the application for deve

 You will need Docker installed to run these containers.

-First navigate to `onyx/deployment/docker_compose`, then start up Postgres/Vespa/Redis with:
+First navigate to `onyx/deployment/docker_compose`, then start up Postgres/Vespa/Redis/MinIO with:

 ```bash
-docker compose -f docker-compose.dev.yml -p onyx-stack up -d index relational_db cache
+docker compose -f docker-compose.dev.yml -p onyx-stack up -d index relational_db cache minio
 ```

 (index refers to Vespa, relational_db refers to Postgres, and cache refers to Redis)
--- a/CONTRIBUTING_VSCODE.md
+++ b/CONTRIBUTING_VSCODE.md
@@ -5,7 +5,7 @@ This guide explains how to set up and use VSCode's debugging capabilities with t
 ## Initial Setup

 1. **Environment Setup**:
-   - Copy `.vscode/.env.template` to `.vscode/.env`
+   - Copy `.vscode/env_template.txt` to `.vscode/.env`
   - Fill in the necessary environment variables in `.vscode/.env`
 2. **launch.json**:
   - Copy `.vscode/launch.template.jsonc` to `.vscode/launch.json`
@@ -17,10 +17,9 @@ Before starting, make sure the Docker Daemon is running.
 1. Open the Debug view in VSCode (Cmd+Shift+D on macOS)
 2. From the dropdown at the top, select "Clear and Restart External Volumes and Containers" and press the green play button
 3. From the dropdown at the top, select "Run All Onyx Services" and press the green play button
-4. CD into web, run "npm i" followed by npm run dev.
-5. Now, you can navigate to onyx in your browser (default is http://localhost:3000) and start using the app
-6. You can set breakpoints by clicking to the left of line numbers to help debug while the app is running
-7. Use the debug toolbar to step through code, inspect variables, etc.
+4. Now, you can navigate to onyx in your browser (default is http://localhost:3000) and start using the app
+5. You can set breakpoints by clicking to the left of line numbers to help debug while the app is running
+6. Use the debug toolbar to step through code, inspect variables, etc.

 ## Features

--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/README.md"} -->
+<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/README.md"} -->

 <a name="readme-top"></a>

@@ -13,7 +13,7 @@
 <a href="https://docs.onyx.app/" target="_blank">
    <img src="https://img.shields.io/badge/docs-view-blue" alt="Documentation">
 </a>
-<a href="https://join.slack.com/t/onyx-dot-app/shared_invite/zt-2twesxdr6-5iQitKZQpgq~hYIZ~dv3KA" target="_blank">
+<a href="https://join.slack.com/t/onyx-dot-app/shared_invite/zt-34lu4m7xg-TsKGO6h8PDvR5W27zTdyhA" target="_blank">
    <img src="https://img.shields.io/badge/slack-join-blue.svg?logo=slack" alt="Slack">
 </a>
 <a href="https://discord.gg/TDJ59cGV2X" target="_blank">
@@ -57,7 +57,7 @@ https://private-user-images.githubusercontent.com/32520769/414509312-48392e83-95
 **To try it out for free and get started in seconds, check out [Onyx Cloud](https://cloud.onyx.app/signup)**.

 Onyx can also be run locally (even on a laptop) or deployed on a virtual machine with a single
-`docker compose` command. Checkout our [docs](https://docs.onyx.app/quickstart) to learn more.
+`docker compose` command. Check out our [docs](https://docs.onyx.app/deployment/getting_started/quickstart) to learn more.

 We also have built-in support for high-availability/scalable deployment on Kubernetes.
 References [here](https://github.com/onyx-dot-app/onyx/tree/main/deployment).
@@ -97,7 +97,7 @@ Keep knowledge and access up to sync across 40+ connectors:
 - Websites
 - And more ...

-See the full list [here](https://docs.onyx.app/connectors).
+See the full list [here](https://docs.onyx.app/admin/connectors/overview).


 ## 📚 Licensing
@@ -114,3 +114,4 @@ To try the Onyx Enterprise Edition:

 ## 💡 Contributing
 Looking to contribute? Please check out the [Contribution Guide](CONTRIBUTING.md) for more details.
+
--- a/backend/.gitignore
+++ b/backend/.gitignore
@@ -9,4 +9,6 @@ api_keys.py
 vespa-app.zip
 dynamic_config_storage/
 celerybeat-schedule*
-onyx/connectors/salesforce/data/
+onyx/connectors/salesforce/data/
+.test.env
+/generated
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -8,11 +8,12 @@ Edition features outside of personal development or testing purposes. Please rea
 founders@onyx.app for more information. Please visit https://github.com/onyx-dot-app/onyx"

 # Default ONYX_VERSION, typically overriden during builds by GitHub Actions.
-ARG ONYX_VERSION=0.8-dev
+ARG ONYX_VERSION=0.0.0-dev
 # DO_NOT_TRACK is used to disable telemetry for Unstructured
 ENV ONYX_VERSION=${ONYX_VERSION} \
    DANSWER_RUNNING_IN_DOCKER="true" \
-    DO_NOT_TRACK="true"
+    DO_NOT_TRACK="true" \
+    PLAYWRIGHT_BROWSERS_PATH="/app/.cache/ms-playwright"


 RUN echo "ONYX_VERSION: ${ONYX_VERSION}"
@@ -77,6 +78,9 @@ RUN apt-get update && \
    rm -rf /var/lib/apt/lists/* && \
    rm -f /usr/local/lib/python3.11/site-packages/tornado/test/test.key

+# Install postgresql-client for easy manual tests
+# Install it here to avoid it being cleaned up above
+RUN apt-get update && apt-get install -y postgresql-client

 # Pre-downloading models for setups with limited egress
 RUN python -c "from tokenizers import Tokenizer; \
@@ -85,7 +89,7 @@ Tokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1')"
 # Pre-downloading NLTK for setups with limited egress
 RUN python -c "import nltk; \
 nltk.download('stopwords', quiet=True); \
-nltk.download('punkt', quiet=True);"
+nltk.download('punkt_tab', quiet=True);"
 # nltk.download('wordnet', quiet=True); introduce this back if lemmatization is needed

 # Set up application files
@@ -102,6 +106,7 @@ COPY ./alembic /app/alembic
 COPY ./alembic_tenants /app/alembic_tenants
 COPY ./alembic.ini /app/alembic.ini
 COPY supervisord.conf /usr/etc/supervisord.conf
+COPY ./static /app/static

 # Escape hatch scripts
 COPY ./scripts/debugging /app/scripts/debugging
@@ -112,6 +117,14 @@ COPY ./assets /app/assets

 ENV PYTHONPATH=/app

+# Create non-root user for security best practices
+RUN groupadd -g 1001 onyx && \
+    useradd -u 1001 -g onyx -m -s /bin/bash onyx && \
+    chown -R onyx:onyx /app && \
+    mkdir -p /var/log/onyx && \
+    chmod 755 /var/log/onyx && \
+    chown onyx:onyx /var/log/onyx
+
 # Default command which does nothing
 # This container is used by api server and background which specify their own CMD
 CMD ["tail", "-f", "/dev/null"]
--- a/backend/Dockerfile.model_server
+++ b/backend/Dockerfile.model_server
@@ -7,13 +7,38 @@ You can find it at https://hub.docker.com/r/onyx/onyx-model-server. For more det
 visit https://github.com/onyx-dot-app/onyx."

 # Default ONYX_VERSION, typically overriden during builds by GitHub Actions.
-ARG ONYX_VERSION=0.8-dev
+ARG ONYX_VERSION=0.0.0-dev
 ENV ONYX_VERSION=${ONYX_VERSION} \
-    DANSWER_RUNNING_IN_DOCKER="true"
-
+    DANSWER_RUNNING_IN_DOCKER="true" \
+    HF_HOME=/app/.cache/huggingface

 RUN echo "ONYX_VERSION: ${ONYX_VERSION}"

+# Create non-root user for security best practices
+RUN mkdir -p /app && \
+    groupadd -g 1001 onyx && \
+    useradd -u 1001 -g onyx -m -s /bin/bash onyx  && \
+    chown -R onyx:onyx /app && \
+    mkdir -p /var/log/onyx && \
+    chmod 755 /var/log/onyx && \
+    chown onyx:onyx /var/log/onyx
+
+# --- add toolchain needed for Rust/Python builds (fastuuid) ---
+ENV RUSTUP_HOME=/usr/local/rustup \
+    CARGO_HOME=/usr/local/cargo \
+    PATH=/usr/local/cargo/bin:$PATH
+
+RUN set -eux; \
+    apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        pkg-config \
+        curl \
+        ca-certificates \
+    && rm -rf /var/lib/apt/lists/* \
+    # Install latest stable Rust (supports Cargo.lock v4)
+    && curl -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal --default-toolchain stable \
+    && rustc --version && cargo --version
+
 COPY ./requirements/model_server.txt /tmp/requirements.txt
 RUN pip install --no-cache-dir --upgrade \
        --retries 5 \
@@ -31,20 +56,24 @@ RUN python -c "from transformers import AutoTokenizer; \
 AutoTokenizer.from_pretrained('distilbert-base-uncased'); \
 AutoTokenizer.from_pretrained('mixedbread-ai/mxbai-rerank-xsmall-v1'); \
 from huggingface_hub import snapshot_download; \
-snapshot_download(repo_id='danswer/hybrid-intent-token-classifier', revision='v1.0.3'); \
+snapshot_download(repo_id='onyx-dot-app/hybrid-intent-token-classifier'); \
+snapshot_download(repo_id='onyx-dot-app/information-content-model'); \
 snapshot_download('nomic-ai/nomic-embed-text-v1'); \
 snapshot_download('mixedbread-ai/mxbai-rerank-xsmall-v1'); \
 from sentence_transformers import SentenceTransformer; \
 SentenceTransformer(model_name_or_path='nomic-ai/nomic-embed-text-v1', trust_remote_code=True);"

-# In case the user has volumes mounted to /root/.cache/huggingface that they've downloaded while
-# running Onyx, don't overwrite it with the built in cache folder
-RUN mv /root/.cache/huggingface /root/.cache/temp_huggingface
+# In case the user has a volume mounted at /app/.cache/huggingface with contents downloaded
+# while running Onyx, move the built-in contents of the cache folder to a temporary location
+# so that they are preserved and can be combined with the user's cache contents
+RUN mv /app/.cache/huggingface /app/.cache/temp_huggingface && \
+    chown -R onyx:onyx /app

 WORKDIR /app

 # Utils used by model server
 COPY ./onyx/utils/logger.py /app/onyx/utils/logger.py
+COPY ./onyx/utils/middleware.py /app/onyx/utils/middleware.py

 # Place to fetch version information
 COPY ./onyx/__init__.py /app/onyx/__init__.py
--- a/backend/alembic.ini
+++ b/backend/alembic.ini
@@ -84,7 +84,7 @@ keys = console
 keys = generic

 [logger_root]
-level = WARN
+level = INFO
 handlers = console
 qualname =

--- a/backend/alembic/README.md
+++ b/backend/alembic/README.md
@@ -1,4 +1,4 @@
-<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/backend/alembic/README.md"} -->
+<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/backend/alembic/README.md"} -->

 # Alembic DB Migrations

@@ -20,3 +20,44 @@ To run all un-applied migrations:
 To undo migrations:
 `alembic downgrade -X`
 where X is the number of migrations you want to undo from the current state
+
+### Multi-tenant migrations
+
+For multi-tenant deployments, you can use additional options:
+
+**Upgrade all tenants:**
+```bash
+alembic -x upgrade_all_tenants=true upgrade head
+```
+
+**Upgrade specific schemas:**
+```bash
+# Single schema
+alembic -x schemas=tenant_12345678-1234-1234-1234-123456789012 upgrade head
+
+# Multiple schemas (comma-separated)
+alembic -x schemas=tenant_12345678-1234-1234-1234-123456789012,public,another_tenant upgrade head
+```
+
+**Upgrade tenants within an alphabetical range:**
+```bash
+# Upgrade tenants 100-200 when sorted alphabetically (positions 100 to 200)
+alembic -x upgrade_all_tenants=true -x tenant_range_start=100 -x tenant_range_end=200 upgrade head
+
+# Upgrade tenants starting from position 1000 alphabetically
+alembic -x upgrade_all_tenants=true -x tenant_range_start=1000 upgrade head
+
+# Upgrade first 500 tenants alphabetically
+alembic -x upgrade_all_tenants=true -x tenant_range_end=500 upgrade head
+```
+
+**Continue on error (for batch operations):**
+```bash
+alembic -x upgrade_all_tenants=true -x continue=true upgrade head
+```
+
+The tenant range filtering works by:
+1. Sorting tenant IDs alphabetically
+2. Using 1-based position numbers (1st, 2nd, 3rd tenant, etc.)
+3. Filtering to the specified range of positions
+4. Non-tenant schemas (like 'public') are always included
--- a/backend/alembic/env.py
+++ b/backend/alembic/env.py
@@ -1,12 +1,12 @@
 from typing import Any, Literal
-from onyx.db.engine import get_iam_auth_token
+from onyx.db.engine.iam_auth import get_iam_auth_token
 from onyx.configs.app_configs import USE_IAM_AUTH
 from onyx.configs.app_configs import POSTGRES_HOST
 from onyx.configs.app_configs import POSTGRES_PORT
 from onyx.configs.app_configs import POSTGRES_USER
 from onyx.configs.app_configs import AWS_REGION_NAME
-from onyx.db.engine import build_connection_string
-from onyx.db.engine import get_all_tenant_ids
+from onyx.db.engine.sql_engine import build_connection_string
+from onyx.db.engine.tenant_utils import get_all_tenant_ids
 from sqlalchemy import event
 from sqlalchemy import pool
 from sqlalchemy import text
@@ -21,9 +21,17 @@ from alembic import context
 from sqlalchemy.ext.asyncio import create_async_engine
 from sqlalchemy.sql.schema import SchemaItem
 from onyx.configs.constants import SSL_CERT_FILE
-from shared_configs.configs import MULTI_TENANT, POSTGRES_DEFAULT_SCHEMA
+from shared_configs.configs import (
+    MULTI_TENANT,
+    POSTGRES_DEFAULT_SCHEMA,
+    TENANT_ID_PREFIX,
+)
 from onyx.db.models import Base
 from celery.backends.database.session import ResultModelBase  # type: ignore
+from onyx.db.engine.sql_engine import SqlEngine
+
+# Make sure in alembic.ini [logger_root] level=INFO is set or most logging will be
+# hidden! (defaults to level=WARN)

 # Alembic Config object
 config = context.config
@@ -36,6 +44,7 @@ if config.config_file_name is not None and config.attributes.get(
 target_metadata = [Base.metadata, ResultModelBase.metadata]

 EXCLUDE_TABLES = {"kombu_queue", "kombu_message"}
+
 logger = logging.getLogger(__name__)

 ssl_context: ssl.SSLContext | None = None
@@ -64,36 +73,154 @@ def include_object(
    return True


-def get_schema_options() -> tuple[str, bool, bool]:
+def filter_tenants_by_range(
+    tenant_ids: list[str], start_range: int | None = None, end_range: int | None = None
+) -> list[str]:
+    """
+    Filter tenant IDs by alphabetical position range.
+
+    Only IDs that start with TENANT_ID_PREFIX are ranked and filtered; every
+    other schema name in `tenant_ids` is always kept.
+
+    Args:
+        tenant_ids: List of tenant IDs to filter
+        start_range: Starting position in alphabetically sorted list (1-based, inclusive).
+            Values below 1 are treated as 1.
+        end_range: Ending position in alphabetically sorted list (1-based, inclusive).
+            Values past the end are clamped; values below 1 select no tenants.
+
+    Returns:
+        Filtered list of tenant IDs in their original order
+    """
+    if start_range is None and end_range is None:
+        return tenant_ids
+
+    # Sort tenant schemas alphabetically.
+    # NOTE: can cause missed schemas if a schema is created in between workers
+    # fetching of all tenant IDs. We accept this risk for now. Just re-running
+    # the migration will fix the issue.
+    sorted_tenant_schemas = sorted(
+        tid for tid in tenant_ids if tid.startswith(TENANT_ID_PREFIX)
+    )
+    num_tenant_schemas = len(sorted_tenant_schemas)
+
+    # Convert the 1-based inclusive positions into a 0-based half-open slice:
+    # positions [start, end] -> indices [start - 1, end)
+    start_idx = start_range - 1 if start_range is not None else 0
+    end_idx = end_range if end_range is not None else num_tenant_schemas
+
+    # Clamp both bounds to [0, num_tenant_schemas]. A negative bound must never
+    # reach the slice, since Python would read it as counting from the end.
+    start_idx = min(max(0, start_idx), num_tenant_schemas)
+    end_idx = min(max(0, end_idx), num_tenant_schemas)
+
+    # Set membership keeps the final pass linear in the number of schemas
+    selected_tenant_schemas = set(sorted_tenant_schemas[start_idx:end_idx])
+
+    # Keep selected tenants plus all non-tenant schemas, in their original order
+    return [
+        tenant_id
+        for tenant_id in tenant_ids
+        if tenant_id in selected_tenant_schemas
+        or not tenant_id.startswith(TENANT_ID_PREFIX)
+    ]
+
+
+def get_schema_options() -> (
+    tuple[bool, bool, bool, int | None, int | None, list[str] | None]
+):
    x_args_raw = context.get_x_argument()
    x_args = {}
    for arg in x_args_raw:
-        for pair in arg.split(","):
-            if "=" in pair:
-                key, value = pair.split("=", 1)
-                x_args[key.strip()] = value.strip()
-    schema_name = x_args.get("schema", POSTGRES_DEFAULT_SCHEMA)
+        if "=" in arg:
+            key, value = arg.split("=", 1)
+            x_args[key.strip()] = value.strip()
+        else:
+            raise ValueError(f"Invalid argument: {arg}")
+
    create_schema = x_args.get("create_schema", "true").lower() == "true"
    upgrade_all_tenants = x_args.get("upgrade_all_tenants", "false").lower() == "true"

-    if (
-        MULTI_TENANT
-        and schema_name == POSTGRES_DEFAULT_SCHEMA
-        and not upgrade_all_tenants
-    ):
+    # continue on error with individual tenant
+    # only applies to online migrations
+    continue_on_error = x_args.get("continue", "false").lower() == "true"
+
+    # Tenant range filtering
+    tenant_range_start = None
+    tenant_range_end = None
+
+    if "tenant_range_start" in x_args:
+        try:
+            tenant_range_start = int(x_args["tenant_range_start"])
+        except ValueError:
+            raise ValueError(
+                f"Invalid tenant_range_start value: {x_args['tenant_range_start']}. Must be an integer."
+            )
+
+    if "tenant_range_end" in x_args:
+        try:
+            tenant_range_end = int(x_args["tenant_range_end"])
+        except ValueError:
+            raise ValueError(
+                f"Invalid tenant_range_end value: {x_args['tenant_range_end']}. Must be an integer."
+            )
+
+    # Validate range
+    if tenant_range_start is not None and tenant_range_end is not None:
+        if tenant_range_start > tenant_range_end:
+            raise ValueError(
+                f"tenant_range_start ({tenant_range_start}) cannot be greater than tenant_range_end ({tenant_range_end})"
+            )
+
+    # Specific schema names filtering (replaces both schema_name and the old tenant_ids approach)
+    schemas = None
+    if "schemas" in x_args:
+        schema_names_str = x_args["schemas"].strip()
+        if schema_names_str:
+            # Split by comma and strip whitespace
+            schemas = [
+                name.strip() for name in schema_names_str.split(",") if name.strip()
+            ]
+            if schemas:
+                logger.info(f"Specific schema names specified: {schemas}")
+
+    # Validate that only one method is used at a time
+    range_filtering = tenant_range_start is not None or tenant_range_end is not None
+    specific_filtering = schemas is not None and len(schemas) > 0
+
+    if range_filtering and specific_filtering:
        raise ValueError(
-            "Cannot run default migrations in public schema when multi-tenancy is enabled. "
-            "Please specify a tenant-specific schema."
+            "Cannot use both tenant range filtering (tenant_range_start/tenant_range_end) "
+            "and specific schema filtering (schemas) at the same time. "
+            "Please use only one filtering method."
        )

-    return schema_name, create_schema, upgrade_all_tenants
+    if upgrade_all_tenants and specific_filtering:
+        raise ValueError(
+            "Cannot use both upgrade_all_tenants=true and schemas at the same time. "
+            "Use either upgrade_all_tenants=true for all tenants, or schemas for specific schemas."
+        )
+
+    # If any filtering parameters are specified, we're not doing the default single schema migration
+    if range_filtering:
+        upgrade_all_tenants = True
+
+    # Validate multi-tenant requirements
+    if MULTI_TENANT and not upgrade_all_tenants and not specific_filtering:
+        raise ValueError(
+            "In multi-tenant mode, you must specify either upgrade_all_tenants=true "
+            "or provide schemas. Cannot run default migration."
+        )
+
+    return (
+        create_schema,
+        upgrade_all_tenants,
+        continue_on_error,
+        tenant_range_start,
+        tenant_range_end,
+        schemas,
+    )


 def do_run_migrations(
    connection: Connection, schema_name: str, create_schema: bool
 ) -> None:
-    logger.info(f"About to migrate schema: {schema_name}")
-
    if create_schema:
        connection.execute(text(f'CREATE SCHEMA IF NOT EXISTS "{schema_name}"'))
        connection.execute(text("COMMIT"))
@@ -134,7 +261,20 @@ def provide_iam_token_for_alembic(


 async def run_async_migrations() -> None:
-    schema_name, create_schema, upgrade_all_tenants = get_schema_options()
+    (
+        create_schema,
+        upgrade_all_tenants,
+        continue_on_error,
+        tenant_range_start,
+        tenant_range_end,
+        schemas,
+    ) = get_schema_options()
+
+    if not schemas and not MULTI_TENANT:
+        schemas = [POSTGRES_DEFAULT_SCHEMA]
+
+    # without init_engine, subsequent engine calls fail hard intentionally
+    SqlEngine.init_engine(pool_size=20, max_overflow=5)

    engine = create_async_engine(
        build_connection_string(),
@@ -149,11 +289,18 @@ async def run_async_migrations() -> None:
        ) -> None:
            provide_iam_token_for_alembic(dialect, conn_rec, cargs, cparams)

-    if upgrade_all_tenants:
-        tenant_schemas = get_all_tenant_ids()
-        for schema in tenant_schemas:
+    if schemas:
+        # Use specific schema names directly without fetching all tenants
+        logger.info(f"Migrating specific schema names: {schemas}")
+
+        i_schema = 0
+        num_schemas = len(schemas)
+        for schema in schemas:
+            i_schema += 1
+            logger.info(
+                f"Migrating schema: index={i_schema} num_schemas={num_schemas} schema={schema}"
+            )
            try:
-                logger.info(f"Migrating schema: {schema}")
                async with engine.connect() as connection:
                    await connection.run_sync(
                        do_run_migrations,
@@ -162,28 +309,108 @@ async def run_async_migrations() -> None:
                    )
            except Exception as e:
                logger.error(f"Error migrating schema {schema}: {e}")
-                raise
+                if not continue_on_error:
+                    logger.error("--continue=true is not set, raising exception!")
+                    raise
+
+                logger.warning("--continue=true is set, continuing to next schema.")
+
+    elif upgrade_all_tenants:
+        tenant_schemas = get_all_tenant_ids()
+
+        filtered_tenant_schemas = filter_tenants_by_range(
+            tenant_schemas, tenant_range_start, tenant_range_end
+        )
+
+        if tenant_range_start is not None or tenant_range_end is not None:
+            logger.info(
+                f"Filtering tenants by range: start={tenant_range_start}, end={tenant_range_end}"
+            )
+            logger.info(
+                f"Total tenants: {len(tenant_schemas)}, Filtered tenants: {len(filtered_tenant_schemas)}"
+            )
+
+        i_tenant = 0
+        num_tenants = len(filtered_tenant_schemas)
+        for schema in filtered_tenant_schemas:
+            i_tenant += 1
+            logger.info(
+                f"Migrating schema: index={i_tenant} num_tenants={num_tenants} schema={schema}"
+            )
+            try:
+                async with engine.connect() as connection:
+                    await connection.run_sync(
+                        do_run_migrations,
+                        schema_name=schema,
+                        create_schema=create_schema,
+                    )
+            except Exception as e:
+                logger.error(f"Error migrating schema {schema}: {e}")
+                if not continue_on_error:
+                    logger.error("--continue=true is not set, raising exception!")
+                    raise
+
+                logger.warning("--continue=true is set, continuing to next schema.")
+
    else:
-        try:
-            logger.info(f"Migrating schema: {schema_name}")
-            async with engine.connect() as connection:
-                await connection.run_sync(
-                    do_run_migrations,
-                    schema_name=schema_name,
-                    create_schema=create_schema,
-                )
-        except Exception as e:
-            logger.error(f"Error migrating schema {schema_name}: {e}")
-            raise
+        # This should not happen in the new design since we require either
+        # upgrade_all_tenants=true or schemas in multi-tenant mode
+        # and for non-multi-tenant mode, we should use schemas with the default schema
+        raise ValueError(
+            "No migration target specified. Use either upgrade_all_tenants=true for all tenants "
+            "or schemas for specific schemas."
+        )

    await engine.dispose()


 def run_migrations_offline() -> None:
-    schema_name, _, upgrade_all_tenants = get_schema_options()
+    """
+    NOTE(rkuo): This generates a sql script that can be used to migrate the database ...
+    instead of migrating the db live via an open connection
+
+    Not clear on when this would be used by us or if it even works.
+
+    If it is offline, then why are there calls to the db engine?
+
+    This doesn't really get used when we migrate in the cloud."""
+
+    logger.info("run_migrations_offline starting.")
+
+    # without init_engine, subsequent engine calls fail hard intentionally
+    SqlEngine.init_engine(pool_size=20, max_overflow=5)
+
+    (
+        create_schema,
+        upgrade_all_tenants,
+        continue_on_error,
+        tenant_range_start,
+        tenant_range_end,
+        schemas,
+    ) = get_schema_options()
    url = build_connection_string()

-    if upgrade_all_tenants:
+    if schemas:
+        # Use specific schema names directly without fetching all tenants
+        logger.info(f"Migrating specific schema names: {schemas}")
+
+        for schema in schemas:
+            logger.info(f"Migrating schema: {schema}")
+            context.configure(
+                url=url,
+                target_metadata=target_metadata,  # type: ignore
+                literal_binds=True,
+                include_object=include_object,
+                version_table_schema=schema,
+                include_schemas=True,
+                script_location=config.get_main_option("script_location"),
+                dialect_opts={"paramstyle": "named"},
+            )
+
+            with context.begin_transaction():
+                context.run_migrations()
+
+    elif upgrade_all_tenants:
        engine = create_async_engine(url)

        if USE_IAM_AUTH:
@@ -197,7 +424,19 @@ def run_migrations_offline() -> None:
        tenant_schemas = get_all_tenant_ids()
        engine.sync_engine.dispose()

-        for schema in tenant_schemas:
+        filtered_tenant_schemas = filter_tenants_by_range(
+            tenant_schemas, tenant_range_start, tenant_range_end
+        )
+
+        if tenant_range_start is not None or tenant_range_end is not None:
+            logger.info(
+                f"Filtering tenants by range: start={tenant_range_start}, end={tenant_range_end}"
+            )
+            logger.info(
+                f"Total tenants: {len(tenant_schemas)}, Filtered tenants: {len(filtered_tenant_schemas)}"
+            )
+
+        for schema in filtered_tenant_schemas:
            logger.info(f"Migrating schema: {schema}")
            context.configure(
                url=url,
@@ -213,23 +452,15 @@ def run_migrations_offline() -> None:
            with context.begin_transaction():
                context.run_migrations()
    else:
-        logger.info(f"Migrating schema: {schema_name}")
-        context.configure(
-            url=url,
-            target_metadata=target_metadata,  # type: ignore
-            literal_binds=True,
-            include_object=include_object,
-            version_table_schema=schema_name,
-            include_schemas=True,
-            script_location=config.get_main_option("script_location"),
-            dialect_opts={"paramstyle": "named"},
+        # This should not happen in the new design
+        raise ValueError(
+            "No migration target specified. Use either upgrade_all_tenants=true for all tenants "
+            "or schemas for specific schemas."
        )

-        with context.begin_transaction():
-            context.run_migrations()
-

 def run_migrations_online() -> None:
+    logger.info("run_migrations_online starting.")
    asyncio.run(run_async_migrations())


--- a/backend/alembic/versions/027381bce97c_add_shortcut_option_for_users.py
+++ b/backend/alembic/versions/027381bce97c_add_shortcut_option_for_users.py
@@ -5,6 +5,7 @@ Revises: 6fc7886d665d
 Create Date: 2025-01-14 12:14:00.814390

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/03bf8be6b53a_rework_kg_config.py
+++ b/backend/alembic/versions/03bf8be6b53a_rework_kg_config.py
@@ -0,0 +1,121 @@
+"""rework-kg-config
+
+Revision ID: 03bf8be6b53a
+Revises: 65bc6e0f8500
+Create Date: 2025-06-16 10:52:34.815335
+
+"""
+
+import json
+
+
+from datetime import datetime
+from datetime import timedelta
+from sqlalchemy.dialects import postgresql
+from sqlalchemy import text
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "03bf8be6b53a"
+down_revision = "65bc6e0f8500"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Move the KG settings from the `kg_config` table into `key_value_store`.
+
+    Reads every row of `kg_config`, builds one JSON object from those rows
+    (falling back to the defaults below for anything missing), stores it under
+    the 'kg_config' key of `key_value_store`, and then drops `kg_config`.
+    """
+    # get current config
+    current_configs = (
+        op.get_bind()
+        .execute(text("SELECT kg_variable_name, kg_variable_values FROM kg_config"))
+        .all()
+    )
+    # Rows with empty/NULL values are skipped so the defaults below apply.
+    # The two domain settings keep their whole list; every other setting keeps
+    # only its first element.
+    current_config_dict = {
+        config.kg_variable_name: (
+            config.kg_variable_values[0]
+            if config.kg_variable_name
+            not in ("KG_VENDOR_DOMAINS", "KG_IGNORE_EMAIL_DOMAINS")
+            else config.kg_variable_values
+        )
+        for config in current_configs
+        if config.kg_variable_values
+    }
+
+    # not using the KGConfigSettings model here in case it changes in the future
+    kg_config_settings = json.dumps(
+        {
+            "KG_EXPOSED": current_config_dict.get("KG_EXPOSED", False),
+            "KG_ENABLED": current_config_dict.get("KG_ENABLED", False),
+            "KG_VENDOR": current_config_dict.get("KG_VENDOR", None),
+            "KG_VENDOR_DOMAINS": current_config_dict.get("KG_VENDOR_DOMAINS", []),
+            "KG_IGNORE_EMAIL_DOMAINS": current_config_dict.get(
+                "KG_IGNORE_EMAIL_DOMAINS", []
+            ),
+            "KG_COVERAGE_START": current_config_dict.get(
+                "KG_COVERAGE_START",
+                (datetime.now() - timedelta(days=90)).strftime("%Y-%m-%d"),
+            ),
+            "KG_MAX_COVERAGE_DAYS": current_config_dict.get("KG_MAX_COVERAGE_DAYS", 90),
+            "KG_MAX_PARENT_RECURSION_DEPTH": current_config_dict.get(
+                "KG_MAX_PARENT_RECURSION_DEPTH", 2
+            ),
+            "KG_BETA_PERSONA_ID": current_config_dict.get("KG_BETA_PERSONA_ID", None),
+        }
+    )
+    # Bind the JSON text as a parameter instead of splicing it into the SQL.
+    # A single quote inside any stored value would otherwise break the
+    # statement, and a ":name" sequence would be parsed as a bind parameter.
+    # NOTE(review): the cast assumes key_value_store.value is a JSON/JSONB
+    # column - confirm against the table definition.
+    insert_stmt = text(
+        "INSERT INTO key_value_store (key, value) "
+        "VALUES ('kg_config', CAST(:kg_config_settings AS JSONB))"
+    ).bindparams(
+        sa.bindparam("kg_config_settings", kg_config_settings, type_=sa.String())
+    )
+    op.execute(insert_stmt)
+
+    # drop kg config table
+    op.drop_table("kg_config")
+
+
+def downgrade() -> None:
+    """Recreate the `kg_config` table from the 'kg_config' key_value_store row.
+
+    Starts from a fixed set of defaults, overlays whatever is stored under the
+    'kg_config' key (if present), writes one `kg_config` row per setting, and
+    finally deletes the key_value_store row.
+    """
+    # defaults, used for any setting that is missing from the stored value
+    current_config_dict = {
+        "KG_EXPOSED": False,
+        "KG_ENABLED": False,
+        "KG_VENDOR": [],
+        "KG_VENDOR_DOMAINS": [],
+        "KG_IGNORE_EMAIL_DOMAINS": [],
+        "KG_COVERAGE_START": (datetime.now() - timedelta(days=90)).strftime("%Y-%m-%d"),
+        "KG_MAX_COVERAGE_DAYS": 90,
+        "KG_MAX_PARENT_RECURSION_DEPTH": 2,
+    }
+    # read the stored settings before any table is created or modified
+    current_configs = (
+        op.get_bind()
+        .execute(text("SELECT value FROM key_value_store WHERE key = 'kg_config'"))
+        .one_or_none()
+    )
+    if current_configs is not None:
+        # NOTE(review): assumes the driver returns `value` already parsed into a
+        # mapping (JSON/JSONB column); dict.update would reject a plain JSON
+        # string - TODO confirm.
+        current_config_dict.update(current_configs[0])
+    # kg_variable_values is a string array: lists are stored as-is, any other
+    # value becomes a one-element list, with booleans lowercased ("true"/"false").
+    # NOTE(review): a None value (upgrade can store None for KG_VENDOR and
+    # KG_BETA_PERSONA_ID) becomes the string "None" here - confirm this is intended.
+    insert_values = [
+        {
+            "kg_variable_name": name,
+            "kg_variable_values": (
+                [str(val).lower() if isinstance(val, bool) else str(val)]
+                if not isinstance(val, list)
+                else val
+            ),
+        }
+        for name, val in current_config_dict.items()
+    ]
+
+    # recreate the table that upgrade() dropped
+    op.create_table(
+        "kg_config",
+        sa.Column("id", sa.Integer(), primary_key=True, nullable=False, index=True),
+        sa.Column("kg_variable_name", sa.String(), nullable=False, index=True),
+        sa.Column("kg_variable_values", postgresql.ARRAY(sa.String()), nullable=False),
+        sa.UniqueConstraint("kg_variable_name", name="uq_kg_config_variable_name"),
+    )
+    # lightweight table stub: only the two columns being written are declared
+    op.bulk_insert(
+        sa.table(
+            "kg_config",
+            sa.column("kg_variable_name", sa.String),
+            sa.column("kg_variable_values", postgresql.ARRAY(sa.String)),
+        ),
+        insert_values,
+    )
+
+    # remove the key-value row now that its contents live in kg_config again
+    op.execute("DELETE FROM key_value_store WHERE key = 'kg_config'")
--- a/backend/alembic/versions/0816326d83aa_add_federated_connector_tables.py
+++ b/backend/alembic/versions/0816326d83aa_add_federated_connector_tables.py
@@ -0,0 +1,72 @@
+"""add federated connector tables
+
+Revision ID: 0816326d83aa
+Revises: 12635f6655b7
+Create Date: 2025-06-29 14:09:45.109518
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+
+# revision identifiers, used by Alembic.
+revision = "0816326d83aa"
+down_revision = "12635f6655b7"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Create the federated connector tables.
+
+    Creates ``federated_connector`` plus two tables that reference it:
+    ``federated_connector_oauth_token`` (also references ``user``) and
+    ``federated_connector__document_set`` (also references ``document_set``).
+    All foreign keys use ON DELETE CASCADE.
+    """
+
+    def connector_fk() -> sa.ForeignKeyConstraint:
+        # Build a fresh constraint object for each table that needs it
+        return sa.ForeignKeyConstraint(
+            ["federated_connector_id"], ["federated_connector.id"], ondelete="CASCADE"
+        )
+
+    # Parent table
+    op.create_table(
+        "federated_connector",
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("source", sa.String(), nullable=False),
+        sa.Column("credentials", sa.LargeBinary(), nullable=False),
+        sa.PrimaryKeyConstraint("id"),
+    )
+
+    # Per-user OAuth tokens for a federated connector
+    user_fk = sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE")
+    op.create_table(
+        "federated_connector_oauth_token",
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("federated_connector_id", sa.Integer(), nullable=False),
+        sa.Column("user_id", postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column("token", sa.LargeBinary(), nullable=False),
+        sa.Column("expires_at", sa.DateTime(), nullable=True),
+        connector_fk(),
+        user_fk,
+        sa.PrimaryKeyConstraint("id"),
+    )
+
+    # Association between federated connectors and document sets;
+    # each (connector, document set) pair may appear only once
+    document_set_fk = sa.ForeignKeyConstraint(
+        ["document_set_id"], ["document_set.id"], ondelete="CASCADE"
+    )
+    pair_unique = sa.UniqueConstraint(
+        "federated_connector_id",
+        "document_set_id",
+        name="uq_federated_connector_document_set",
+    )
+    op.create_table(
+        "federated_connector__document_set",
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("federated_connector_id", sa.Integer(), nullable=False),
+        sa.Column("document_set_id", sa.Integer(), nullable=False),
+        sa.Column("entities", postgresql.JSONB(), nullable=False),
+        connector_fk(),
+        document_set_fk,
+        sa.PrimaryKeyConstraint("id"),
+        pair_unique,
+    )
+
+
+def downgrade() -> None:
+    """Drop the federated connector tables, children before the parent."""
+    # Referencing tables go first so the foreign keys never block a drop
+    for table_name in (
+        "federated_connector__document_set",
+        "federated_connector_oauth_token",
+        "federated_connector",
+    ):
+        op.drop_table(table_name)
--- a/backend/alembic/versions/08a1eda20fe1_add_earliest_indexing_to_connector.py
+++ b/backend/alembic/versions/08a1eda20fe1_add_earliest_indexing_to_connector.py
@@ -5,6 +5,7 @@ Revises: 8a87bd6ec550
 Create Date: 2024-07-23 11:12:39.462397

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/0a2b51deb0b8_add_starter_prompts.py
+++ b/backend/alembic/versions/0a2b51deb0b8_add_starter_prompts.py
@@ -5,6 +5,7 @@ Revises: 5f4b8568a221
 Create Date: 2024-03-02 23:23:49.960309

 """
+
 from alembic import op
 import sqlalchemy as sa
 from sqlalchemy.dialects import postgresql
--- a/backend/alembic/versions/0a98909f2757_enable_encrypted_fields.py
+++ b/backend/alembic/versions/0a98909f2757_enable_encrypted_fields.py
@@ -5,6 +5,7 @@ Revises: 570282d33c49
 Create Date: 2024-05-05 19:30:34.317972

 """
+
 from alembic import op
 import sqlalchemy as sa
 from sqlalchemy.sql import table
--- a/backend/alembic/versions/0ebb1d516877_add_ccpair_deletion_failure_message.py
+++ b/backend/alembic/versions/0ebb1d516877_add_ccpair_deletion_failure_message.py
@@ -5,6 +5,7 @@ Revises: 52a219fb5233
 Create Date: 2024-09-10 15:03:48.233926

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/0f7ff6d75b57_add_index_to_index_attempt_time_created.py
+++ b/backend/alembic/versions/0f7ff6d75b57_add_index_to_index_attempt_time_created.py
@@ -5,6 +5,7 @@ Revises: 369644546676
 Create Date: 2025-01-10 14:01:14.067144

 """
+
 from alembic import op

 # revision identifiers, used by Alembic.
--- a/backend/alembic/versions/12635f6655b7_drive_canonical_ids.py
+++ b/backend/alembic/versions/12635f6655b7_drive_canonical_ids.py
@@ -0,0 +1,596 @@
+"""drive-canonical-ids
+
+Revision ID: 12635f6655b7
+Revises: 58c50ef19f08
+Create Date: 2025-06-20 14:44:54.241159
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+from urllib.parse import urlparse, urlunparse
+from httpx import HTTPStatusError
+import httpx
+from onyx.document_index.factory import get_default_document_index
+from onyx.db.search_settings import SearchSettings
+from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
+from onyx.document_index.vespa.shared_utils.utils import (
+    replace_invalid_doc_id_characters,
+)
+from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
+from onyx.utils.logger import setup_logger
+import os
+
+logger = setup_logger()
+
+# revision identifiers, used by Alembic.
+revision = "12635f6655b7"
+down_revision = "58c50ef19f08"
+branch_labels = None
+depends_on = None
+
+# Opt-out switch read from the environment. upgrade() returns immediately while
+# this is true, and the default value is "true" (compared case-insensitively),
+# so the canonicalization only runs when SKIP_CANON_DRIVE_IDS is set to
+# something other than "true".
+SKIP_CANON_DRIVE_IDS = os.environ.get("SKIP_CANON_DRIVE_IDS", "true").lower() == "true"
+
+
+def active_search_settings() -> tuple[SearchSettings, SearchSettings | None]:
+    """Load the newest PRESENT and FUTURE rows of ``search_settings``.
+
+    Returns:
+        ``(current, future)``; ``future`` is None when no FUTURE row exists.
+
+    Raises:
+        RuntimeError: if no PRESENT row exists, or if either value ends up
+            with an unexpected type.
+    """
+
+    def newest_row_as_settings(query: str) -> SearchSettings | None:
+        # Build a SearchSettings object from the first returned row, if any
+        fetched = op.get_bind().execute(sa.text(query)).fetchall()
+        if not fetched:
+            return None
+        return SearchSettings(**fetched[0]._asdict())
+
+    current = newest_row_as_settings(
+        """
+        SELECT * FROM search_settings WHERE status = 'PRESENT' ORDER BY id DESC LIMIT 1
+        """
+    )
+    upcoming = newest_row_as_settings(
+        """
+        SELECT * FROM search_settings WHERE status = 'FUTURE' ORDER BY id DESC LIMIT 1
+        """
+    )
+
+    # A current configuration is mandatory; a future one is optional
+    if not isinstance(current, SearchSettings):
+        raise RuntimeError("current search settings is of type " + str(type(current)))
+    if upcoming is not None and not isinstance(upcoming, SearchSettings):
+        raise RuntimeError("future search settings is of type " + str(type(upcoming)))
+
+    return current, upcoming
+
+
+def normalize_google_drive_url(url: str) -> str:
+    """Return the canonical form of a Google Drive URL used as a document ID.
+
+    The query string is dropped, and a final path segment of ``edit``,
+    ``view`` or ``preview`` is removed. Everything else (scheme, host,
+    remaining path, fragment) is kept as parsed.
+    NOTE: copied from drive doc_conversion.py
+    """
+    parts = urlparse(url)._replace(query="")
+    segments = parts.path.split("/")
+    # str.split always yields at least one element, so [-1] is safe
+    if segments[-1] in ("edit", "view", "preview"):
+        parts = parts._replace(path="/".join(segments[:-1]))
+    return urlunparse(parts)
+
+
+def get_google_drive_documents_from_database() -> list[dict]:
+    """Get all Google Drive documents from the database.
+
+    Returns:
+        One ``{"document_id": <id>}`` dict per distinct document that is linked,
+        through ``document_by_connector_credential_pair``, to a connector whose
+        source is ``GOOGLE_DRIVE``.
+    """
+    bind = op.get_bind()
+    # DISTINCT matters: the join yields one row per connector/credential pair a
+    # document is attached to. A document returned twice would be seen by the
+    # caller's duplicate detection as a duplicate of itself and get deleted.
+    result = bind.execute(
+        sa.text(
+            """
+            SELECT DISTINCT d.id
+            FROM document d
+            JOIN document_by_connector_credential_pair dcc ON d.id = dcc.id
+            JOIN connector_credential_pair cc ON dcc.connector_id = cc.connector_id
+                AND dcc.credential_id = cc.credential_id
+            JOIN connector c ON cc.connector_id = c.id
+            WHERE c.source = 'GOOGLE_DRIVE'
+        """
+        )
+    )
+
+    return [{"document_id": row.id} for row in result]
+
+
+def update_document_id_in_database(
+    old_doc_id: str, new_doc_id: str, index_name: str
+) -> None:
+    """Update document IDs in all relevant database tables using copy-and-swap approach.
+
+    If a document with *new_doc_id* already exists, the old document is treated
+    as a duplicate and removed via ``delete_document_from_db``. Otherwise the
+    document row is copied under the new ID, every referencing table is
+    repointed to it, and the old row is deleted.
+
+    Statements that are allowed to fail (columns or tables that may not exist
+    in all installations) run inside SAVEPOINTs. PostgreSQL aborts the whole
+    transaction when a statement fails, so without a savepoint the fallback
+    statements after a caught error would fail as well.
+
+    Args:
+        old_doc_id: Current document ID.
+        new_doc_id: Canonical document ID to switch to.
+        index_name: Vespa index name, forwarded when the old document has to be
+            deleted as a duplicate.
+    """
+    bind = op.get_bind()
+    id_params = {"new_id": new_doc_id, "old_id": old_doc_id}
+
+    # Check if new document ID already exists
+    result = bind.execute(
+        sa.text("SELECT COUNT(*) FROM document WHERE id = :new_id"),
+        {"new_id": new_doc_id},
+    )
+    row = result.fetchone()
+    if row and row[0] > 0:
+        # The canonical document is already there; the old one is a duplicate
+        delete_document_from_db(old_doc_id, index_name)
+        return
+
+    # Step 1: Create a new document row with the new ID (copy all fields from old row)
+    # Use a conservative approach to handle columns that might not exist in all installations
+    try:
+        # The savepoint keeps the surrounding transaction usable if this fails
+        with bind.begin_nested():
+            bind.execute(
+                sa.text(
+                    """
+                    INSERT INTO document (id, from_ingestion_api, boost, hidden, semantic_id,
+                                        link, doc_updated_at, primary_owners, secondary_owners,
+                                        external_user_emails, external_user_group_ids, is_public,
+                                        chunk_count, last_modified, last_synced, kg_stage, kg_processing_time)
+                    SELECT :new_id, from_ingestion_api, boost, hidden, semantic_id,
+                           link, doc_updated_at, primary_owners, secondary_owners,
+                           external_user_emails, external_user_group_ids, is_public,
+                           chunk_count, last_modified, last_synced, kg_stage, kg_processing_time
+                    FROM document
+                    WHERE id = :old_id
+                """
+                ),
+                id_params,
+            )
+    except Exception as e:
+        # If the full INSERT fails, try a more basic version with only core columns
+        logger.warning(f"Full INSERT failed, trying basic version: {e}")
+        bind.execute(
+            sa.text(
+                """
+                INSERT INTO document (id, from_ingestion_api, boost, hidden, semantic_id,
+                                    link, doc_updated_at, primary_owners, secondary_owners)
+                SELECT :new_id, from_ingestion_api, boost, hidden, semantic_id,
+                       link, doc_updated_at, primary_owners, secondary_owners
+                FROM document
+                WHERE id = :old_id
+            """
+            ),
+            id_params,
+        )
+
+    # Step 2: Update all foreign key references to point to the new ID
+    # (table, column holding the document ID); failures here are not tolerated
+    required_references = (
+        ("document_by_connector_credential_pair", "id"),
+        # search_doc stores search results for chat replay
+        ("search_doc", "document_id"),
+        ("document_retrieval_feedback", "document_id"),
+        ("document__tag", "document_id"),
+        ("user_file", "document_id"),
+    )
+    for table_name, column_name in required_references:
+        bind.execute(
+            sa.text(
+                f"UPDATE {table_name} SET {column_name} = :new_id "
+                f"WHERE {column_name} = :old_id"
+            ),
+            id_params,
+        )
+
+    # KG and chunk_stats tables (these may not exist in all installations).
+    # Each statement gets its own savepoint so one missing table neither skips
+    # the remaining tables nor poisons the transaction.
+    optional_references = (
+        ("kg_entity", "document_id"),
+        ("kg_entity_extraction_staging", "document_id"),
+        ("kg_relationship", "source_document"),
+        ("kg_relationship_extraction_staging", "source_document"),
+        ("chunk_stats", "document_id"),
+    )
+    optional_statements: list[tuple[str, dict]] = [
+        (
+            f"UPDATE {table_name} SET {column_name} = :new_id "
+            f"WHERE {column_name} = :old_id",
+            id_params,
+        )
+        for table_name, column_name in optional_references
+    ]
+    # chunk_stats.id embeds the document ID as a prefix, so rewrite it as well
+    optional_statements.append(
+        (
+            """
+            UPDATE chunk_stats
+            SET id = REPLACE(id, :old_id, :new_id)
+            WHERE id LIKE :old_id_pattern
+            """,
+            {**id_params, "old_id_pattern": f"{old_doc_id}__%"},
+        )
+    )
+    for statement, params in optional_statements:
+        try:
+            with bind.begin_nested():
+                bind.execute(sa.text(statement), params)
+        except Exception as e:
+            logger.warning(
+                f"Some KG/chunk tables may not exist or failed to update: {e}"
+            )
+
+    # Step 3: Delete the old document row (this should now be safe since all FKs point to new row)
+    bind.execute(
+        sa.text("DELETE FROM document WHERE id = :old_id"), {"old_id": old_doc_id}
+    )
+
+
+def _visit_chunks(
+    *,
+    http_client: httpx.Client,
+    index_name: str,
+    selection: str,
+    continuation: str | None = None,
+) -> tuple[list[dict], str | None]:
+    """Issue one /document/v1 visit request and return ``(documents, continuation)``.
+
+    Args:
+        http_client: Client used for the GET request.
+        index_name: Substituted into ``DOCUMENT_ID_ENDPOINT`` to build the URL.
+        selection: Document selection expression sent as ``selection``.
+        continuation: Token from a previous page; omitted from the request when
+            falsy.
+
+    Returns:
+        The ``documents`` list of the response (empty list if absent) and the
+        ``continuation`` token of the response (None if absent).
+
+    Raises:
+        httpx.HTTPStatusError: via ``raise_for_status`` on an error response.
+    """
+    query: dict[str, str] = {"selection": selection, "wantedDocumentCount": "1000"}
+    if continuation:
+        query["continuation"] = continuation
+
+    # Same URL as the document API; the query parameters make it a visit
+    response = http_client.get(
+        DOCUMENT_ID_ENDPOINT.format(index_name=index_name),
+        params=query,
+        timeout=None,
+    )
+    response.raise_for_status()
+
+    body = response.json()
+    return body.get("documents", []), body.get("continuation")
+
+
+def delete_document_chunks_from_vespa(index_name: str, doc_id: str) -> None:
+    """Delete all chunks for *doc_id* from Vespa using continuation-token paging (no offset).
+
+    Deletion is best-effort: a chunk that cannot be deleted is logged and
+    skipped. Errors from the visit request itself propagate to the caller.
+
+    Args:
+        index_name: Vespa index to visit and delete from.
+        doc_id: Value of the chunks' ``document_id`` field to match exactly.
+    """
+    # Use exact match instead of contains - Document Selector Language doesn't support contains
+    selection = f'{index_name}.document_id=="{doc_id}"'
+    document_endpoint = DOCUMENT_ID_ENDPOINT.format(index_name=index_name)
+
+    with get_vespa_http_client() as http_client:
+        continuation: str | None = None
+        while True:
+            docs, continuation = _visit_chunks(
+                http_client=http_client,
+                index_name=index_name,
+                selection=selection,
+                continuation=continuation,
+            )
+
+            for doc in docs:
+                vespa_full_id = doc.get("id")
+                if not vespa_full_id:
+                    continue
+
+                # The last "::"-separated part of the full ID addresses the chunk
+                vespa_doc_uuid = vespa_full_id.split("::")[-1]
+                delete_url = f"{document_endpoint}/{vespa_doc_uuid}"
+
+                try:
+                    resp = http_client.delete(delete_url)
+                    resp.raise_for_status()
+                except Exception as e:
+                    logger.warning(f"Failed to delete chunk {vespa_doc_uuid}: {e}")
+
+            # Only a missing continuation token ends the visit. A page may be
+            # empty while a token is still returned, so an empty page alone
+            # must not stop the paging.
+            if not continuation:
+                break
+
+
+def update_document_id_in_vespa(
+    index_name: str, old_doc_id: str, new_doc_id: str
+) -> None:
+    """Update all chunks' document_id field from *old_doc_id* to *new_doc_id* using continuation paging.
+
+    Args:
+        index_name: Vespa index to visit and update.
+        old_doc_id: Value of ``document_id`` to match exactly.
+        new_doc_id: New ID; it is passed through
+            ``replace_invalid_doc_id_characters`` before being assigned.
+
+    Raises:
+        Exception: the first failed chunk update is logged and re-raised, which
+            stops the update for this document.
+    """
+    clean_new_doc_id = replace_invalid_doc_id_characters(new_doc_id)
+
+    # Use exact match instead of contains - Document Selector Language doesn't support contains
+    selection = f'{index_name}.document_id=="{old_doc_id}"'
+    document_endpoint = DOCUMENT_ID_ENDPOINT.format(index_name=index_name)
+    # Partial update that assigns only the document_id field
+    update_request = {"fields": {"document_id": {"assign": clean_new_doc_id}}}
+
+    with get_vespa_http_client() as http_client:
+        continuation: str | None = None
+        while True:
+            docs, continuation = _visit_chunks(
+                http_client=http_client,
+                index_name=index_name,
+                selection=selection,
+                continuation=continuation,
+            )
+
+            for doc in docs:
+                vespa_full_id = doc.get("id")
+                if not vespa_full_id:
+                    continue
+
+                # The last "::"-separated part of the full ID addresses the chunk
+                vespa_doc_uuid = vespa_full_id.split("::")[-1]
+                vespa_url = f"{document_endpoint}/{vespa_doc_uuid}"
+
+                try:
+                    resp = http_client.put(vespa_url, json=update_request)
+                    resp.raise_for_status()
+                except Exception as e:
+                    logger.error(f"Failed to update chunk {vespa_doc_uuid}: {e}")
+                    raise
+
+            # Only a missing continuation token ends the visit. A page may be
+            # empty while a token is still returned, so an empty page alone
+            # must not stop the paging.
+            if not continuation:
+                break
+
+
+def delete_document_from_db(current_doc_id: str, index_name: str) -> None:
+    """Delete a document, everything that references it, and its Vespa chunks.
+
+    The operation is best-effort: any failure is logged and swallowed so the
+    caller can continue with other documents. All database deletes run inside
+    one SAVEPOINT, so a failure rolls back the partial deletes for this
+    document and leaves the surrounding transaction usable. Deletes from
+    tables that may not exist get their own nested SAVEPOINT and are skipped
+    individually on failure.
+
+    Args:
+        current_doc_id: ID of the document to remove.
+        index_name: Vespa index the document's chunks are deleted from.
+    """
+    doc_param = {"doc_id": current_doc_id}
+
+    # Order matters due to foreign keys: rows referencing search_doc go before
+    # search_doc itself, and everything goes before the document row.
+    required_deletes = (
+        """
+        DELETE FROM agent__sub_query__search_doc
+        WHERE search_doc_id IN (
+            SELECT id FROM search_doc WHERE document_id = :doc_id
+        )
+        """,
+        """
+        DELETE FROM chat_message__search_doc
+        WHERE search_doc_id IN (
+            SELECT id FROM search_doc WHERE document_id = :doc_id
+        )
+        """,
+        "DELETE FROM search_doc WHERE document_id = :doc_id",
+        "DELETE FROM document_by_connector_credential_pair WHERE id = :doc_id",
+        "DELETE FROM document_retrieval_feedback WHERE document_id = :doc_id",
+        "DELETE FROM document__tag WHERE document_id = :doc_id",
+        "DELETE FROM user_file WHERE document_id = :doc_id",
+    )
+
+    # KG / chunk_stats tables may not exist in all installations
+    optional_deletes: tuple[tuple[str, dict], ...] = (
+        ("DELETE FROM kg_entity WHERE document_id = :doc_id", doc_param),
+        (
+            "DELETE FROM kg_entity_extraction_staging WHERE document_id = :doc_id",
+            doc_param,
+        ),
+        ("DELETE FROM kg_relationship WHERE source_document = :doc_id", doc_param),
+        (
+            "DELETE FROM kg_relationship_extraction_staging WHERE source_document = :doc_id",
+            doc_param,
+        ),
+        ("DELETE FROM chunk_stats WHERE document_id = :doc_id", doc_param),
+        (
+            "DELETE FROM chunk_stats WHERE id LIKE :doc_id_pattern",
+            {"doc_id_pattern": f"{current_doc_id}__%"},
+        ),
+    )
+
+    try:
+        bind = op.get_bind()
+
+        with bind.begin_nested():
+            for statement in required_deletes:
+                bind.execute(sa.text(statement), doc_param)
+
+            for statement, params in optional_deletes:
+                try:
+                    with bind.begin_nested():
+                        bind.execute(sa.text(statement), params)
+                except Exception as e:
+                    logger.warning(
+                        f"Some KG/chunk tables may not exist or failed to delete from: {e}"
+                    )
+
+            # Finally delete the document itself
+            bind.execute(
+                sa.text("DELETE FROM document WHERE id = :doc_id"),
+                doc_param,
+            )
+
+        # Delete chunks from vespa
+        delete_document_chunks_from_vespa(index_name, current_doc_id)
+
+    except Exception as e:
+        # Continue with other documents instead of failing the entire migration
+        logger.warning(f"Failed to delete duplicate document {current_doc_id}: {e}")
+
+
+def upgrade() -> None:
+    """Canonicalize Google Drive document IDs in the database and in Vespa.
+
+    Does nothing when ``SKIP_CANON_DRIVE_IDS`` is true. Otherwise every Google
+    Drive document whose ID changes under ``normalize_google_drive_url`` is
+    re-keyed, and a document whose canonical ID was already produced by an
+    earlier document is deleted as a duplicate. A failure on one document is
+    logged and the migration continues with the next one.
+    """
+    if SKIP_CANON_DRIVE_IDS:
+        return
+    current_search_settings, future_search_settings = active_search_settings()
+    document_index = get_default_document_index(
+        current_search_settings,
+        future_search_settings,
+    )
+
+    # Get the index name
+    if hasattr(document_index, "index_name"):
+        index_name = document_index.index_name
+    else:
+        # Default index name if we can't get it from the document_index
+        index_name = "danswer_index"
+
+    # Get all Google Drive documents from the database (this is faster and more reliable)
+    gdrive_documents = get_google_drive_documents_from_database()
+
+    if not gdrive_documents:
+        return
+
+    # Raw IDs already handled, and canonical IDs already claimed by a document
+    seen_doc_ids: set[str] = set()
+    all_normalized_doc_ids: set[str] = set()
+    updated_count = 0
+
+    for doc_info in gdrive_documents:
+        current_doc_id = doc_info["document_id"]
+
+        # Never process the same document twice: on the second pass its
+        # canonical ID is already claimed (by itself), so it would be deleted
+        # as a "duplicate" even though it is the only copy.
+        if current_doc_id in seen_doc_ids:
+            continue
+        seen_doc_ids.add(current_doc_id)
+
+        normalized_doc_id = normalize_google_drive_url(current_doc_id)
+
+        logger.info(f"Processing document {current_doc_id} -> {normalized_doc_id}")
+        # A different document already maps to this canonical ID: drop this one
+        if normalized_doc_id in all_normalized_doc_ids:
+            delete_document_from_db(current_doc_id, index_name)
+            continue
+
+        all_normalized_doc_ids.add(normalized_doc_id)
+
+        # If the document ID is already canonical, there is nothing to update
+        if current_doc_id == normalized_doc_id:
+            continue
+
+        try:
+            # Update both database and Vespa in order
+            # Database first to ensure consistency
+            update_document_id_in_database(
+                current_doc_id, normalized_doc_id, index_name
+            )
+
+            # Vespa chunks are selected by an exact match on the original
+            # document ID and reassigned to the canonical one
+            update_document_id_in_vespa(index_name, current_doc_id, normalized_doc_id)
+            updated_count += 1
+        except Exception as e:
+            logger.error(f"Failed to update document {current_doc_id}: {e}")
+
+            if isinstance(e, HTTPStatusError):
+                # Request/response headers are deliberately not logged: they
+                # can carry credentials.
+                logger.error(f"Status: {e.response.status_code}")
+                logger.error(f"Response: {e.response.text}")
+                logger.error(f"Request: {e.request.url}")
+            # Note: Rollback is complex with copy-and-swap approach since the old document is already deleted
+            # In case of failure, manual intervention may be required
+            # Continue with other documents instead of failing the entire migration
+            continue
+
+    logger.info(f"Migration complete. Updated {updated_count} Google Drive documents")
+
+
+def downgrade() -> None:
+    """Intentionally do nothing: this migration is one-way.
+
+    Reversing it would require having stored the stripped query parameters
+    and the removed duplicate documents, which is not done.
+    """
+    return None
--- a/backend/alembic/versions/15326fcec57e_introduce_onyx_apis.py
+++ b/backend/alembic/versions/15326fcec57e_introduce_onyx_apis.py
@@ -5,6 +5,7 @@ Revises: 77d07dffae64
 Create Date: 2023-11-11 20:51:24.228999

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/173cae5bba26_port_config_store.py
+++ b/backend/alembic/versions/173cae5bba26_port_config_store.py
@@ -5,6 +5,7 @@ Revises: e50154680a5c
 Create Date: 2024-03-19 15:30:44.425436

 """
+
 from alembic import op
 import sqlalchemy as sa
 from sqlalchemy.dialects import postgresql
--- a/backend/alembic/versions/177de57c21c9_display_custom_llm_models.py
+++ b/backend/alembic/versions/177de57c21c9_display_custom_llm_models.py
@@ -5,6 +5,7 @@ Revises: 4ee1287bd26a
 Create Date: 2024-11-21 11:49:04.488677

 """
+
 from alembic import op
 import sqlalchemy as sa
 from sqlalchemy.dialects import postgresql
--- a/backend/alembic/versions/1a03d2c2856b_add_indexes_to_document__tag.py
+++ b/backend/alembic/versions/1a03d2c2856b_add_indexes_to_document__tag.py
@@ -5,6 +5,7 @@ Revises: 9c00a2bccb83
 Create Date: 2025-02-18 10:45:13.957807

 """
+
 from alembic import op

 # revision identifiers, used by Alembic.
--- a/backend/alembic/versions/1b10e1fda030_add_additional_data_to_notifications.py
+++ b/backend/alembic/versions/1b10e1fda030_add_additional_data_to_notifications.py
@@ -5,6 +5,7 @@ Revises: 6756efa39ada
 Create Date: 2024-10-15 19:26:44.071259

 """
+
 from alembic import op
 import sqlalchemy as sa
 from sqlalchemy.dialects import postgresql
--- a/backend/alembic/versions/1b8206b29c5d_add_user_delete_cascades.py
+++ b/backend/alembic/versions/1b8206b29c5d_add_user_delete_cascades.py
@@ -5,6 +5,7 @@ Revises: 35e6853a51d5
 Create Date: 2024-09-18 11:48:59.418726

 """
+
 from alembic import op


--- a/backend/alembic/versions/213fd978c6d8_notifications.py
+++ b/backend/alembic/versions/213fd978c6d8_notifications.py
@@ -5,6 +5,7 @@ Revises: 5fc1f54cc252
 Create Date: 2024-08-10 11:13:36.070790

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/238b84885828_add_foreign_key_to_user__external_user_.py
+++ b/backend/alembic/versions/238b84885828_add_foreign_key_to_user__external_user_.py
@@ -0,0 +1,45 @@
+"""Add foreign key to user__external_user_group_id
+
+Revision ID: 238b84885828
+Revises: a7688ab35c45
+Create Date: 2025-05-19 17:15:33.424584
+
+"""
+
+from alembic import op
+
+
+# revision identifiers, used by Alembic.
+revision = "238b84885828"
+down_revision = "a7688ab35c45"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Link ``user__external_user_group_id.cc_pair_id`` to ``connector_credential_pair.id``.
+
+    Rows whose ``cc_pair_id`` matches no existing connector_credential_pair are
+    removed first, then the foreign key is added with ON DELETE CASCADE.
+    """
+    constraint_name = "fk_user__external_user_group_id_cc_pair_id"
+    source_table = "user__external_user_group_id"
+    target_table = "connector_credential_pair"
+
+    # Orphaned rows have to go before the constraint is created
+    op.execute(
+        """
+        DELETE FROM user__external_user_group_id
+        WHERE cc_pair_id NOT IN (SELECT id FROM connector_credential_pair)
+        """
+    )
+
+    op.create_foreign_key(
+        constraint_name,
+        source_table,
+        target_table,
+        ["cc_pair_id"],
+        ["id"],
+        ondelete="CASCADE",
+    )
+
+
+def downgrade() -> None:
+    """Remove the foreign key from ``user__external_user_group_id``.
+
+    Rows deleted by the upgrade's cleanup step are not restored.
+    """
+    constraint_name = "fk_user__external_user_group_id_cc_pair_id"
+    table_name = "user__external_user_group_id"
+    op.drop_constraint(constraint_name, table_name, type_="foreignkey")
--- a/backend/alembic/versions/23957775e5f5_remove_feedback_foreignkey_constraint.py
+++ b/backend/alembic/versions/23957775e5f5_remove_feedback_foreignkey_constraint.py
@@ -5,6 +5,7 @@ Revises: bc9771dccadf
 Create Date: 2024-06-27 16:04:51.480437

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/2666d766cb9b_google_oauth2.py
+++ b/backend/alembic/versions/2666d766cb9b_google_oauth2.py
@@ -5,6 +5,7 @@ Revises: 6d387b3196c2
 Create Date: 2023-05-05 15:49:35.716016

 """
+
 import fastapi_users_db_sqlalchemy
 import sqlalchemy as sa
 from alembic import op
--- a/backend/alembic/versions/26b931506ecb_default_chosen_assistants_to_none.py
+++ b/backend/alembic/versions/26b931506ecb_default_chosen_assistants_to_none.py
@@ -5,6 +5,7 @@ Revises: 2daa494a0851
 Create Date: 2024-11-12 13:23:29.858995

 """
+
 from alembic import op
 import sqlalchemy as sa
 from sqlalchemy.dialects import postgresql
--- a/backend/alembic/versions/27c6ecc08586_permission_framework.py
+++ b/backend/alembic/versions/27c6ecc08586_permission_framework.py
@@ -5,6 +5,7 @@ Revises: 2666d766cb9b
 Create Date: 2023-05-24 18:45:17.244495

 """
+
 import fastapi_users_db_sqlalchemy
 import sqlalchemy as sa
 from alembic import op
@@ -143,27 +144,34 @@ def upgrade() -> None:

 def downgrade() -> None:
    op.execute("TRUNCATE TABLE index_attempt")
-    op.add_column(
-        "index_attempt",
-        sa.Column("input_type", sa.VARCHAR(), autoincrement=False, nullable=False),
-    )
-    op.add_column(
-        "index_attempt",
-        sa.Column("source", sa.VARCHAR(), autoincrement=False, nullable=False),
-    )
-    op.add_column(
-        "index_attempt",
-        sa.Column(
-            "connector_specific_config",
-            postgresql.JSONB(astext_type=sa.Text()),
-            autoincrement=False,
-            nullable=False,
-        ),
-    )
-
-    # Check if the constraint exists before dropping
    conn = op.get_bind()
    inspector = sa.inspect(conn)
+    existing_columns = {col["name"] for col in inspector.get_columns("index_attempt")}
+
+    if "input_type" not in existing_columns:
+        op.add_column(
+            "index_attempt",
+            sa.Column("input_type", sa.VARCHAR(), autoincrement=False, nullable=False),
+        )
+
+    if "source" not in existing_columns:
+        op.add_column(
+            "index_attempt",
+            sa.Column("source", sa.VARCHAR(), autoincrement=False, nullable=False),
+        )
+
+    if "connector_specific_config" not in existing_columns:
+        op.add_column(
+            "index_attempt",
+            sa.Column(
+                "connector_specific_config",
+                postgresql.JSONB(astext_type=sa.Text()),
+                autoincrement=False,
+                nullable=False,
+            ),
+        )
+
+    # Check if the constraint exists before dropping
    constraints = inspector.get_foreign_keys("index_attempt")

    if any(
@@ -182,8 +190,12 @@ def downgrade() -> None:
            "fk_index_attempt_connector_id", "index_attempt", type_="foreignkey"
        )

-    op.drop_column("index_attempt", "credential_id")
-    op.drop_column("index_attempt", "connector_id")
-    op.drop_table("connector_credential_pair")
-    op.drop_table("credential")
-    op.drop_table("connector")
+    if "credential_id" in existing_columns:
+        op.drop_column("index_attempt", "credential_id")
+
+    if "connector_id" in existing_columns:
+        op.drop_column("index_attempt", "connector_id")
+
+    op.execute("DROP TABLE IF EXISTS connector_credential_pair CASCADE")
+    op.execute("DROP TABLE IF EXISTS credential CASCADE")
+    op.execute("DROP TABLE IF EXISTS connector CASCADE")
--- a/backend/alembic/versions/2955778aa44c_add_chunk_count_to_document.py
+++ b/backend/alembic/versions/2955778aa44c_add_chunk_count_to_document.py
@@ -5,6 +5,7 @@ Revises: c0aab6edb6dd
 Create Date: 2025-01-04 11:39:43.268612

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/2cdeff6d8c93_set_built_in_to_default.py
+++ b/backend/alembic/versions/2cdeff6d8c93_set_built_in_to_default.py
@@ -5,6 +5,7 @@ Revises: f5437cc136c5
 Create Date: 2025-02-11 14:57:51.308775

 """
+
 from alembic import op


--- a/backend/alembic/versions/2d2304e27d8c_add_above_below_to_persona.py
+++ b/backend/alembic/versions/2d2304e27d8c_add_above_below_to_persona.py
@@ -5,6 +5,7 @@ Revises: 4b08d97e175a
 Create Date: 2024-08-21 19:15:15.762948

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/2daa494a0851_add_group_sync_time.py
+++ b/backend/alembic/versions/2daa494a0851_add_group_sync_time.py
@@ -5,6 +5,7 @@ Revises: c0fd6e4da83a
 Create Date: 2024-11-11 10:57:22.991157

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/2f80c6a2550f_add_chat_session_specific_temperature_.py
+++ b/backend/alembic/versions/2f80c6a2550f_add_chat_session_specific_temperature_.py
@@ -5,6 +5,7 @@ Revises: 33ea50e88f24
 Create Date: 2025-01-31 10:30:27.289646

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/2f95e36923e6_add_indexing_coordination.py
+++ b/backend/alembic/versions/2f95e36923e6_add_indexing_coordination.py
@@ -0,0 +1,115 @@
+"""add_indexing_coordination
+
+Revision ID: 2f95e36923e6
+Revises: 0816326d83aa
+Create Date: 2025-07-10 16:17:57.762182
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "2f95e36923e6"
+down_revision = "0816326d83aa"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # Add database-based coordination fields (replacing Redis fencing)
+    op.add_column(
+        "index_attempt", sa.Column("celery_task_id", sa.String(), nullable=True)
+    )
+    op.add_column(
+        "index_attempt",
+        sa.Column(
+            "cancellation_requested",
+            sa.Boolean(),
+            nullable=False,
+            server_default="false",
+        ),
+    )
+
+    # Add batch coordination fields (replacing FileStore state)
+    op.add_column(
+        "index_attempt", sa.Column("total_batches", sa.Integer(), nullable=True)
+    )
+    op.add_column(
+        "index_attempt",
+        sa.Column(
+            "completed_batches", sa.Integer(), nullable=False, server_default="0"
+        ),
+    )
+    op.add_column(
+        "index_attempt",
+        sa.Column(
+            "total_failures_batch_level",
+            sa.Integer(),
+            nullable=False,
+            server_default="0",
+        ),
+    )
+    op.add_column(
+        "index_attempt",
+        sa.Column("total_chunks", sa.Integer(), nullable=False, server_default="0"),
+    )
+
+    # Progress tracking for stall detection
+    op.add_column(
+        "index_attempt",
+        sa.Column("last_progress_time", sa.DateTime(timezone=True), nullable=True),
+    )
+    op.add_column(
+        "index_attempt",
+        sa.Column(
+            "last_batches_completed_count",
+            sa.Integer(),
+            nullable=False,
+            server_default="0",
+        ),
+    )
+
+    # Heartbeat tracking for worker liveness detection
+    op.add_column(
+        "index_attempt",
+        sa.Column(
+            "heartbeat_counter", sa.Integer(), nullable=False, server_default="0"
+        ),
+    )
+    op.add_column(
+        "index_attempt",
+        sa.Column(
+            "last_heartbeat_value", sa.Integer(), nullable=False, server_default="0"
+        ),
+    )
+    op.add_column(
+        "index_attempt",
+        sa.Column("last_heartbeat_time", sa.DateTime(timezone=True), nullable=True),
+    )
+
+    # Add index for coordination queries
+    op.create_index(
+        "ix_index_attempt_active_coordination",
+        "index_attempt",
+        ["connector_credential_pair_id", "search_settings_id", "status"],
+    )
+
+
+def downgrade() -> None:
+    # Remove the new index
+    op.drop_index("ix_index_attempt_active_coordination", table_name="index_attempt")
+
+    # Remove the new columns
+    op.drop_column("index_attempt", "last_batches_completed_count")
+    op.drop_column("index_attempt", "last_progress_time")
+    op.drop_column("index_attempt", "last_heartbeat_time")
+    op.drop_column("index_attempt", "last_heartbeat_value")
+    op.drop_column("index_attempt", "heartbeat_counter")
+    op.drop_column("index_attempt", "total_chunks")
+    op.drop_column("index_attempt", "total_failures_batch_level")
+    op.drop_column("index_attempt", "completed_batches")
+    op.drop_column("index_attempt", "total_batches")
+    op.drop_column("index_attempt", "cancellation_requested")
+    op.drop_column("index_attempt", "celery_task_id")
--- a/backend/alembic/versions/30c1d5744104_persona_datetime_aware.py
+++ b/backend/alembic/versions/30c1d5744104_persona_datetime_aware.py
@@ -5,6 +5,7 @@ Revises: 7f99be1cb9f5
 Create Date: 2023-10-16 23:21:01.283424

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/325975216eb3_add_icon_color_and_icon_shape_to_persona.py
+++ b/backend/alembic/versions/325975216eb3_add_icon_color_and_icon_shape_to_persona.py
@@ -5,6 +5,7 @@ Revises: 91ffac7e65b3
 Create Date: 2024-07-24 21:29:31.784562

 """
+
 import random
 from alembic import op
 import sqlalchemy as sa
--- a/backend/alembic/versions/33cb72ea4d80_single_tool_call_per_message.py
+++ b/backend/alembic/versions/33cb72ea4d80_single_tool_call_per_message.py
@@ -5,6 +5,7 @@ Revises: 5b29123cd710
 Create Date: 2024-11-01 12:51:01.535003

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/33ea50e88f24_foreign_key_input_prompts.py
+++ b/backend/alembic/versions/33ea50e88f24_foreign_key_input_prompts.py
@@ -5,6 +5,7 @@ Revises: a6df6b88ef81
 Create Date: 2025-01-29 10:54:22.141765

 """
+
 from alembic import op


--- a/backend/alembic/versions/351faebd379d_add_curator_fields.py
+++ b/backend/alembic/versions/351faebd379d_add_curator_fields.py
@@ -5,6 +5,7 @@ Revises: ee3f4b47fad5
 Create Date: 2024-08-15 22:37:08.397052

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/35e518e0ddf4_properly_cascade.py
+++ b/backend/alembic/versions/35e518e0ddf4_properly_cascade.py
@@ -5,6 +5,7 @@ Revises: 91a0a4d62b14
 Create Date: 2024-09-20 21:24:04.891018

 """
+
 from alembic import op


--- a/backend/alembic/versions/35e6853a51d5_server_default_chosen_assistants.py
+++ b/backend/alembic/versions/35e6853a51d5_server_default_chosen_assistants.py
@@ -5,6 +5,7 @@ Revises: c99d76fcd298
 Create Date: 2024-09-13 13:20:32.885317

 """
+
 from alembic import op
 import sqlalchemy as sa
 from sqlalchemy.dialects import postgresql
--- a/backend/alembic/versions/369644546676_add_composite_index_for_index_attempt_.py
+++ b/backend/alembic/versions/369644546676_add_composite_index_for_index_attempt_.py
@@ -5,6 +5,7 @@ Revises: 2955778aa44c
 Create Date: 2025-01-08 15:38:17.224380

 """
+
 from alembic import op
 from sqlalchemy import text

--- a/backend/alembic/versions/36e9220ab794_update_kg_trigger_functions.py
+++ b/backend/alembic/versions/36e9220ab794_update_kg_trigger_functions.py
@@ -0,0 +1,136 @@
+"""update_kg_trigger_functions
+
+Revision ID: 36e9220ab794
+Revises: c9e2cd766c29
+Create Date: 2025-06-22 17:33:25.833733
+
+"""
+
+from alembic import op
+from sqlalchemy.orm import Session
+from sqlalchemy import text
+from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
+
+# revision identifiers, used by Alembic.
+revision = "36e9220ab794"
+down_revision = "c9e2cd766c29"
+branch_labels = None
+depends_on = None
+
+
+def _get_tenant_contextvar(session: Session) -> str:
+    """Get the current schema for the migration"""
+    current_tenant = session.execute(text("SELECT current_schema()")).scalar()
+    if isinstance(current_tenant, str):
+        return current_tenant
+    else:
+        raise ValueError("Current tenant is not a string")
+
+
+def upgrade() -> None:
+
+    bind = op.get_bind()
+    session = Session(bind=bind)
+
+    # Create kg_entity trigger to update kg_entity.name and its trigrams
+    tenant_id = _get_tenant_contextvar(session)
+    alphanum_pattern = r"[^a-z0-9]+"
+    truncate_length = 1000
+    function = "update_kg_entity_name"
+    op.execute(
+        text(
+            f"""
+            CREATE OR REPLACE FUNCTION "{tenant_id}".{function}()
+            RETURNS TRIGGER AS $$
+            DECLARE
+                name text;
+                cleaned_name text;
+            BEGIN
+                -- Set name to semantic_id if document_id is not NULL
+                IF NEW.document_id IS NOT NULL THEN
+                    SELECT lower(semantic_id) INTO name
+                    FROM "{tenant_id}".document
+                    WHERE id = NEW.document_id;
+                ELSE
+                    name = lower(NEW.name);
+                END IF;
+
+                -- Clean name and truncate if too long
+                cleaned_name = regexp_replace(
+                    name,
+                    '{alphanum_pattern}', '', 'g'
+                );
+                IF length(cleaned_name) > {truncate_length} THEN
+                    cleaned_name = left(cleaned_name, {truncate_length});
+                END IF;
+
+                -- Set name and name trigrams
+                NEW.name = name;
+                NEW.name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name);
+                RETURN NEW;
+            END;
+            $$ LANGUAGE plpgsql;
+            """
+        )
+    )
+    trigger = f"{function}_trigger"
+    op.execute(f'DROP TRIGGER IF EXISTS {trigger} ON "{tenant_id}".kg_entity')
+    op.execute(
+        f"""
+        CREATE TRIGGER {trigger}
+            BEFORE INSERT OR UPDATE OF name
+            ON "{tenant_id}".kg_entity
+            FOR EACH ROW
+            EXECUTE FUNCTION "{tenant_id}".{function}();
+        """
+    )
+
+    # Create document trigger to sync kg_entity.name and trigrams on semantic_id change
+    function = "update_kg_entity_name_from_doc"
+    op.execute(
+        text(
+            f"""
+            CREATE OR REPLACE FUNCTION "{tenant_id}".{function}()
+            RETURNS TRIGGER AS $$
+            DECLARE
+                doc_name text;
+                cleaned_name text;
+            BEGIN
+                doc_name = lower(NEW.semantic_id);
+
+                -- Clean name and truncate if too long
+                cleaned_name = regexp_replace(
+                    doc_name,
+                    '{alphanum_pattern}', '', 'g'
+                );
+                IF length(cleaned_name) > {truncate_length} THEN
+                    cleaned_name = left(cleaned_name, {truncate_length});
+                END IF;
+
+                -- Set name and name trigrams for all entities referencing this document
+                UPDATE "{tenant_id}".kg_entity
+                SET
+                    name = doc_name,
+                    name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name)
+                WHERE document_id = NEW.id;
+                RETURN NEW;
+            END;
+            $$ LANGUAGE plpgsql;
+            """
+        )
+    )
+    trigger = f"{function}_trigger"
+    op.execute(f'DROP TRIGGER IF EXISTS {trigger} ON "{tenant_id}".document')
+    op.execute(
+        f"""
+        CREATE TRIGGER {trigger}
+            AFTER UPDATE OF semantic_id
+            ON "{tenant_id}".document
+            FOR EACH ROW
+            EXECUTE FUNCTION "{tenant_id}".{function}();
+        """
+    )
+
+
+def downgrade() -> None:
+    pass
--- a/backend/alembic/versions/3781a5eb12cb_add_chunk_stats_table.py
+++ b/backend/alembic/versions/3781a5eb12cb_add_chunk_stats_table.py
@@ -0,0 +1,52 @@
+"""add chunk stats table
+
+Revision ID: 3781a5eb12cb
+Revises: df46c75b714e
+Create Date: 2025-03-10 10:02:30.586666
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision = "3781a5eb12cb"
+down_revision = "df46c75b714e"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.create_table(
+        "chunk_stats",
+        sa.Column("id", sa.String(), primary_key=True, index=True),
+        sa.Column(
+            "document_id",
+            sa.String(),
+            sa.ForeignKey("document.id"),
+            nullable=False,
+            index=True,
+        ),
+        sa.Column("chunk_in_doc_id", sa.Integer(), nullable=False),
+        sa.Column("information_content_boost", sa.Float(), nullable=True),
+        sa.Column(
+            "last_modified",
+            sa.DateTime(timezone=True),
+            nullable=False,
+            index=True,
+            server_default=sa.func.now(),
+        ),
+        sa.Column("last_synced", sa.DateTime(timezone=True), nullable=True, index=True),
+        sa.UniqueConstraint(
+            "document_id", "chunk_in_doc_id", name="uq_chunk_stats_doc_chunk"
+        ),
+    )
+
+    op.create_index(
+        "ix_chunk_sync_status", "chunk_stats", ["last_modified", "last_synced"]
+    )
+
+
+def downgrade() -> None:
+    op.drop_index("ix_chunk_sync_status", table_name="chunk_stats")
+    op.drop_table("chunk_stats")
--- a/backend/alembic/versions/3879338f8ba1_add_tool_table.py
+++ b/backend/alembic/versions/3879338f8ba1_add_tool_table.py
@@ -5,6 +5,7 @@ Revises: f1c6478c3fd8
 Create Date: 2024-05-11 16:11:23.718084

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/38eda64af7fe_add_chat_session_sharing.py
+++ b/backend/alembic/versions/38eda64af7fe_add_chat_session_sharing.py
@@ -5,6 +5,7 @@ Revises: 776b3bbe9092
 Create Date: 2024-03-27 19:41:29.073594

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/3934b1bc7b62_update_github_connector_repo_name_to_.py
+++ b/backend/alembic/versions/3934b1bc7b62_update_github_connector_repo_name_to_.py
@@ -0,0 +1,126 @@
+"""Update GitHub connector repo_name to repositories
+
+Revision ID: 3934b1bc7b62
+Revises: b7c2b63c4a03
+Create Date: 2025-03-05 10:50:30.516962
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+import json
+import logging
+
+# revision identifiers, used by Alembic.
+revision = "3934b1bc7b62"
+down_revision = "b7c2b63c4a03"
+branch_labels = None
+depends_on = None
+
+logger = logging.getLogger("alembic.runtime.migration")
+
+
+def upgrade() -> None:
+    # Use the migration's active database connection
+    conn = op.get_bind()
+
+    # First get all GitHub connectors
+    github_connectors = conn.execute(
+        sa.text(
+            """
+            SELECT id, connector_specific_config
+            FROM connector
+            WHERE source = 'GITHUB'
+            """
+        )
+    ).fetchall()
+
+    # Update each connector's config
+    updated_count = 0
+    for connector_id, config in github_connectors:
+        try:
+            if not config:
+                logger.warning(f"Connector {connector_id} has no config, skipping")
+                continue
+
+            # Parse the config if it's a string
+            if isinstance(config, str):
+                config = json.loads(config)
+
+            if "repo_name" not in config:
+                continue
+
+            # Create new config with repositories instead of repo_name
+            new_config = dict(config)
+            repo_name_value = new_config.pop("repo_name")
+            new_config["repositories"] = repo_name_value
+
+            # Update the connector with the new config
+            conn.execute(
+                sa.text(
+                    """
+                    UPDATE connector
+                    SET connector_specific_config = :new_config
+                    WHERE id = :connector_id
+                    """
+                ),
+                {"connector_id": connector_id, "new_config": json.dumps(new_config)},
+            )
+            updated_count += 1
+        except Exception as e:
+            logger.error(f"Error updating connector {connector_id}: {str(e)}")
+
+
+def downgrade() -> None:
+    # Use the migration's active database connection
+    conn = op.get_bind()
+
+    logger.debug(
+        "Starting rollback of GitHub connectors from repositories to repo_name"
+    )
+
+    github_connectors = conn.execute(
+        sa.text(
+            """
+            SELECT id, connector_specific_config
+            FROM connector
+            WHERE source = 'GITHUB'
+            """
+        )
+    ).fetchall()
+
+    logger.debug(f"Found {len(github_connectors)} GitHub connectors to rollback")
+
+    # Revert each GitHub connector to use repo_name instead of repositories
+    reverted_count = 0
+    for connector_id, config in github_connectors:
+        try:
+            if not config:
+                continue
+
+            # Parse the config if it's a string
+            if isinstance(config, str):
+                config = json.loads(config)
+
+            if "repositories" not in config:
+                continue
+
+            # Create new config with repo_name instead of repositories
+            new_config = dict(config)
+            repositories_value = new_config.pop("repositories")
+            new_config["repo_name"] = repositories_value
+
+            # Update the connector with the new config
+            conn.execute(
+                sa.text(
+                    """
+                    UPDATE connector
+                    SET connector_specific_config = :new_config
+                    WHERE id = :connector_id
+                    """
+                ),
+                {"new_config": json.dumps(new_config), "connector_id": connector_id},
+            )
+            reverted_count += 1
+        except Exception as e:
+            logger.error(f"Error reverting connector {connector_id}: {str(e)}")
--- a/backend/alembic/versions/3b25685ff73c_move_is_public_to_cc_pair.py
+++ b/backend/alembic/versions/3b25685ff73c_move_is_public_to_cc_pair.py
@@ -5,6 +5,7 @@ Revises: e0a68a81d434
 Create Date: 2023-10-05 18:47:09.582849

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/3bd4c84fe72f_improved_index.py
+++ b/backend/alembic/versions/3bd4c84fe72f_improved_index.py
@@ -5,9 +5,9 @@ Revises: 8f43500ee275
 Create Date: 2025-02-26 13:07:56.217791

 """
+
 from alembic import op
-import time
-from sqlalchemy import text
+

 # revision identifiers, used by Alembic.
 revision = "3bd4c84fe72f"
@@ -21,374 +21,59 @@ depends_on = None
 # an outage by creating an index without using CONCURRENTLY. This migration:
 #
 # 1. Creates more efficient full-text search capabilities using tsvector columns and GIN indexes
-# 2. Uses CONCURRENTLY for all index creation to prevent table locking
-# 3. Explicitly manages transactions with COMMIT statements to allow CONCURRENTLY to work
-# (see: https://www.postgresql.org/docs/9.4/sql-createindex.html#SQL-CREATEINDEX-CONCURRENTLY)
-# (see: https://github.com/sqlalchemy/alembic/issues/277)
-# 4. Adds indexes to both chat_message and chat_session tables for comprehensive search
+# 2. Adds indexes to both chat_message and chat_session tables for comprehensive search
+# 3. Creates the indexes without CONCURRENTLY (removed due to operational issues)


-def upgrade():
-    # --- PART 1: chat_message table ---
-    # Step 1: Add nullable column (quick, minimal locking)
-    # op.execute("ALTER TABLE chat_message DROP COLUMN IF EXISTS message_tsv")
-    # op.execute("DROP TRIGGER IF EXISTS chat_message_tsv_trigger ON chat_message")
-    # op.execute("DROP FUNCTION IF EXISTS update_chat_message_tsv()")
-    # op.execute("ALTER TABLE chat_message DROP COLUMN IF EXISTS message_tsv")
-    # # Drop chat_session tsv trigger if it exists
-    # op.execute("DROP TRIGGER IF EXISTS chat_session_tsv_trigger ON chat_session")
-    # op.execute("DROP FUNCTION IF EXISTS update_chat_session_tsv()")
-    # op.execute("ALTER TABLE chat_session DROP COLUMN IF EXISTS title_tsv")
-    # raise Exception("Stop here")
-    time.time()
-    op.execute("ALTER TABLE chat_message ADD COLUMN IF NOT EXISTS message_tsv tsvector")
+def upgrade() -> None:
+    # First, drop any existing indexes to avoid conflicts
+    op.execute("DROP INDEX IF EXISTS idx_chat_message_tsv;")
+    op.execute("DROP INDEX IF EXISTS idx_chat_session_desc_tsv;")
+    op.execute("DROP INDEX IF EXISTS idx_chat_message_message_lower;")

-    # Step 2: Create function and trigger for new/updated rows
+    # Drop existing columns if they exist
+    op.execute("ALTER TABLE chat_message DROP COLUMN IF EXISTS message_tsv;")
+    op.execute("ALTER TABLE chat_session DROP COLUMN IF EXISTS description_tsv;")
+
+    # Add a stored generated tsvector column for chat_message.message (indexed below)
    op.execute(
        """
-    CREATE OR REPLACE FUNCTION update_chat_message_tsv()
-    RETURNS TRIGGER AS $$
-    BEGIN
-      NEW.message_tsv = to_tsvector('english', NEW.message);
-      RETURN NEW;
-    END;
-    $$ LANGUAGE plpgsql
-    """
+        ALTER TABLE chat_message
+        ADD COLUMN message_tsv tsvector
+        GENERATED ALWAYS AS (to_tsvector('english', message)) STORED;
+        """
    )

-    # Create trigger in a separate execute call
    op.execute(
        """
-    CREATE TRIGGER chat_message_tsv_trigger
-    BEFORE INSERT OR UPDATE ON chat_message
-    FOR EACH ROW EXECUTE FUNCTION update_chat_message_tsv()
-    """
+        CREATE INDEX IF NOT EXISTS idx_chat_message_tsv
+        ON chat_message
+        USING GIN (message_tsv)
+        """
    )

-    # Step 3: Update existing rows in batches using Python
-    time.time()
-
-    # Get connection and count total rows
-    connection = op.get_bind()
-    total_count_result = connection.execute(
-        text("SELECT COUNT(*) FROM chat_message")
-    ).scalar()
-    total_count = total_count_result if total_count_result is not None else 0
-    batch_size = 5000
-    batches = 0
-
-    # Calculate total batches needed
-    total_batches = (
-        (total_count + batch_size - 1) // batch_size if total_count > 0 else 0
+    # Also add a stored tsvector column for chat_session.description
+    op.execute(
+        """
+        ALTER TABLE chat_session
+        ADD COLUMN description_tsv tsvector
+        GENERATED ALWAYS AS (to_tsvector('english', coalesce(description, ''))) STORED;
+        """
    )

-    # Process in batches - properly handling UUIDs by using OFFSET/LIMIT approach
-    for batch_num in range(total_batches):
-        offset = batch_num * batch_size
-
-        # Execute update for this batch using OFFSET/LIMIT which works with UUIDs
-        connection.execute(
-            text(
-                """
-            UPDATE chat_message
-            SET message_tsv = to_tsvector('english', message)
-            WHERE id IN (
-                SELECT id FROM chat_message
-                WHERE message_tsv IS NULL
-                ORDER BY id
-                LIMIT :batch_size OFFSET :offset
-            )
-            """
-            ).bindparams(batch_size=batch_size, offset=offset)
-        )
-
-        # Commit each batch
-        connection.execute(text("COMMIT"))
-        # Start a new transaction
-        connection.execute(text("BEGIN"))
-
-        batches += 1
-
-    # Final check for any remaining NULL values
-    connection.execute(
-        text(
-            """
-    UPDATE chat_message SET message_tsv = to_tsvector('english', message)
-    WHERE message_tsv IS NULL
-    """
-        )
-    )
-
-    # Create GIN index concurrently
-    connection.execute(text("COMMIT"))
-
-    time.time()
-
-    connection.execute(
-        text(
-            """
-    CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_message_tsv
-    ON chat_message USING GIN (message_tsv)
-    """
-        )
-    )
-
-    # First drop the trigger as it won't be needed anymore
-    connection.execute(
-        text(
-            """
-    DROP TRIGGER IF EXISTS chat_message_tsv_trigger ON chat_message;
-    """
-        )
-    )
-
-    connection.execute(
-        text(
-            """
-    DROP FUNCTION IF EXISTS update_chat_message_tsv();
-    """
-        )
-    )
-
-    # Add new generated column
-    time.time()
-    connection.execute(
-        text(
-            """
-    ALTER TABLE chat_message
-    ADD COLUMN message_tsv_gen tsvector
-    GENERATED ALWAYS AS (to_tsvector('english', message)) STORED;
-    """
-        )
-    )
-
-    connection.execute(text("COMMIT"))
-
-    time.time()
-
-    connection.execute(
-        text(
-            """
-    CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_message_tsv_gen
-    ON chat_message USING GIN (message_tsv_gen)
-    """
-        )
-    )
-
-    # Drop old index and column
-    connection.execute(text("COMMIT"))
-
-    connection.execute(
-        text(
-            """
-    DROP INDEX CONCURRENTLY IF EXISTS idx_chat_message_tsv;
-    """
-        )
-    )
-    connection.execute(text("COMMIT"))
-    connection.execute(
-        text(
-            """
-    ALTER TABLE chat_message DROP COLUMN message_tsv;
-    """
-        )
-    )
-
-    # Rename new column to old name
-    connection.execute(
-        text(
-            """
-    ALTER TABLE chat_message RENAME COLUMN message_tsv_gen TO message_tsv;
-    """
-        )
-    )
-
-    # --- PART 2: chat_session table ---
-
-    # Step 1: Add nullable column (quick, minimal locking)
-    time.time()
-    connection.execute(
-        text(
-            "ALTER TABLE chat_session ADD COLUMN IF NOT EXISTS description_tsv tsvector"
-        )
-    )
-
-    # Step 2: Create function and trigger for new/updated rows - SPLIT INTO SEPARATE CALLS
-    connection.execute(
-        text(
-            """
-    CREATE OR REPLACE FUNCTION update_chat_session_tsv()
-    RETURNS TRIGGER AS $$
-    BEGIN
-      NEW.description_tsv = to_tsvector('english', COALESCE(NEW.description, ''));
-      RETURN NEW;
-    END;
-    $$ LANGUAGE plpgsql
-    """
-        )
-    )
-
-    # Create trigger in a separate execute call
-    connection.execute(
-        text(
-            """
-    CREATE TRIGGER chat_session_tsv_trigger
-    BEFORE INSERT OR UPDATE ON chat_session
-    FOR EACH ROW EXECUTE FUNCTION update_chat_session_tsv()
-    """
-        )
-    )
-
-    # Step 3: Update existing rows in batches using Python
-    time.time()
-
-    # Get the maximum ID to determine batch count
-    # Cast id to text for MAX function since it's a UUID
-    max_id_result = connection.execute(
-        text("SELECT COALESCE(MAX(id::text), '0') FROM chat_session")
-    ).scalar()
-    max_id_result if max_id_result is not None else "0"
-    batch_size = 5000
-    batches = 0
-
-    # Get all IDs ordered to process in batches
-    rows = connection.execute(
-        text("SELECT id FROM chat_session ORDER BY id")
-    ).fetchall()
-    total_rows = len(rows)
-
-    # Process in batches
-    for batch_num, batch_start in enumerate(range(0, total_rows, batch_size)):
-        batch_end = min(batch_start + batch_size, total_rows)
-        batch_ids = [row[0] for row in rows[batch_start:batch_end]]
-
-        if not batch_ids:
-            continue
-
-        # Use IN clause instead of BETWEEN for UUIDs
-        placeholders = ", ".join([f":id{i}" for i in range(len(batch_ids))])
-        params = {f"id{i}": id_val for i, id_val in enumerate(batch_ids)}
-
-        # Execute update for this batch
-        connection.execute(
-            text(
-                f"""
-            UPDATE chat_session
-            SET description_tsv = to_tsvector('english', COALESCE(description, ''))
-            WHERE id IN ({placeholders})
-            AND description_tsv IS NULL
-            """
-            ).bindparams(**params)
-        )
-
-        # Commit each batch
-        connection.execute(text("COMMIT"))
-        # Start a new transaction
-        connection.execute(text("BEGIN"))
-
-        batches += 1
-
-    # Final check for any remaining NULL values
-    connection.execute(
-        text(
-            """
-    UPDATE chat_session SET description_tsv = to_tsvector('english', COALESCE(description, ''))
-    WHERE description_tsv IS NULL
-    """
-        )
-    )
-
-    # Create GIN index concurrently
-    connection.execute(text("COMMIT"))
-
-    time.time()
-    connection.execute(
-        text(
-            """
-    CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_session_desc_tsv
-    ON chat_session USING GIN (description_tsv)
-    """
-        )
-    )
-
-    # After Final check for chat_session
-    # First drop the trigger as it won't be needed anymore
-    connection.execute(
-        text(
-            """
-    DROP TRIGGER IF EXISTS chat_session_tsv_trigger ON chat_session;
-    """
-        )
-    )
-
-    connection.execute(
-        text(
-            """
-    DROP FUNCTION IF EXISTS update_chat_session_tsv();
-    """
-        )
-    )
-    # Add new generated column
-    time.time()
-    connection.execute(
-        text(
-            """
-    ALTER TABLE chat_session
-    ADD COLUMN description_tsv_gen tsvector
-    GENERATED ALWAYS AS (to_tsvector('english', COALESCE(description, ''))) STORED;
-    """
-        )
-    )
-
-    # Create new index on generated column
-    connection.execute(text("COMMIT"))
-
-    time.time()
-    connection.execute(
-        text(
-            """
-    CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_session_desc_tsv_gen
-    ON chat_session USING GIN (description_tsv_gen)
-    """
-        )
-    )
-
-    # Drop old index and column
-    connection.execute(text("COMMIT"))
-
-    connection.execute(
-        text(
-            """
-    DROP INDEX CONCURRENTLY IF EXISTS idx_chat_session_desc_tsv;
-    """
-        )
-    )
-    connection.execute(text("COMMIT"))
-    connection.execute(
-        text(
-            """
-    ALTER TABLE chat_session DROP COLUMN description_tsv;
-    """
-        )
-    )
-
-    # Rename new column to old name
-    connection.execute(
-        text(
-            """
-    ALTER TABLE chat_session RENAME COLUMN description_tsv_gen TO description_tsv;
-    """
-        )
+    op.execute(
+        """
+        CREATE INDEX IF NOT EXISTS idx_chat_session_desc_tsv
+        ON chat_session
+        USING GIN (description_tsv)
+        """
    )


 def downgrade() -> None:
-    # Drop the indexes first (use CONCURRENTLY for dropping too)
-    op.execute("COMMIT")
-    op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_chat_message_tsv;")
-
-    op.execute("COMMIT")
-    op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_chat_session_desc_tsv;")
+    # Drop the indexes first
+    op.execute("DROP INDEX IF EXISTS idx_chat_message_tsv;")
+    op.execute("DROP INDEX IF EXISTS idx_chat_session_desc_tsv;")

    # Then drop the columns
    op.execute("ALTER TABLE chat_message DROP COLUMN IF EXISTS message_tsv;")
--- a/backend/alembic/versions/3c5e35aa9af0_polling_document_count.py
+++ b/backend/alembic/versions/3c5e35aa9af0_polling_document_count.py
@@ -5,6 +5,7 @@ Revises: 27c6ecc08586
 Create Date: 2023-06-14 23:45:51.760440

 """
+
 import sqlalchemy as sa
 from alembic import op

--- a/backend/alembic/versions/3c6531f32351_add_back_input_prompts.py
+++ b/backend/alembic/versions/3c6531f32351_add_back_input_prompts.py
@@ -5,6 +5,7 @@ Revises: aeda5f2df4f6
 Create Date: 2025-01-13 12:49:51.705235

 """
+
 from alembic import op
 import sqlalchemy as sa
 import fastapi_users_db_sqlalchemy
--- a/backend/alembic/versions/3fc5d75723b3_add_doc_metadata_field_in_document_model.py
+++ b/backend/alembic/versions/3fc5d75723b3_add_doc_metadata_field_in_document_model.py
@@ -0,0 +1,30 @@
+"""add_doc_metadata_field_in_document_model
+
+Revision ID: 3fc5d75723b3
+Revises: 2f95e36923e6
+Create Date: 2025-07-28 18:45:37.985406
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = "3fc5d75723b3"
+down_revision = "2f95e36923e6"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Add a nullable JSONB ``doc_metadata`` column to the ``document`` table."""
+    doc_metadata_column = sa.Column(
+        "doc_metadata", postgresql.JSONB(astext_type=sa.Text()), nullable=True
+    )
+    # Nullable with no default, so no existing row has to be backfilled.
+    op.add_column("document", doc_metadata_column)
+
+
+def downgrade() -> None:
+    op.drop_column(table_name="document", column_name="doc_metadata")  # undo upgrade
--- a/backend/alembic/versions/401c1ac29467_add_tables_for_ui_based_llm_.py
+++ b/backend/alembic/versions/401c1ac29467_add_tables_for_ui_based_llm_.py
@@ -5,6 +5,7 @@ Revises: 703313b75876
 Create Date: 2024-04-13 18:07:29.153817

 """
+
 from alembic import op
 import sqlalchemy as sa
 from sqlalchemy.dialects import postgresql
--- a/backend/alembic/versions/43cbbb3f5e6a_rename_index_origin_to_index_recursively.py
+++ b/backend/alembic/versions/43cbbb3f5e6a_rename_index_origin_to_index_recursively.py
@@ -5,6 +5,7 @@ Revises: e1392f05e840
 Create Date: 2024-08-01 12:38:54.466081

 """
+
 from alembic import op

 # revision identifiers, used by Alembic.
--- a/backend/alembic/versions/44f856ae2a4a_add_cloud_embedding_model.py
+++ b/backend/alembic/versions/44f856ae2a4a_add_cloud_embedding_model.py
@@ -5,6 +5,7 @@ Revises: d716b0791ddd
 Create Date: 2024-06-28 20:01:05.927647

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/4505fd7302e1_added_is_internet_to_dbdoc.py
+++ b/backend/alembic/versions/4505fd7302e1_added_is_internet_to_dbdoc.py
@@ -5,6 +5,7 @@ Revises: c18cdf4b497e
 Create Date: 2024-06-18 20:46:09.095034

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/465f78d9b7f9_larger_access_tokens_for_oauth.py
+++ b/backend/alembic/versions/465f78d9b7f9_larger_access_tokens_for_oauth.py
@@ -5,6 +5,7 @@ Revises: 3c5e35aa9af0
 Create Date: 2023-07-18 17:33:40.365034

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/46625e4745d4_remove_native_enum.py
+++ b/backend/alembic/versions/46625e4745d4_remove_native_enum.py
@@ -5,6 +5,7 @@ Revises: 9d97fecfab7f
 Create Date: 2023-10-27 11:38:33.803145

 """
+
 from alembic import op
 from sqlalchemy import String

--- a/backend/alembic/versions/46b7a812670f_fix_user__external_user_group_id_fk.py
+++ b/backend/alembic/versions/46b7a812670f_fix_user__external_user_group_id_fk.py
@@ -5,6 +5,7 @@ Revises: f32615f71aeb
 Create Date: 2024-09-23 12:58:03.894038

 """
+
 from alembic import op

 # revision identifiers, used by Alembic.
--- a/backend/alembic/versions/4738e4b3bae1_pg_file_store.py
+++ b/backend/alembic/versions/4738e4b3bae1_pg_file_store.py
@@ -5,6 +5,7 @@ Revises: e91df4e935ef
 Create Date: 2024-03-20 18:53:32.461518

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/47433d30de82_create_indexattempt_table.py
+++ b/backend/alembic/versions/47433d30de82_create_indexattempt_table.py
@@ -5,6 +5,7 @@ Revises:
 Create Date: 2023-05-04 00:55:32.971991

 """
+
 import sqlalchemy as sa
 from alembic import op
 from sqlalchemy.dialects import postgresql
--- a/backend/alembic/versions/475fcefe8826_add_name_to_api_key.py
+++ b/backend/alembic/versions/475fcefe8826_add_name_to_api_key.py
@@ -5,6 +5,7 @@ Revises: ecab2b3f1a3b
 Create Date: 2024-04-11 11:05:18.414438

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/4794bc13e484_update_prompt_length.py
+++ b/backend/alembic/versions/4794bc13e484_update_prompt_length.py
@@ -0,0 +1,51 @@
+"""update prompt length
+
+Revision ID: 4794bc13e484
+Revises: f7505c5b0284
+Create Date: 2025-04-02 11:26:36.180328
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "4794bc13e484"
+down_revision = "f7505c5b0284"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Convert the ``prompt`` text columns to a length-bounded string type.
+
+    ``system_prompt`` and ``task_prompt`` both go from ``TEXT`` to
+    ``String(length=5000000)``, in that order; ``existing_nullable=False``
+    tells Alembic the columns are already NOT NULL.
+    """
+    for prompt_column in ("system_prompt", "task_prompt"):
+        op.alter_column(
+            "prompt",
+            prompt_column,
+            existing_type=sa.TEXT(),
+            type_=sa.String(length=5000000),
+            existing_nullable=False,
+        )
+
+
+def downgrade() -> None:
+    """Restore the ``prompt`` text columns to their original ``TEXT`` type.
+
+    Reverses ``upgrade``: ``system_prompt`` and ``task_prompt`` both go from
+    ``String(length=5000000)`` back to ``TEXT``, in that order;
+    ``existing_nullable=False`` tells Alembic they are already NOT NULL.
+    """
+    for prompt_column in ("system_prompt", "task_prompt"):
+        op.alter_column(
+            "prompt",
+            prompt_column,
+            existing_type=sa.String(length=5000000),
+            type_=sa.TEXT(),
+            existing_nullable=False,
+        )
--- a/backend/alembic/versions/47a07e1a38f1_fix_invalid_model_configurations_state.py
+++ b/backend/alembic/versions/47a07e1a38f1_fix_invalid_model_configurations_state.py
@@ -0,0 +1,150 @@
+"""Fix invalid model-configurations state
+
+Revision ID: 47a07e1a38f1
+Revises: 7a70b7664e37
+Create Date: 2025-04-23 15:39:43.159504
+
+"""
+
+from alembic import op
+from pydantic import BaseModel, ConfigDict
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+from onyx.llm.llm_provider_options import (
+    fetch_model_names_for_provider_as_set,
+    fetch_visible_model_names_for_provider_as_set,
+)
+
+
+# revision identifiers, used by Alembic.
+revision = "47a07e1a38f1"
+down_revision = "7a70b7664e37"
+branch_labels = None
+depends_on = None
+
+
+class _SimpleModelConfiguration(BaseModel):
+    """Typed view of one ``model_configuration`` row, as selected in ``upgrade``."""
+    # `from_attributes` lets `model_validate` read fields off an object's attributes.
+    model_config = ConfigDict(from_attributes=True)
+    id: int
+    llm_provider_id: int
+    name: str
+    is_visible: bool
+    max_input_tokens: int | None
+
+
+def upgrade() -> None:
+    """Repair well-known LLM providers that have no visible model configuration.
+
+    For every `llm_provider` row whose provider is well-known:
+    - no `model_configuration` rows: insert one per known model, visible only
+      when the model is in the provider's visible set;
+    - rows exist but none is visible: upsert the visible set as visible rows;
+    - at least one visible row: leave the provider untouched.
+
+    Raises:
+        RuntimeError: a provider has known models but no visible models.
+    """
+    llm_provider_table = sa.sql.table(
+        "llm_provider",
+        sa.column("id", sa.Integer),
+        sa.column("provider", sa.String),
+        sa.column("model_names", postgresql.ARRAY(sa.String)),
+        sa.column("display_model_names", postgresql.ARRAY(sa.String)),
+        sa.column("default_model_name", sa.String),
+        sa.column("fast_default_model_name", sa.String),
+    )
+    model_configuration_table = sa.sql.table(
+        "model_configuration",
+        sa.column("id", sa.Integer),
+        sa.column("llm_provider_id", sa.Integer),
+        sa.column("name", sa.String),
+        sa.column("is_visible", sa.Boolean),
+        sa.column("max_input_tokens", sa.Integer),
+    )
+
+    connection = op.get_bind()
+
+    llm_providers = connection.execute(
+        sa.select(
+            llm_provider_table.c.id,
+            llm_provider_table.c.provider,
+        )
+    ).fetchall()
+
+    for llm_provider_id, provider_name in llm_providers:
+        default_models = fetch_model_names_for_provider_as_set(provider_name)
+        display_models = fetch_visible_model_names_for_provider_as_set(
+            provider_name=provider_name
+        )
+
+        # A falsy result (`None` or an empty set) means `provider_name` is not a
+        # well-known llm provider with known models, so there is nothing to fix.
+        if not default_models:
+            continue
+
+        if not display_models:
+            raise RuntimeError(
+                "If `default_models` is non-None, `display_models` must be non-None too."
+            )
+
+        model_configurations = [
+            _SimpleModelConfiguration.model_validate(model_configuration)
+            for model_configuration in connection.execute(
+                sa.select(
+                    model_configuration_table.c.id,
+                    model_configuration_table.c.llm_provider_id,
+                    model_configuration_table.c.name,
+                    model_configuration_table.c.is_visible,
+                    model_configuration_table.c.max_input_tokens,
+                ).where(model_configuration_table.c.llm_provider_id == llm_provider_id)
+            ).fetchall()
+        ]
+
+        if not model_configurations:
+            # Nothing configured yet: seed every known model for this provider.
+            for model_name in default_models:
+                connection.execute(
+                    model_configuration_table.insert().values(
+                        llm_provider_id=llm_provider_id,
+                        name=model_name,
+                        is_visible=model_name in display_models,
+                        max_input_tokens=None,
+                    )
+                )
+            continue
+
+        # If there is at least one model which is visible, this is a valid state.
+        # Therefore, don't touch it and move on to the next one.
+        if any(
+            model_configuration.is_visible
+            for model_configuration in model_configurations
+        ):
+            continue
+
+        # No configuration is visible at this point, so every name in
+        # `display_models` is missing from the visible set and must be upserted.
+        for model_name in display_models:
+            if not model_name:
+                continue
+
+            insert_statement = postgresql.insert(model_configuration_table).values(
+                llm_provider_id=llm_provider_id,
+                name=model_name,
+                is_visible=True,
+                max_input_tokens=None,
+            )
+
+            # On conflict, flip the existing (provider, name) row to visible.
+            connection.execute(
+                insert_statement.on_conflict_do_update(
+                    index_elements=["llm_provider_id", "name"],
+                    set_={"is_visible": insert_statement.excluded.is_visible},
+                )
+            )
+
+
+def downgrade() -> None:
+    """No-op: rows written by ``upgrade`` are left in place on downgrade."""
--- a/backend/alembic/versions/47e5bef3a1d7_add_persona_categories.py
+++ b/backend/alembic/versions/47e5bef3a1d7_add_persona_categories.py
@@ -5,6 +5,7 @@ Revises: dfbe9e93d3c7
 Create Date: 2024-11-05 18:55:02.221064

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/48d14957fe80_add_support_for_custom_tools.py
+++ b/backend/alembic/versions/48d14957fe80_add_support_for_custom_tools.py
@@ -5,6 +5,7 @@ Revises: b85f02ec1308
 Create Date: 2024-06-09 14:58:19.946509

 """
+
 from alembic import op
 import fastapi_users_db_sqlalchemy
 import sqlalchemy as sa
--- a/backend/alembic/versions/495cb26ce93e_create_knowlege_graph_tables.py
+++ b/backend/alembic/versions/495cb26ce93e_create_knowlege_graph_tables.py
@@ -0,0 +1,691 @@
+"""create knowledge graph tables
+
+Revision ID: 495cb26ce93e
+Revises: ca04500b9ee8
+Create Date: 2025-03-19 08:51:14.341989
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+from sqlalchemy import text
+from datetime import datetime, timedelta
+
+from onyx.configs.app_configs import DB_READONLY_USER
+from onyx.configs.app_configs import DB_READONLY_PASSWORD
+from shared_configs.configs import MULTI_TENANT
+from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
+
+
+# revision identifiers, used by Alembic.
+revision = "495cb26ce93e"
+down_revision = "ca04500b9ee8"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+
+    # Create a new permission-less user to be later used for knowledge graph queries.
+    # The user will later get temporary read privileges for a specific view that will be
+    # ad hoc generated specific to a knowledge graph query.
+    #
+    # Note: in order for the migration to run, the DB_READONLY_USER and DB_READONLY_PASSWORD
+    # environment variables MUST be set. Otherwise, an exception will be raised.
+
+    if not MULTI_TENANT:
+
+        # Enable pg_trgm extension if not already enabled
+        op.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm")
+
+        # Create read-only db user here only in single tenant mode. For multi-tenant mode,
+        # the user is created in the alembic_tenants migration.
+        if not (DB_READONLY_USER and DB_READONLY_PASSWORD):
+            raise Exception("DB_READONLY_USER or DB_READONLY_PASSWORD is not set")
+
+        op.execute(
+            text(
+                f"""
+                DO $$
+                BEGIN
+                    -- Check if the read-only user already exists
+                    IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
+                        -- Create the read-only user with the specified password
+                        EXECUTE format('CREATE USER %I WITH PASSWORD %L', '{DB_READONLY_USER}', '{DB_READONLY_PASSWORD}');
+                        -- First revoke all privileges to ensure a clean slate
+                        EXECUTE format('REVOKE ALL ON DATABASE %I FROM %I', current_database(), '{DB_READONLY_USER}');
+                        -- Grant only the CONNECT privilege to allow the user to connect to the database
+                        -- but not perform any operations without additional specific grants
+                        EXECUTE format('GRANT CONNECT ON DATABASE %I TO %I', current_database(), '{DB_READONLY_USER}');
+                    END IF;
+                END
+                $$;
+                """
+            )
+        )
+
+    # Grant usage on current schema to readonly user
+    op.execute(
+        text(
+            f"""
+            DO $$
+            BEGIN
+                IF EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
+                    EXECUTE format('GRANT USAGE ON SCHEMA %I TO %I', current_schema(), '{DB_READONLY_USER}');
+                END IF;
+            END
+            $$;
+            """
+        )
+    )
+
+    op.execute("DROP TABLE IF EXISTS kg_config CASCADE")
+    op.create_table(
+        "kg_config",
+        sa.Column("id", sa.Integer(), primary_key=True, nullable=False, index=True),
+        sa.Column("kg_variable_name", sa.String(), nullable=False, index=True),
+        sa.Column("kg_variable_values", postgresql.ARRAY(sa.String()), nullable=False),
+        sa.UniqueConstraint("kg_variable_name", name="uq_kg_config_variable_name"),
+    )
+
+    # Insert initial data into kg_config table
+    op.bulk_insert(
+        sa.table(
+            "kg_config",
+            sa.column("kg_variable_name", sa.String),
+            sa.column("kg_variable_values", postgresql.ARRAY(sa.String)),
+        ),
+        [
+            {"kg_variable_name": "KG_EXPOSED", "kg_variable_values": ["false"]},
+            {"kg_variable_name": "KG_ENABLED", "kg_variable_values": ["false"]},
+            {"kg_variable_name": "KG_VENDOR", "kg_variable_values": []},
+            {"kg_variable_name": "KG_VENDOR_DOMAINS", "kg_variable_values": []},
+            {"kg_variable_name": "KG_IGNORE_EMAIL_DOMAINS", "kg_variable_values": []},
+            {
+                "kg_variable_name": "KG_EXTRACTION_IN_PROGRESS",
+                "kg_variable_values": ["false"],
+            },
+            {
+                "kg_variable_name": "KG_CLUSTERING_IN_PROGRESS",
+                "kg_variable_values": ["false"],
+            },
+            {
+                "kg_variable_name": "KG_COVERAGE_START",
+                "kg_variable_values": [
+                    (datetime.now() - timedelta(days=90)).strftime("%Y-%m-%d")
+                ],
+            },
+            {"kg_variable_name": "KG_MAX_COVERAGE_DAYS", "kg_variable_values": ["90"]},
+            {
+                "kg_variable_name": "KG_MAX_PARENT_RECURSION_DEPTH",
+                "kg_variable_values": ["2"],
+            },
+        ],
+    )
+
+    op.execute("DROP TABLE IF EXISTS kg_entity_type CASCADE")
+    op.create_table(
+        "kg_entity_type",
+        sa.Column("id_name", sa.String(), primary_key=True, nullable=False, index=True),
+        sa.Column("description", sa.String(), nullable=True),
+        sa.Column("grounding", sa.String(), nullable=False),
+        sa.Column(
+            "attributes",
+            postgresql.JSONB,
+            nullable=False,
+            server_default="{}",
+        ),
+        sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
+        sa.Column("active", sa.Boolean(), nullable=False, default=False),
+        sa.Column("deep_extraction", sa.Boolean(), nullable=False, default=False),
+        sa.Column(
+            "time_updated",
+            sa.DateTime(timezone=True),
+            server_default=sa.text("now()"),
+            onupdate=sa.text("now()"),
+        ),
+        sa.Column(
+            "time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
+        ),
+        sa.Column("grounded_source_name", sa.String(), nullable=True),
+        sa.Column("entity_values", postgresql.ARRAY(sa.String()), nullable=True),
+        sa.Column(
+            "clustering",
+            postgresql.JSONB,
+            nullable=False,
+            server_default="{}",
+        ),
+    )
+
+    op.execute("DROP TABLE IF EXISTS kg_relationship_type CASCADE")
+    # Create KGRelationshipType table
+    op.create_table(
+        "kg_relationship_type",
+        sa.Column("id_name", sa.String(), primary_key=True, nullable=False, index=True),
+        sa.Column("name", sa.String(), nullable=False, index=True),
+        sa.Column(
+            "source_entity_type_id_name", sa.String(), nullable=False, index=True
+        ),
+        sa.Column(
+            "target_entity_type_id_name", sa.String(), nullable=False, index=True
+        ),
+        sa.Column("definition", sa.Boolean(), nullable=False, default=False),
+        sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
+        sa.Column("type", sa.String(), nullable=False, index=True),
+        sa.Column("active", sa.Boolean(), nullable=False, default=True),
+        sa.Column(
+            "time_updated",
+            sa.DateTime(timezone=True),
+            server_default=sa.text("now()"),
+            onupdate=sa.text("now()"),
+        ),
+        sa.Column(
+            "time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
+        ),
+        sa.Column(
+            "clustering",
+            postgresql.JSONB,
+            nullable=False,
+            server_default="{}",
+        ),
+        sa.ForeignKeyConstraint(
+            ["source_entity_type_id_name"], ["kg_entity_type.id_name"]
+        ),
+        sa.ForeignKeyConstraint(
+            ["target_entity_type_id_name"], ["kg_entity_type.id_name"]
+        ),
+    )
+
+    op.execute("DROP TABLE IF EXISTS kg_relationship_type_extraction_staging CASCADE")
+    # Create KGRelationshipTypeExtractionStaging table
+    op.create_table(
+        "kg_relationship_type_extraction_staging",
+        sa.Column("id_name", sa.String(), primary_key=True, nullable=False, index=True),
+        sa.Column("name", sa.String(), nullable=False, index=True),
+        sa.Column(
+            "source_entity_type_id_name", sa.String(), nullable=False, index=True
+        ),
+        sa.Column(
+            "target_entity_type_id_name", sa.String(), nullable=False, index=True
+        ),
+        sa.Column("definition", sa.Boolean(), nullable=False, default=False),
+        sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
+        sa.Column("type", sa.String(), nullable=False, index=True),
+        sa.Column("active", sa.Boolean(), nullable=False, default=True),
+        sa.Column(
+            "time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
+        ),
+        sa.Column(
+            "clustering",
+            postgresql.JSONB,
+            nullable=False,
+            server_default="{}",
+        ),
+        sa.Column("transferred", sa.Boolean(), nullable=False, server_default="false"),
+        sa.ForeignKeyConstraint(
+            ["source_entity_type_id_name"], ["kg_entity_type.id_name"]
+        ),
+        sa.ForeignKeyConstraint(
+            ["target_entity_type_id_name"], ["kg_entity_type.id_name"]
+        ),
+    )
+
+    op.execute("DROP TABLE IF EXISTS kg_entity CASCADE")
+
+    # Create KGEntity table
+    op.create_table(
+        "kg_entity",
+        sa.Column("id_name", sa.String(), primary_key=True, nullable=False, index=True),
+        sa.Column("name", sa.String(), nullable=False, index=True),
+        sa.Column("entity_class", sa.String(), nullable=True, index=True),
+        sa.Column("entity_subtype", sa.String(), nullable=True, index=True),
+        sa.Column("entity_key", sa.String(), nullable=True, index=True),
+        sa.Column("name_trigrams", postgresql.ARRAY(sa.String(3)), nullable=True),
+        sa.Column("document_id", sa.String(), nullable=True, index=True),
+        sa.Column(
+            "alternative_names",
+            postgresql.ARRAY(sa.String()),
+            nullable=False,
+            server_default="{}",
+        ),
+        sa.Column("entity_type_id_name", sa.String(), nullable=False, index=True),
+        sa.Column("description", sa.String(), nullable=True),
+        sa.Column(
+            "keywords",
+            postgresql.ARRAY(sa.String()),
+            nullable=False,
+            server_default="{}",
+        ),
+        sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
+        sa.Column(
+            "acl", postgresql.ARRAY(sa.String()), nullable=False, server_default="{}"
+        ),
+        sa.Column("boosts", postgresql.JSONB, nullable=False, server_default="{}"),
+        sa.Column("attributes", postgresql.JSONB, nullable=False, server_default="{}"),
+        sa.Column("event_time", sa.DateTime(timezone=True), nullable=True),
+        sa.Column(
+            "time_updated",
+            sa.DateTime(timezone=True),
+            server_default=sa.text("now()"),
+            onupdate=sa.text("now()"),
+        ),
+        sa.Column(
+            "time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
+        ),
+        sa.ForeignKeyConstraint(["entity_type_id_name"], ["kg_entity_type.id_name"]),
+        sa.ForeignKeyConstraint(["document_id"], ["document.id"]),
+        sa.UniqueConstraint(
+            "name",
+            "entity_type_id_name",
+            "document_id",
+            name="uq_kg_entity_name_type_doc",
+        ),
+    )
+    op.create_index("ix_entity_type_acl", "kg_entity", ["entity_type_id_name", "acl"])
+    op.create_index(
+        "ix_entity_name_search", "kg_entity", ["name", "entity_type_id_name"]
+    )
+
+    op.execute("DROP TABLE IF EXISTS kg_entity_extraction_staging CASCADE")
+    # Create KGEntityExtractionStaging table
+    op.create_table(
+        "kg_entity_extraction_staging",
+        sa.Column("id_name", sa.String(), primary_key=True, nullable=False, index=True),
+        sa.Column("name", sa.String(), nullable=False, index=True),
+        sa.Column("document_id", sa.String(), nullable=True, index=True),
+        sa.Column(
+            "alternative_names",
+            postgresql.ARRAY(sa.String()),
+            nullable=False,
+            server_default="{}",
+        ),
+        sa.Column("entity_type_id_name", sa.String(), nullable=False, index=True),
+        sa.Column("description", sa.String(), nullable=True),
+        sa.Column(
+            "keywords",
+            postgresql.ARRAY(sa.String()),
+            nullable=False,
+            server_default="{}",
+        ),
+        sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
+        sa.Column(
+            "acl", postgresql.ARRAY(sa.String()), nullable=False, server_default="{}"
+        ),
+        sa.Column("boosts", postgresql.JSONB, nullable=False, server_default="{}"),
+        sa.Column("attributes", postgresql.JSONB, nullable=False, server_default="{}"),
+        sa.Column("transferred_id_name", sa.String(), nullable=True, default=None),
+        sa.Column("entity_class", sa.String(), nullable=True, index=True),
+        sa.Column("entity_key", sa.String(), nullable=True, index=True),
+        sa.Column("entity_subtype", sa.String(), nullable=True, index=True),
+        sa.Column("parent_key", sa.String(), nullable=True, index=True),
+        sa.Column("event_time", sa.DateTime(timezone=True), nullable=True),
+        sa.Column(
+            "time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
+        ),
+        sa.ForeignKeyConstraint(["entity_type_id_name"], ["kg_entity_type.id_name"]),
+        sa.ForeignKeyConstraint(["document_id"], ["document.id"]),
+    )
+    op.create_index(
+        "ix_entity_extraction_staging_acl",
+        "kg_entity_extraction_staging",
+        ["entity_type_id_name", "acl"],
+    )
+    op.create_index(
+        "ix_entity_extraction_staging_name_search",
+        "kg_entity_extraction_staging",
+        ["name", "entity_type_id_name"],
+    )
+
+    op.execute("DROP TABLE IF EXISTS kg_relationship CASCADE")
+    # Create KGRelationship table
+    op.create_table(
+        "kg_relationship",
+        sa.Column("id_name", sa.String(), nullable=False, index=True),
+        sa.Column("source_node", sa.String(), nullable=False, index=True),
+        sa.Column("target_node", sa.String(), nullable=False, index=True),
+        sa.Column("source_node_type", sa.String(), nullable=False, index=True),
+        sa.Column("target_node_type", sa.String(), nullable=False, index=True),
+        sa.Column("source_document", sa.String(), nullable=True, index=True),
+        sa.Column("type", sa.String(), nullable=False, index=True),
+        sa.Column("relationship_type_id_name", sa.String(), nullable=False, index=True),
+        sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
+        sa.Column(
+            "time_updated",
+            sa.DateTime(timezone=True),
+            server_default=sa.text("now()"),
+            onupdate=sa.text("now()"),
+        ),
+        sa.Column(
+            "time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
+        ),
+        sa.ForeignKeyConstraint(["source_node"], ["kg_entity.id_name"]),
+        sa.ForeignKeyConstraint(["target_node"], ["kg_entity.id_name"]),
+        sa.ForeignKeyConstraint(["source_node_type"], ["kg_entity_type.id_name"]),
+        sa.ForeignKeyConstraint(["target_node_type"], ["kg_entity_type.id_name"]),
+        sa.ForeignKeyConstraint(["source_document"], ["document.id"]),
+        sa.ForeignKeyConstraint(
+            ["relationship_type_id_name"], ["kg_relationship_type.id_name"]
+        ),
+        sa.UniqueConstraint(
+            "source_node",
+            "target_node",
+            "type",
+            name="uq_kg_relationship_source_target_type",
+        ),
+        sa.PrimaryKeyConstraint("id_name", "source_document"),
+    )
+    op.create_index(
+        "ix_kg_relationship_nodes", "kg_relationship", ["source_node", "target_node"]
+    )
+
+    op.execute("DROP TABLE IF EXISTS kg_relationship_extraction_staging CASCADE")
+    # Create KGRelationshipExtractionStaging table
+    op.create_table(
+        "kg_relationship_extraction_staging",
+        sa.Column("id_name", sa.String(), nullable=False, index=True),
+        sa.Column("source_node", sa.String(), nullable=False, index=True),
+        sa.Column("target_node", sa.String(), nullable=False, index=True),
+        sa.Column("source_node_type", sa.String(), nullable=False, index=True),
+        sa.Column("target_node_type", sa.String(), nullable=False, index=True),
+        sa.Column("source_document", sa.String(), nullable=True, index=True),
+        sa.Column("type", sa.String(), nullable=False, index=True),
+        sa.Column("relationship_type_id_name", sa.String(), nullable=False, index=True),
+        sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
+        sa.Column("transferred", sa.Boolean(), nullable=False, server_default="false"),
+        sa.Column(
+            "time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
+        ),
+        sa.ForeignKeyConstraint(
+            ["source_node"], ["kg_entity_extraction_staging.id_name"]
+        ),
+        sa.ForeignKeyConstraint(
+            ["target_node"], ["kg_entity_extraction_staging.id_name"]
+        ),
+        sa.ForeignKeyConstraint(["source_node_type"], ["kg_entity_type.id_name"]),
+        sa.ForeignKeyConstraint(["target_node_type"], ["kg_entity_type.id_name"]),
+        sa.ForeignKeyConstraint(["source_document"], ["document.id"]),
+        sa.ForeignKeyConstraint(
+            ["relationship_type_id_name"],
+            ["kg_relationship_type_extraction_staging.id_name"],
+        ),
+        sa.UniqueConstraint(
+            "source_node",
+            "target_node",
+            "type",
+            name="uq_kg_relationship_extraction_staging_source_target_type",
+        ),
+        sa.PrimaryKeyConstraint("id_name", "source_document"),
+    )
+    op.create_index(
+        "ix_kg_relationship_extraction_staging_nodes",
+        "kg_relationship_extraction_staging",
+        ["source_node", "target_node"],
+    )
+
+    op.execute("DROP TABLE IF EXISTS kg_term CASCADE")
+    # Create KGTerm table
+    op.create_table(
+        "kg_term",
+        sa.Column("id_term", sa.String(), primary_key=True, nullable=False, index=True),
+        sa.Column(
+            "entity_types",
+            postgresql.ARRAY(sa.String()),
+            nullable=False,
+            server_default="{}",
+        ),
+        sa.Column(
+            "time_updated",
+            sa.DateTime(timezone=True),
+            server_default=sa.text("now()"),
+            onupdate=sa.text("now()"),
+        ),
+        sa.Column(
+            "time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
+        ),
+    )
+    op.create_index("ix_search_term_entities", "kg_term", ["entity_types"])
+    op.create_index("ix_search_term_term", "kg_term", ["id_term"])
+
+    op.add_column(
+        "document",
+        sa.Column("kg_stage", sa.String(), nullable=True, index=True),
+    )
+    op.add_column(
+        "document",
+        sa.Column("kg_processing_time", sa.DateTime(timezone=True), nullable=True),
+    )
+    op.add_column(
+        "connector",
+        sa.Column(
+            "kg_processing_enabled",
+            sa.Boolean(),
+            nullable=True,
+            server_default="false",
+        ),
+    )
+
+    op.add_column(
+        "connector",
+        sa.Column(
+            "kg_coverage_days",
+            sa.Integer(),
+            nullable=True,
+            server_default=None,
+        ),
+    )
+
+    # Create GIN index for clustering and normalization
+    op.execute(
+        "CREATE INDEX IF NOT EXISTS idx_kg_entity_clustering_trigrams "
+        f"ON kg_entity USING GIN (name {POSTGRES_DEFAULT_SCHEMA}.gin_trgm_ops)"
+    )
+    op.execute(
+        "CREATE INDEX IF NOT EXISTS idx_kg_entity_normalization_trigrams "
+        "ON kg_entity USING GIN (name_trigrams)"
+    )
+
+    # Create kg_entity trigger to update kg_entity.name and its trigrams
+    alphanum_pattern = r"[^a-z0-9]+"
+    truncate_length = 1000
+    function = "update_kg_entity_name"
+    op.execute(
+        text(
+            f"""
+            CREATE OR REPLACE FUNCTION {function}()
+            RETURNS TRIGGER AS $$
+            DECLARE
+                name text;
+                cleaned_name text;
+            BEGIN
+                -- Set name to semantic_id if document_id is not NULL
+                IF NEW.document_id IS NOT NULL THEN
+                    SELECT lower(semantic_id) INTO name
+                    FROM document
+                    WHERE id = NEW.document_id;
+                ELSE
+                    name = lower(NEW.name);
+                END IF;
+
+                -- Clean name and truncate if too long
+                cleaned_name = regexp_replace(
+                    name,
+                    '{alphanum_pattern}', '', 'g'
+                );
+                IF length(cleaned_name) > {truncate_length} THEN
+                    cleaned_name = left(cleaned_name, {truncate_length});
+                END IF;
+
+                -- Set name and name trigrams
+                NEW.name = name;
+                NEW.name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name);
+                RETURN NEW;
+            END;
+            $$ LANGUAGE plpgsql;
+            """
+        )
+    )
+    trigger = f"{function}_trigger"
+    op.execute(f"DROP TRIGGER IF EXISTS {trigger} ON kg_entity")
+    op.execute(
+        f"""
+        CREATE TRIGGER {trigger}
+            BEFORE INSERT OR UPDATE OF name
+            ON kg_entity
+            FOR EACH ROW
+            EXECUTE FUNCTION {function}();
+        """
+    )
+
+    # Create document trigger to propagate semantic_id changes to kg_entity.name and its trigrams
+    function = "update_kg_entity_name_from_doc"
+    op.execute(
+        text(
+            f"""
+            CREATE OR REPLACE FUNCTION {function}()
+            RETURNS TRIGGER AS $$
+            DECLARE
+                doc_name text;
+                cleaned_name text;
+            BEGIN
+                doc_name = lower(NEW.semantic_id);
+
+                -- Clean name and truncate if too long
+                cleaned_name = regexp_replace(
+                    doc_name,
+                    '{alphanum_pattern}', '', 'g'
+                );
+                IF length(cleaned_name) > {truncate_length} THEN
+                    cleaned_name = left(cleaned_name, {truncate_length});
+                END IF;
+
+                -- Set name and name trigrams for all entities referencing this document
+                UPDATE kg_entity
+                SET
+                    name = doc_name,
+                    name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name)
+                WHERE document_id = NEW.id;
+                RETURN NEW;
+            END;
+            $$ LANGUAGE plpgsql;
+            """
+        )
+    )
+    trigger = f"{function}_trigger"
+    op.execute(f"DROP TRIGGER IF EXISTS {trigger} ON document")
+    op.execute(
+        f"""
+        CREATE TRIGGER {trigger}
+            AFTER UPDATE OF semantic_id
+            ON document
+            FOR EACH ROW
+            EXECUTE FUNCTION {function}();
+        """
+    )
+
+
+def downgrade() -> None:
+    """Remove the kg_* schema objects and the read-only user's access.
+
+    In order: drops the `kg_relationships_with_access*` and `allowed_docs*`
+    views, the kg_entity name triggers and their functions, the trigram
+    indexes, the kg_* tables and the kg_* columns on `connector` and
+    `document`, then revokes the read-only user's privileges on the current
+    schema. When not MULTI_TENANT, it also drops the read-only user and the
+    pg_trgm extension.
+    """
+
+    # Drop all views in the current schema whose name starts with
+    # 'kg_relationships_with_access'
+    op.execute(
+        """
+                DO $$
+                DECLARE
+                    view_name text;
+                BEGIN
+                    FOR view_name IN
+                        SELECT c.relname
+                        FROM pg_catalog.pg_class c
+                        JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
+                        WHERE c.relkind = 'v'
+                        AND n.nspname = current_schema()
+                        AND c.relname LIKE 'kg_relationships_with_access%'
+                    LOOP
+                        EXECUTE 'DROP VIEW IF EXISTS ' || quote_ident(view_name);
+                    END LOOP;
+                END $$;
+            """
+    )
+
+    # Same for all views in the current schema whose name starts with
+    # 'allowed_docs'
+    op.execute(
+        """
+                DO $$
+                DECLARE
+                    view_name text;
+                BEGIN
+                    FOR view_name IN
+                        SELECT c.relname
+                        FROM pg_catalog.pg_class c
+                        JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
+                        WHERE c.relkind = 'v'
+                        AND n.nspname = current_schema()
+                        AND c.relname LIKE 'allowed_docs%'
+                    LOOP
+                        EXECUTE 'DROP VIEW IF EXISTS ' || quote_ident(view_name);
+                    END LOOP;
+                END $$;
+            """
+    )
+
+    # Drop each "<function>_trigger" first, then the function it executes
+    for table, function in (
+        ("kg_entity", "update_kg_entity_name"),
+        ("document", "update_kg_entity_name_from_doc"),
+    ):
+        op.execute(f"DROP TRIGGER IF EXISTS {function}_trigger ON {table}")
+        op.execute(f"DROP FUNCTION IF EXISTS {function}()")
+
+    # Drop the two trigram indexes on kg_entity
+    op.execute("DROP INDEX IF EXISTS idx_kg_entity_clustering_trigrams")
+    op.execute("DROP INDEX IF EXISTS idx_kg_entity_normalization_trigrams")
+
+    # Drop tables in reverse order of creation to handle dependencies
+    op.drop_table("kg_term")
+    op.drop_table("kg_relationship")
+    op.drop_table("kg_entity")
+    op.drop_table("kg_relationship_type")
+    op.drop_table("kg_relationship_extraction_staging")
+    op.drop_table("kg_relationship_type_extraction_staging")
+    op.drop_table("kg_entity_extraction_staging")
+    op.drop_table("kg_entity_type")
+    # Drop the kg_* columns that were added to pre-existing tables
+    op.drop_column("connector", "kg_processing_enabled")
+    op.drop_column("connector", "kg_coverage_days")
+    op.drop_column("document", "kg_stage")
+    op.drop_column("document", "kg_processing_time")
+    op.drop_table("kg_config")
+
+    # Revoke all privileges on the current schema from the read-only user,
+    # but only if that role exists
+    op.execute(
+        text(
+            f"""
+            DO $$
+            BEGIN
+                IF EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
+                    EXECUTE format('REVOKE ALL ON SCHEMA %I FROM %I', current_schema(), '{DB_READONLY_USER}');
+                END IF;
+            END
+            $$;
+            """
+        )
+    )
+
+    if not MULTI_TENANT:
+        # Drop read-only db user here only in single tenant mode. For multi-tenant mode,
+        # the user is dropped in the alembic_tenants migration.
+
+        op.execute(
+            text(
+                f"""
+            DO $$
+            BEGIN
+                IF EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
+                    -- First revoke all privileges from the database
+                    EXECUTE format('REVOKE ALL ON DATABASE %I FROM %I', current_database(), '{DB_READONLY_USER}');
+                    -- Then drop the user
+                    EXECUTE format('DROP USER %I', '{DB_READONLY_USER}');
+                END IF;
+            END
+            $$;
+        """
+            )
+        )
+        # Also remove the pg_trgm extension (single-tenant mode only, like the user drop)
+        op.execute(text("DROP EXTENSION IF EXISTS pg_trgm"))
--- a/backend/alembic/versions/4a951134c801_moved_status_to_connector_credential_.py
+++ b/backend/alembic/versions/4a951134c801_moved_status_to_connector_credential_.py
@@ -5,6 +5,7 @@ Revises: 7477a5f5d728
 Create Date: 2024-08-10 19:20:34.527559

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/4b08d97e175a_change_default_prune_freq.py
+++ b/backend/alembic/versions/4b08d97e175a_change_default_prune_freq.py
@@ -5,6 +5,7 @@ Revises: d9ec13955951
 Create Date: 2024-08-20 15:28:52.993827

 """
+
 from alembic import op

 # revision identifiers, used by Alembic.
--- a/backend/alembic/versions/4d58345da04a_lowercase_user_emails.py
+++ b/backend/alembic/versions/4d58345da04a_lowercase_user_emails.py
@@ -5,7 +5,11 @@ Revises: f1ca58b2f2ec
 Create Date: 2025-01-29 07:48:46.784041

 """
+
+import logging
+from typing import cast
 from alembic import op
+from sqlalchemy.exc import IntegrityError
 from sqlalchemy.sql import text


@@ -15,21 +19,57 @@ down_revision = "f1ca58b2f2ec"
 branch_labels = None
 depends_on = None

+logger = logging.getLogger("alembic.runtime.migration")
+

 def upgrade() -> None:
-    # Get database connection
+    """Lowercase every user email that is not already lowercase.
+
+    Conflicts on lowercasing will result in the uppercased email getting a
+    unique integer suffix when converted to lowercase.
+
+    Each attempt runs inside a SAVEPOINT: on PostgreSQL a failed statement
+    aborts the surrounding transaction, so without it the retry after an
+    IntegrityError could not succeed.
+    """
+
     connection = op.get_bind()

-    # Update all user emails to lowercase
-    connection.execute(
-        text(
-            """
-            UPDATE "user"
-            SET email = LOWER(email)
-            WHERE email != LOWER(email)
-            """
-        )
-    )
+    # Fetch all user emails that are not already lowercase
+    user_emails = connection.execute(
+        text('SELECT id, email FROM "user" WHERE email != LOWER(email)')
+    ).fetchall()
+
+    # Loop-invariant statement, built once and reused for every attempt
+    update_email = text('UPDATE "user" SET email = :new_email WHERE id = :user_id')
+
+    for user_id, email in user_emails:
+        email = cast(str, email)
+        username, domain = email.rsplit("@", 1)
+        new_email = f"{username.lower()}@{domain.lower()}"
+        attempt = 1
+
+        while True:
+            try:
+                # The savepoint is rolled back on failure, leaving the outer
+                # migration transaction usable for the next attempt
+                with connection.begin_nested():
+                    connection.execute(
+                        update_email,
+                        {"new_email": new_email, "user_id": user_id},
+                    )
+                break  # Success, exit loop
+            except IntegrityError:
+                next_email = f"{username.lower()}_{attempt}@{domain.lower()}"
+                # Email conflict occurred, append `_1`, `_2`, etc., to the username
+                logger.warning(
+                    f"Conflict while lowercasing email: "
+                    f"old_email={email} "
+                    f"conflicting_email={new_email} "
+                    f"next_email={next_email}"
+                )
+                new_email = next_email
+                attempt += 1


 def downgrade() -> None:
--- a/backend/alembic/versions/4ee1287bd26a_add_multiple_slack_bot_support.py
+++ b/backend/alembic/versions/4ee1287bd26a_add_multiple_slack_bot_support.py
@@ -5,6 +5,7 @@ Revises: 47e5bef3a1d7
 Create Date: 2024-11-06 13:15:53.302644

 """
+
 from typing import cast
 from alembic import op
 import sqlalchemy as sa
--- a/backend/alembic/versions/50b683a8295c_add_additional_retrieval_controls_to_.py
+++ b/backend/alembic/versions/50b683a8295c_add_additional_retrieval_controls_to_.py
@@ -5,6 +5,7 @@ Revises: 7da0ae5ad583
 Create Date: 2023-11-27 17:23:29.668422

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/52a219fb5233_add_last_synced_and_last_modified_to_document_table.py
+++ b/backend/alembic/versions/52a219fb5233_add_last_synced_and_last_modified_to_document_table.py
@@ -5,6 +5,7 @@ Revises: f7e58d357687
 Create Date: 2024-08-28 17:40:46.077470

 """
+
 from alembic import op
 import sqlalchemy as sa
 from sqlalchemy.sql import func
--- a/backend/alembic/versions/54a74a0417fc_danswerbot_onyxbot.py
+++ b/backend/alembic/versions/54a74a0417fc_danswerbot_onyxbot.py
@@ -5,6 +5,7 @@ Revises: 94dc3d0236f8
 Create Date: 2024-12-11 18:05:05.490737

 """
+
 from alembic import op


--- a/backend/alembic/versions/55546a7967ee_assistant_rework.py
+++ b/backend/alembic/versions/55546a7967ee_assistant_rework.py
@@ -5,6 +5,7 @@ Revises: 61ff3651add4
 Create Date: 2024-09-18 17:00:23.755399

 """
+
 from alembic import op
 import sqlalchemy as sa
 from sqlalchemy.dialects import postgresql
--- a/backend/alembic/versions/570282d33c49_track_onyxbot_explicitly.py
+++ b/backend/alembic/versions/570282d33c49_track_onyxbot_explicitly.py
@@ -5,6 +5,7 @@ Revises: 7547d982db8f
 Create Date: 2024-05-04 17:49:28.568109

 """
+
 from alembic import op
 import sqlalchemy as sa

--- a/backend/alembic/versions/57b53544726e_add_document_set_tables.py
+++ b/backend/alembic/versions/57b53544726e_add_document_set_tables.py
@@ -5,6 +5,7 @@ Revises: 800f48024ae9
 Create Date: 2023-09-20 16:59:39.097177

 """
+
 from alembic import op
 import fastapi_users_db_sqlalchemy
 import sqlalchemy as sa
--- a/Show More
+++ b/Show More